In [2]:
from google.colab import files
uploaded = files.upload()


Saving ticket.csv to ticket.csv


In [11]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import evaluate

# Load dataset
df = pd.read_csv("ticket.csv")
df["text"] = df["subject"].fillna('') + " " + df["body"].fillna('')
df = df.rename(columns={"type": "label"})

# Encode label names into numbers
label_names = sorted(df["label"].unique())
label2id = {label: i for i, label in enumerate(label_names)}
id2label = {i: label for label, i in label2id.items()}
df["label"] = df["label"].map(label2id)

# Convert to HuggingFace dataset
dataset = Dataset.from_pandas(df[["text", "label"]])

# Load tokenizer and model (light version for faster training)
model_name = "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id
)

# Tokenize data
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

dataset = dataset.map(tokenize_function)
dataset = dataset.train_test_split(test_size=0.2)

# Load evaluation metric
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

# Define training arguments
args = TrainingArguments(
    output_dir="./ticket_classifier",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=10,
)

# Set up Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()
!pip install -q transformers datasets evaluate

config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/28587 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [12]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# ✅ Load and preprocess data
df = pd.read_csv("ticket.csv")
df = df.dropna(subset=["subject", "body", "type"])
df["text"] = df["subject"].fillna("") + " " + df["body"].fillna("")

le = LabelEncoder()
df["label"] = le.fit_transform(df["type"])

# ✅ Tokenization
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

class TicketDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=512)
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

dataset = TicketDataset(df["text"].tolist(), df["label"].tolist())

# ✅ Split into train/test
train_size = int(0.8 * len(dataset))
train_dataset, test_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

# ✅ Model setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(le.classes_))
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# ✅ Training loop
model.train()
for epoch in range(3):
    print(f"Epoch {epoch + 1}")
    for batch in tqdm(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# ✅ Evaluation
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, axis=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())

print("✅ Accuracy:", accuracy_score(all_labels, all_preds))
print("✅ Classification Report:\n", classification_report(all_labels, all_preds, target_names=le.classes_))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"] = df["subject"].fillna("") + " " + df["body"].fillna("")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = le.fit_transform(df["type"])
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1


100%|██████████| 2475/2475 [15:54<00:00,  2.59it/s]


Epoch 2


100%|██████████| 2475/2475 [15:52<00:00,  2.60it/s]


Epoch 3


100%|██████████| 2475/2475 [15:52<00:00,  2.60it/s]


✅ Accuracy: 0.8246464646464646
✅ Classification Report:
               precision    recall  f1-score   support

      Change       0.98      0.97      0.98       486
    Incident       0.80      0.78      0.79      2023
     Problem       0.59      0.60      0.60      1037
     Request       0.99      0.99      0.99      1404

    accuracy                           0.82      4950
   macro avg       0.84      0.84      0.84      4950
weighted avg       0.83      0.82      0.82      4950

