In [32]:
from datasets import load_dataset, Dataset
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)

In [33]:
# Load the ticket dataset and work on its "train" split as a DataFrame.
ds = load_dataset("Tobi-Bueck/customer-support-tickets")
df = ds["train"].to_pandas()

# Keep English tickets only; .copy() gives an independent frame so the
# column assignments below do not write into a slice of the full table.
is_english = df["language"] == "en"
df = df[is_english].copy()

# Use the 10 most frequent queues as labels; any other queue becomes "other".
queue_counts = df["queue"].value_counts()
top_labels = queue_counts.head(10).index.tolist()
in_top_queues = df["queue"].isin(top_labels)
df["label"] = np.where(in_top_queues, df["queue"], "other")


In [34]:
def clean_text(s: str) -> str:
    """Normalize one ticket field for tokenization.

    URLs, e-mail addresses and long digit runs are replaced by the
    placeholders ``<URL>``, ``<EMAIL>`` and ``<NUM>``; all whitespace runs
    are then collapsed to single spaces and the result is stripped.

    Args:
        s: Raw text. ``None`` and float NaN (the pandas missing value) are
            treated as missing; any other non-string is passed through ``str``.

    Returns:
        The cleaned text, or ``""`` for a missing value.
    """
    # NaN is the only float that is not equal to itself.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    s = str(s)
    # Require the "://" after the scheme so ordinary words that merely start
    # with "http" (e.g. "httpd") are not turned into <URL>; match any case.
    s = re.sub(r"https?://\S+|www\.\S+", " <URL> ", s, flags=re.IGNORECASE)
    s = re.sub(r"\b[\w\.-]+@[\w\.-]+\.\w+\b", " <EMAIL> ", s)
    s = re.sub(r"\b\d{6,}\b", " <NUM> ", s)  # long IDs (6+ digits)
    # The placeholders are inserted with padding spaces; collapse them here.
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Clean both free-text fields (missing values become empty strings first),
# then join subject and body into one input string around a literal " [SEP] ".
for col in ("subject", "body"):
    df[col] = df[col].fillna("").map(clean_text)
df["text"] = (df["subject"] + " [SEP] " + df["body"]).str.strip()

In [35]:
#label encoding
unique_labels = sorted(df["label"].unique().tolist())
label2id = {l: i for i, l in enumerate(unique_labels)}
id2label = {i: l for l, i in label2id.items()}
df["label_id"] = df["label"].map(label2id)

print("Num classes:", len(unique_labels))
print("Classes:", unique_labels)

Num classes: 10
Classes: ['Billing and Payments', 'Customer Service', 'General Inquiry', 'Human Resources', 'IT Support', 'Product Support', 'Returns and Exchanges', 'Sales and Pre-Sales', 'Service Outages and Maintenance', 'Technical Support']


In [36]:
# Stratified splits: 80% train, then the held-out 20% is halved into
# validation and test. Both splits stratify on the label id.
train_df, temp_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["label_id"]
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df["label_id"]
)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")


def _to_hf_dataset(frame):
    """Convert one split to a `Dataset` with only the `text` and `labels` columns.

    `preserve_index=False` stops the shuffled pandas index from being stored
    as an extra `__index_level_0__` column in the resulting dataset.
    """
    model_cols = frame[["text", "label_id"]].rename(columns={"label_id": "labels"})
    return Dataset.from_pandas(model_cols, preserve_index=False)


train_ds = _to_hf_dataset(train_df)
val_ds = _to_hf_dataset(val_df)
test_ds = _to_hf_dataset(test_df)

Train: 22608, Val: 2826, Test: 2827


In [37]:
# Tokenization setup
MAX_LEN = 256  # max_length passed to the tokenizer; try 384/512
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

def tokenize_fn(batch):
    """Tokenize the "text" entries of a batch, truncated to MAX_LEN tokens.

    Padding is not requested here.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True, max_length=MAX_LEN)

# Tokenize every split in batches and drop the raw "text" column afterwards.
train_ds, val_ds, test_ds = [
    split.map(tokenize_fn, batched=True, remove_columns=["text"])
    for split in (train_ds, val_ds, test_ds)
]

# Collator built on the same tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


Map: 100%|██████████| 22608/22608 [00:01<00:00, 21053.96 examples/s]
Map: 100%|██████████| 2826/2826 [00:00<00:00, 25923.61 examples/s]
Map: 100%|██████████| 2827/2827 [00:00<00:00, 26773.40 examples/s]


In [None]:
# Model: load the pretrained checkpoint with a sequence-classification head
# sized to the label set, and store both label mappings in its config.
num_classes = len(unique_labels)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    id2label=id2label,
    label2id=label2id,
    num_labels=num_classes,
)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
# metrics
def compute_metrics(eval_pred):
    """Return accuracy and macro-averaged F1 for one evaluation pass.

    `eval_pred` unpacks into (logits, labels); the predicted class is the
    argmax of the logits along axis 1.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted)
    metrics["macro_f1"] = f1_score(gold, predicted, average="macro")
    return metrics

In [41]:
# Training arguments, grouped by concern.
args = TrainingArguments(
    # output and logging
    output_dir="./checkpoints_deberta",
    report_to="none",
    logging_steps=50,
    # optimisation
    num_train_epochs=5,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    label_smoothing_factor=0.05,  # helps generalization
    fp16=True,
    # batch sizes
    per_device_train_batch_size=8,  # can be raised to 16
    per_device_eval_batch_size=16,
    # evaluate and checkpoint every epoch; reload the best-accuracy model at the end
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

# Wire model, data, metrics and early stopping into the Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    # `tokenizer=` is deprecated on Trainer.__init__ and emits a FutureWarning;
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Stop after 2 consecutive evaluations without improvement in the
    # best-model metric.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

  trainer = Trainer(


In [42]:
# Training: fine-tune, then report on the validation and test splits.
trainer.train()

print("\nValidation metrics:")
val_metrics = trainer.evaluate(val_ds)
print(val_metrics)

print("\nTest metrics:")
test_out = trainer.predict(test_ds)
test_labels = test_out.label_ids
test_preds = np.argmax(test_out.predictions, axis=1)

# Per-class precision/recall/F1, with rows named by the class labels.
report = classification_report(test_labels, test_preds, target_names=unique_labels)
print(report)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,1.7113,1.674406,0.426752,0.280854
2,1.6132,1.582731,0.461076,0.3431
3,1.4049,1.451103,0.548125,0.464893
4,1.1133,1.339027,0.612173,0.526732
5,0.8802,1.33984,0.64402,0.575022



Validation metrics:


{'eval_loss': 1.3398398160934448, 'eval_accuracy': 0.6440198159943383, 'eval_macro_f1': 0.575021985714781, 'eval_runtime': 152.4522, 'eval_samples_per_second': 18.537, 'eval_steps_per_second': 1.161, 'epoch': 5.0}

Test metrics:
                                 precision    recall  f1-score   support

           Billing and Payments       0.81      0.84      0.82       290
               Customer Service       0.58      0.56      0.57       427
                General Inquiry       0.48      0.28      0.35        40
                Human Resources       0.84      0.38      0.53        55
                     IT Support       0.61      0.55      0.58       334
                Product Support       0.60      0.63      0.62       531
          Returns and Exchanges       0.65      0.43      0.52       140
            Sales and Pre-Sales       0.60      0.35      0.44        84
Service Outages and Maintenance       0.86      0.66      0.74       111
              Technical Support       0.

In [None]:
# Save: write the model (via the trainer) and then the tokenizer to one directory.
save_path = "./models/deberta_support_tickets"
for persist in (trainer.save_model, tokenizer.save_pretrained):
    persist(save_path)
print(f"Saved to: {save_path}")