In [None]:
from datasets import load_dataset
# Authenticate with the Hugging Face Hub first (e.g. `huggingface-cli login`)
# to be able to access this dataset.
dataset_id = "Tobi-Bueck/customer-support-tickets"
ds = load_dataset(dataset_id)

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
import torch
import transformers
import numpy as np
import pandas as pd
from transformers import DistilBertTokenizerFast
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

In [3]:
# Convert the 'train' split of the Hugging Face dataset to a pandas DataFrame.
df = ds['train'].to_pandas()
# `info` is a method; without the call parentheses this statement did nothing.
df.info()

# Keep only the English tickets. `.copy()` gives an independent frame so the
# 'label' column added below is not written onto a slice of `df`.
df_en = df[df['language'] == 'en'].copy()

# Keep the 10 most frequent queues as labels; every other queue is renamed 'other'.
top_label = df_en['queue'].value_counts().head(10).index.tolist()
df_en['label'] = np.where(df_en['queue'].isin(top_label), df_en['queue'], 'other')

In [4]:
# Split the dataset: 80% train, then the remaining 20% in half for
# validation and test. Both splits are stratified on the label column.
train, temp = train_test_split(df_en, test_size=0.2, stratify=df_en['label'], random_state=42)
valid, test = train_test_split(temp, test_size=0.5, stratify=temp['label'], random_state=42)

print(f"Train rows: {len(train)}")
print(f"Valid rows: {len(valid)}")
print(f"Test rows:  {len(test)}")


def add_text_column(frame):
    """Return a copy of `frame` with a 'text' column.

    'text' is the ticket subject, a newline, then the ticket body; missing
    subjects/bodies are treated as empty strings. The input frame is not
    modified, so this is safe to call on frames returned by
    `train_test_split` (no in-place write onto a split result).
    """
    return frame.assign(text=frame['subject'].fillna('') + '\n' + frame['body'].fillna(''))


# One shared helper instead of three copy-pasted column assignments.
test = add_text_column(test)
valid = add_text_column(valid)
train = add_text_column(train)

x_train = train['text']
y_train = train['label']

x_val = valid['text']
y_val = valid['label']

x_test = test['text']
y_test = test['label']

Train rows: 22608
Valid rows: 2826
Test rows:  2827


In [6]:
# Load the fast tokenizer that matches the checkpoint we fine-tune.
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

# Collect the distinct labels from the training data; sorting makes the
# label -> id assignment deterministic.
unique_labels = sorted(set(y_train))
id2label = dict(enumerate(unique_labels))
label2id = {name: idx for idx, name in id2label.items()}

print(f"Found {len(unique_labels)} classes: {unique_labels[:3]}")

# Helper to tokenize a split and format it for PyTorch.
def process_data(texts, labels, max_length=128):
    """Build a tokenized `Dataset` from raw texts and string labels.

    Args:
        texts: object exposing `.tolist()` (the notebook passes a pandas
            Series) holding one text per example.
        labels: iterable of label names; every name must be a key of the
            module-level `label2id` mapping (a `KeyError` is raised otherwise).
        max_length: number of tokens each example is truncated/padded to.
            Defaults to 128, the value that was previously hard-coded.

    Returns:
        A `Dataset` whose torch-formatted columns are 'input_ids',
        'attention_mask' and 'label'.
    """
    # Convert the text series to a plain list.
    text_list = texts.tolist()
    # Map label names to their integer ids.
    label_ids = [label2id[label] for label in labels]

    # Create the dataset object.
    dataset = Dataset.from_dict({
        'text': text_list,
        'label': label_ids
    })

    def tokenize_batch(batch):
        # Every example is truncated and padded to exactly `max_length` tokens.
        return tokenizer(batch['text'], truncation=True, padding="max_length", max_length=max_length)

    dataset = dataset.map(tokenize_batch, batched=True)

    # Expose only the model inputs and the label, as torch tensors.
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    return dataset

# Tokenize the three splits in order (train, validation, test), announcing each one.
split_inputs = [
    ("Tokenizing Train Data...", x_train, y_train),
    ("Tokenizing Validation Data...", x_val, y_val),
    ("Tokenizing Test Data...", x_test, y_test),
]

tokenized_splits = []
for banner, split_texts, split_labels in split_inputs:
    print(banner)
    tokenized_splits.append(process_data(split_texts, split_labels))

train_dataset, val_dataset, test_dataset = tokenized_splits

print(f"\nReady for Training! Train size: {len(train_dataset)}")

Found 10 classes: ['Billing and Payments', 'Customer Service', 'General Inquiry']
Tokenizing Train Data...


Map: 100%|██████████| 22608/22608 [00:01<00:00, 15070.32 examples/s]


Tokenizing Validation Data...


Map: 100%|██████████| 2826/2826 [00:00<00:00, 18778.44 examples/s]


Tokenizing Test Data...


Map: 100%|██████████| 2827/2827 [00:00<00:00, 18639.15 examples/s]


Ready for Training! Train size: 22608





In [None]:
import torch
from torch import nn
from transformers import Trainer
from sklearn.utils.class_weight import compute_class_weight

# "balanced" weights are inversely proportional to class frequency in the
# training labels. `np.unique` returns the classes sorted, which is the same
# order used to build `label2id` (sorted label names), so weight i belongs to
# class id i.
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)

# Use the GPU when one is available; fall back to CPU instead of crashing on
# machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

print(f"Class Weights Calculated: {weights_tensor[:3]}...")

# Trainer variant whose loss weights each class differently.
class WeightedTrainer(Trainer):
    """`Trainer` that replaces the default loss with class-weighted cross-entropy.

    The per-class weights come from the module-level `weights_tensor`.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: the model to run the forward pass with.
            inputs: mapping of model inputs; must contain "labels".
            return_outputs: when True, return `(loss, outputs)` instead of `loss`.
            num_items_in_batch: accepted for signature compatibility; unused.

        Returns:
            The loss tensor, or a `(loss, outputs)` tuple if `return_outputs`.
        """
        labels = inputs.get("labels")
        # Forward WITHOUT the labels: otherwise the model also computes its own
        # unweighted loss, which is wasted work because it is never used.
        # A filtered copy is used so the caller's `inputs` is left untouched.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")

        # Keep the weights on the same device as the logits, whatever device
        # the batch actually ran on.
        loss_fct = nn.CrossEntropyLoss(weight=weights_tensor.to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

# Training configuration, gathered in one mapping and then handed to
# TrainingArguments.
training_config = {
    # where checkpoints and logs are written
    "output_dir": './results_weighted',
    "logging_dir": './logs',
    "logging_steps": 50,
    # optimisation settings
    "num_train_epochs": 10,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 64,
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    # evaluate and checkpoint once per epoch, then reload the checkpoint
    # with the best "f1" value reported by the metrics function
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1",
}
training_args = TrainingArguments(**training_config)

# Load the pretrained checkpoint with a classification head sized to our
# label set, carrying the id <-> label mappings in the model config, and
# place it on the selected device.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    id2label=id2label,
    label2id=label2id,
    num_labels=len(unique_labels),
).to(device)

def compute_metrics(pred):
    """Return accuracy and macro-averaged F1 for an evaluation result.

    `pred` must expose `label_ids` (true class ids) and `predictions`
    (per-class scores); the predicted class is the argmax over the last axis.
    """
    true_ids = pred.label_ids
    predicted_ids = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(true_ids, predicted_ids),
        'f1': f1_score(true_ids, predicted_ids, average='macro'),
    }

# Wire the model, the training configuration, both datasets and the metric
# function into the weighted-loss trainer.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = WeightedTrainer(**trainer_kwargs)

print("Starting Weighted Training")
# Kept as the cell's last expression so the returned training summary is displayed.
trainer.train()

Class Weights Calculated: tensor([0.9753, 0.6620, 6.9994], device='cuda:0')...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting Weighted Training (10 Epochs)...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.8761,1.758638,0.337933,0.344685
2,1.3877,1.620335,0.390658,0.403866
3,1.2764,1.484128,0.44126,0.460032
4,0.9789,1.437346,0.464968,0.524875
5,0.6595,1.387214,0.533616,0.577642
6,0.5791,1.391987,0.587757,0.632031
7,0.4177,1.442454,0.626327,0.665753
8,0.3015,1.434248,0.644728,0.677058
9,0.3047,1.472113,0.658174,0.691705
10,0.2446,1.477622,0.670205,0.700828


TrainOutput(global_step=14130, training_loss=0.849076042458763, metrics={'train_runtime': 1250.5511, 'train_samples_per_second': 180.784, 'train_steps_per_second': 11.299, 'total_flos': 7488125540352000.0, 'train_loss': 0.849076042458763, 'epoch': 10.0})

In [None]:
print("Running final evaluation on Test Set...")
test_output = trainer.predict(test_dataset)

# Convert raw scores to predicted class ids (argmax over the class axis).
y_test_preds = np.argmax(test_output.predictions, axis=1)
y_test_labels = test_output.label_ids

print("\nFINAL TEST REPORT")
# Pass the full list of class ids explicitly: without `labels`, the report
# fails when a class is absent from both the test labels and the predictions,
# because `target_names` would no longer match the number of classes found.
print(classification_report(
    y_test_labels,
    y_test_preds,
    labels=list(range(len(unique_labels))),
    target_names=unique_labels,
))

# Save the fine-tuned model and its tokenizer side by side.
save_path = "../models/transformer"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
print(f"Model saved to {save_path}")

Running final evaluation on Test Set...



FINAL TEST REPORT
                                 precision    recall  f1-score   support

           Billing and Payments       0.88      0.84      0.86       290
               Customer Service       0.62      0.66      0.64       427
                General Inquiry       0.88      0.72      0.79        40
                Human Resources       0.79      0.80      0.79        55
                     IT Support       0.62      0.72      0.67       334
                Product Support       0.64      0.62      0.63       531
          Returns and Exchanges       0.76      0.69      0.73       140
            Sales and Pre-Sales       0.76      0.63      0.69        84
Service Outages and Maintenance       0.84      0.87      0.85       111
              Technical Support       0.71      0.68      0.70       815

                       accuracy                           0.70      2827
                      macro avg       0.75      0.73      0.74      2827
                   weighted av