In [1]:
from datasets import load_dataset
# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("Tobi-Bueck/customer-support-tickets")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
import transformers
import numpy as np
import pandas as pd
from transformers import DistilBertTokenizerFast
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

In [3]:
#switching to pandas
df = ds['train'].to_pandas()
df.info

#filtering only to english tickets
df_en = df[df['language'] == 'en'].copy()

#there are like 50 different labels and so reducing to top 10. Naming the rest of the lables as 'other'
top_label=df_en['queue'].value_counts().head(10).index.tolist()
df_en['label'] = np.where(df_en['queue'].isin(top_label), df_en['queue'], 'other')

In [4]:
#spliting the dataset 
train, temp = train_test_split(df_en, test_size=0.2, stratify=df_en['label'], random_state=42)
valid, test = train_test_split(temp, test_size=0.5, stratify=temp['label'], random_state=42)

print(f"Train rows: {len(train)}")
print(f"Valid rows: {len(valid)}")
print(f"Test rows:  {len(test)}")

test['text'] = test['subject'].fillna('')+ '\n' + test['body'].fillna('')
valid['text'] = valid['subject'].fillna('')+ '\n' + valid['body'].fillna('')
train['text'] = train['subject'].fillna('')+ '\n' + train['body'].fillna('')

x_train = train['text']
y_train = train['label']

x_val = valid['text']
y_val = valid['label']

x_test = test['text']
y_test = test['label']

Train rows: 22608
Valid rows: 2826
Test rows:  2827


In [5]:
#initialize tokenizer
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

#get unique labels from trining data
unique_labels = sorted(list(set(y_train)))
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

print(f"Found {len(unique_labels)} classes: {unique_labels[:3]}")

#helper functions to tokenize and format
def process_data(texts, labels):
    #convert text series to list
    texts = texts.tolist()
    #map labels to integers
    label_ids = [label2id[label] for label in labels]
    
    #create dataset object
    dataset = Dataset.from_dict({
        'text': texts,
        'label': label_ids
    })
    
    dataset = dataset.map(
        lambda x: tokenizer(x['text'], truncation=True, padding="max_length", max_length=128),
        batched=True
    )
    
    #format for PyTorch 
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    return dataset

#process all splits
print("Tokenizing Train Data...")
train_dataset = process_data(x_train, y_train)

print("Tokenizing Validation Data...")
val_dataset = process_data(x_val, y_val)

print("Tokenizing Test Data...")
test_dataset = process_data(x_test, y_test)

print(f"\nReady for Training! Train size: {len(train_dataset)}")

Found 10 classes: ['Billing and Payments', 'Customer Service', 'General Inquiry']
Tokenizing Train Data...


Map: 100%|██████████| 22608/22608 [00:01<00:00, 19708.54 examples/s]


Tokenizing Validation Data...


Map: 100%|██████████| 2826/2826 [00:00<00:00, 19721.35 examples/s]


Tokenizing Test Data...


Map: 100%|██████████| 2827/2827 [00:00<00:00, 20515.46 examples/s]


Ready for Training! Train size: 22608





In [6]:
#defining metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1) #convert probabilities to class IDs
    
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='macro')
    return {'accuracy': acc, 'f1': f1}

#define training arguments
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=3,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,   
    warmup_steps=500,             
    weight_decay=0.01,              
    logging_dir='./logs',          
    logging_steps=50,
    eval_strategy="epoch",     
    save_strategy="epoch",           
    load_best_model_at_end=True,     
    metric_for_best_model="f1"       
)

device = torch.device("cuda")
#initialize trainer
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", 
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id
)
model.to(device) 

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

print("Start training")
trainer.train()

The following list shows the CCs this version of PyTorch was built for and the hardware CCs it supports:
- 5.0 which supports hardware CC >=5.0,<6.0 except {5.3}
- 6.0 which supports hardware CC >=6.0,<7.0 except {6.2}
- 6.1 which supports hardware CC >=6.1,<7.0 except {6.2}
- 7.0 which supports hardware CC >=7.0,<8.0 except {7.2}
- 7.5 which supports hardware CC >=7.5,<8.0
- 8.0 which supports hardware CC >=8.0,<9.0 except {8.7}
- 8.6 which supports hardware CC >=8.6,<9.0 except {8.7}
- 9.0 which supports hardware CC >=9.0,<10.0
Please follow the instructions at https://pytorch.org/get-started/locally/ to install a PyTorch release that supports one of these CUDA versions: 12.8, 13.0
  _warn_unsupported_code(d, device_cc, code_ccs)
NVIDIA GeForce RTX 5070 Laptop GPU with CUDA capability sm_120 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_50 sm_60 sm_61 sm_70 sm_75 sm_80 sm_86 sm_90.
If you want to use the NVIDIA GeFo

Start training


AcceleratorError: CUDA error: no kernel image is available for execution on the device
Search for `cudaErrorNoKernelImageForDevice' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
