In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EvalPrediction
from datasets import Dataset, ClassLabel, Features
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [2]:
# --- 1. Load Data and Prepare for the SPECIALIST Model ---
df_full = pd.read_parquet('processed_customer_support_data.parquet')

# Exclude Account Access
df_specialist_data = df_full[df_full['Category'] != 'Account Access'].copy()

print(f"Original dataset size: {len(df_full)}")
print(f"Specialist dataset size (excluding Account Access): {len(df_specialist_data)}")
print("\nNew class distribution:")
print(df_specialist_data['Category'].value_counts(normalize=True))

Original dataset size: 8469
Specialist dataset size (excluding Account Access): 7960

New class distribution:
Category
Billing            0.600251
Technical Issue    0.207035
General Query      0.192714
Name: proportion, dtype: float64


In [3]:
# Combined text feature
df_specialist_data['text'] = df_specialist_data['Ticket Subject'] + " | " + df_specialist_data['Cleaned_Description']

# --- 2. Hugging Face Dataset ---
class_names_specialist = sorted(df_specialist_data['Category'].unique().tolist())
label_map_specialist = {label: i for i, label in enumerate(class_names_specialist)}
df_specialist_data['label'] = df_specialist_data['Category'].map(label_map_specialist)

df_specialist = df_specialist_data[['text', 'label', 'Category']]

hg_dataset_specialist = Dataset.from_pandas(df_specialist, preserve_index=False)
features_specialist = Features({
    'text': hg_dataset_specialist.features['text'],
    'label': ClassLabel(names=class_names_specialist),
    'Category': hg_dataset_specialist.features['Category']
})
hg_dataset_specialist = hg_dataset_specialist.cast(features_specialist)

Casting the dataset:   0%|          | 0/7960 [00:00<?, ? examples/s]

In [4]:
# --- 3. Tokenize and Split ---
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = hg_dataset_specialist.map(tokenize_function, batched=True).remove_columns(["text", "Category"])
tokenized_datasets.set_format("torch")

split_dataset = tokenized_datasets.train_test_split(test_size=0.2, stratify_by_column="label")
train_dataset = split_dataset["train"]
eval_test_split = split_dataset["test"].train_test_split(test_size=0.5, stratify_by_column="label")
eval_dataset = eval_test_split["train"]
test_dataset = eval_test_split["test"]

Map:   0%|          | 0/7960 [00:00<?, ? examples/s]

In [5]:
# --- 4. Train the Specialist Model ---
id2label = {i: label for i, label in enumerate(class_names_specialist)}
label2id = {label: i for i, label in enumerate(class_names_specialist)}

model_specialist = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=len(class_names_specialist), id2label=id2label, label2id=label2id
)

training_args_specialist = TrainingArguments(
    output_dir="distilbert-specialist-classifier",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    dataloader_num_workers=0,
    report_to=[]
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def compute_metrics(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

trainer_specialist = Trainer(
    model=model_specialist,
    args=training_args_specialist,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer_specialist = Trainer(


In [7]:
print("\nStarting training for the SPECIALIST model...")
trainer_specialist.train()
print("Specialist model training complete.")

# --- 5. Evaluate and Save the Specialist Model ---
print("\nEvaluating the Specialist model...")
predictions = trainer_specialist.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = np.array(test_dataset['label'])

print(f"\nAccuracy: {accuracy_score(y_true, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=class_names_specialist))

trainer_specialist.save_model("best-specialist-model")
print("\nBest Specialist model explicitly saved to 'best-specialist-model'")


Starting training for the SPECIALIST model...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.949436,0.600503,0.450613
2,0.956400,0.952011,0.600503,0.450613
3,0.950900,0.950065,0.600503,0.450613




Specialist model training complete.

Evaluating the Specialist model...





Accuracy: 0.5992

Classification Report:
                 precision    recall  f1-score   support

        Billing       0.60      1.00      0.75       477
  General Query       0.00      0.00      0.00       154
Technical Issue       0.00      0.00      0.00       165

       accuracy                           0.60       796
      macro avg       0.20      0.33      0.25       796
   weighted avg       0.36      0.60      0.45       796


Best Specialist model explicitly saved to 'best-specialist-model'


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
