In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, ClassLabel, Features
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np

In [2]:
# --- 1. Load Data and Prepare for the GATEKEEPER Model ---
# The gatekeeper is a binary classifier: 'Account Access' tickets vs. everything else.
df = pd.read_parquet('processed_customer_support_data.parquet')

# Join subject and description into a single model input. Missing values are
# replaced with "" first: in pandas, NaN + str is NaN, so one missing field
# would otherwise wipe out the whole text for that row and surface later as a
# None value that the tokenizer cannot handle.
df['combined_text'] = df['Ticket Subject'].fillna('') + " | " + df['Cleaned_Description'].fillna('')

# Binary label for Gatekeeper. Anything that is not exactly 'Account Access'
# (including a missing category) falls into 'Other'.
df['gatekeeper_label'] = np.where(df['Category'] == 'Account Access', 'Account Access', 'Other')

# Prepare HF dataset
# Position in this list is the integer class id: 'Other' -> 0, 'Account Access' -> 1.
class_names_gatekeeper = ['Other', 'Account Access']
label_map_gatekeeper = {label: i for i, label in enumerate(class_names_gatekeeper)}
df['label'] = df['gatekeeper_label'].map(label_map_gatekeeper)
assert df['label'].notna().all(), "every row must map to a known gatekeeper class"

# Keep only the columns the dataset needs; the model input column is named 'text'.
df_gatekeeper = df[['combined_text', 'label', 'gatekeeper_label']].rename(columns={'combined_text': 'text'})

hg_dataset_gatekeeper = Dataset.from_pandas(df_gatekeeper, preserve_index=False)
# Re-declare 'label' as a ClassLabel (the other two columns keep their inferred
# types); the split cell stratifies on this column.
features_gatekeeper = Features({
    'text': hg_dataset_gatekeeper.features['text'],
    'label': ClassLabel(names=class_names_gatekeeper),
    'gatekeeper_label': hg_dataset_gatekeeper.features['gatekeeper_label']
})
hg_dataset_gatekeeper = hg_dataset_gatekeeper.cast(features_gatekeeper)

Casting the dataset:   0%|          | 0/8469 [00:00<?, ? examples/s]

In [3]:
# --- 2. Tokenize and Split ---
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    """Tokenize one batch of examples for the gatekeeper classifier.

    Args:
        examples: Batch mapping handed over by ``Dataset.map(..., batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The tokenizer output for ``examples["text"]``, truncated to at most
        512 tokens and left unpadded.
    """
    # No padding here on purpose: the Trainer below receives this tokenizer and
    # pads each batch to its own longest sequence at collation time. Padding
    # every ticket to 512 tokens up front only inflates memory use and compute.
    return tokenizer(examples["text"], truncation=True, max_length=512)

# Tokenize in batches, then drop the raw string columns so that only the
# tokenizer outputs and the integer 'label' column remain.
tokenized_datasets = (
    hg_dataset_gatekeeper
    .map(tokenize_function, batched=True)
    .remove_columns(["text", "gatekeeper_label"])
)
tokenized_datasets.set_format("torch")

# Fixed seed so that a fresh "Restart & Run All" reproduces the same partitions
# (and therefore comparable metrics) instead of reshuffling on every run.
SPLIT_SEED = 42

# Split into train/val/test (80/10/10), stratified on the class label so the
# rare 'Account Access' class keeps the same share in every partition.
split_dataset = tokenized_datasets.train_test_split(
    test_size=0.2, stratify_by_column="label", seed=SPLIT_SEED
)
train_dataset = split_dataset["train"]
# The held-out 20% is halved into validation and test.
eval_test_split = split_dataset["test"].train_test_split(
    test_size=0.5, stratify_by_column="label", seed=SPLIT_SEED
)
eval_dataset = eval_test_split["train"]
test_dataset = eval_test_split["test"]

# Sanity check: the three partitions account for every example exactly once.
assert len(train_dataset) + len(eval_dataset) + len(test_dataset) == len(tokenized_datasets)

Map:   0%|          | 0/8469 [00:00<?, ? examples/s]

In [4]:

# --- 3. Train the Gatekeeper Model ---
# Two-way lookups between integer class ids and class names; both are handed to
# the model so they are stored alongside it.
id2label = dict(enumerate(class_names_gatekeeper))
label2id = {name: idx for idx, name in id2label.items()}

model_gatekeeper = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

training_args_gatekeeper = TrainingArguments(
    # Where checkpoints are written.
    output_dir="distilbert-gatekeeper-classifier",
    # Optimisation settings.
    num_train_epochs=2,
    per_device_train_batch_size=16,
    weight_decay=0.01,
    dataloader_num_workers=0,
    # Evaluate and checkpoint once per epoch, then reload whichever checkpoint
    # scored the highest "f1" (the key returned by compute_metrics).
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # Empty list: no external experiment trackers (W&B, MLflow, ...).
    report_to=[],
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def compute_metrics(p):
    """Compute the evaluation metrics for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the reference class ids).

    Returns:
        dict with ``"accuracy"`` and ``"f1"``; the F1 value is the
        support-weighted average over classes.
    """
    y_true = p.label_ids
    # The predicted class is the index of the highest score in each row.
    y_hat = np.argmax(p.predictions, axis=1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat, average="weighted"),
    }

# Wire the model, the data partitions and the metric function into one Trainer.
trainer_gatekeeper = Trainer(
    # What to train and how.
    args=training_args_gatekeeper,
    model=model_gatekeeper,
    tokenizer=tokenizer,
    # Data: fit on the training partition, score each epoch on validation.
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
    # Supplies the "accuracy" and "f1" values reported at each evaluation.
    compute_metrics=compute_metrics,
)

  trainer_gatekeeper = Trainer(


In [6]:
print("Starting training for the GATEKEEPER model...")
trainer_gatekeeper.train()
print("Gatekeeper model training complete.")

# --- 4. Final Evaluation on Test Set ---
print("\nEvaluating the Gatekeeper model...")
predictions = trainer_gatekeeper.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
# Take the reference labels from the prediction output itself rather than
# re-reading the torch-formatted dataset column: they are returned together
# with the scores, so the two arrays are aligned by construction.
y_true = np.asarray(predictions.label_ids)

print(f"\nAccuracy: {accuracy_score(y_true, y_pred):.4f}")
print("\nClassification Report:")
# Passing `labels` pins the report to both classes; with `target_names` alone
# the call raises if a class is absent from both y_true and y_pred.
print(classification_report(
    y_true,
    y_pred,
    labels=list(range(len(class_names_gatekeeper))),
    target_names=class_names_gatekeeper,
))

# Save the best model checkpoint for later use.
# load_best_model_at_end=True is set in the training arguments, so the weights
# held by the trainer at this point are the best checkpoint's.
best_model_path = trainer_gatekeeper.state.best_model_checkpoint
print(f"\nBest Gatekeeper model saved at: {best_model_path}")
trainer_gatekeeper.save_model("best-gatekeeper-model")
print("Best Gatekeeper model explicitly saved to 'best-gatekeeper-model'")

Starting training for the GATEKEEPER model...




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,6.4e-05,1.0,1.0
2,0.015600,3.3e-05,1.0,1.0




Gatekeeper model training complete.

Evaluating the Gatekeeper model...





Accuracy: 1.0000

Classification Report:
                precision    recall  f1-score   support

         Other       1.00      1.00      1.00       796
Account Access       1.00      1.00      1.00        51

      accuracy                           1.00       847
     macro avg       1.00      1.00      1.00       847
  weighted avg       1.00      1.00      1.00       847


Best Gatekeeper model saved at: distilbert-gatekeeper-classifier/checkpoint-424
Best Gatekeeper model explicitly saved to 'best-gatekeeper-model'
