In [14]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader  # <-- FIX IS HERE
from torch.optim import AdamW                   # <-- AND HERE
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    get_linear_schedule_with_warmup          # <-- AND HERE
)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import os

# --- Load and Process Data ---
df = pd.read_csv("exam_queries.csv")

# --- Encode Labels ---
le = LabelEncoder()
df["label"] = le.fit_transform(df["intent"])

# **CRITICAL FIX**: Save the correctly ordered label names
label_names = list(le.classes_)
num_labels = len(label_names)
print(f"Found {num_labels} labels: {label_names}")

# --- Split Data ---
train_df, test_df = train_test_split(
    df, 
    test_size=0.2, 
    random_state=42,
    stratify=df["label"]
)

print(f"Train size: {len(train_df)} | Test size: {len(test_df)}")

Found 4 labels: ['get_exam_hall', 'get_exam_schedule', 'get_hall_info', 'get_seat_location']
Train size: 160 | Test size: 40


In [15]:
# --- Define Hyperparameters ---
MODEL_NAME = "distilbert-base-uncased"
MAX_LENGTH = 128

# --- Load Tokenizer ---
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

# --- Tokenize Text ---
train_encodings = tokenizer(
    list(train_df["text"]),
    truncation=True,
    padding=True,
    max_length=MAX_LENGTH
)
test_encodings = tokenizer(
    list(test_df["text"]),
    truncation=True,
    padding=True,
    max_length=MAX_LENGTH
)

# --- Create Custom Torch Dataset ---
class ExamDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Convert to tensors on-the-fly
        item = {key: torch.tensor(val[idx], dtype=torch.long) 
                for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ExamDataset(train_encodings, list(train_df["label"]))
test_dataset = ExamDataset(test_encodings, list(test_df["label"]))

print("Datasets created successfully.")

Datasets created successfully.


In [16]:
# --- Training Hyperparameters ---
EPOCHS = 10
LR = 5e-5
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 32
OUTPUT_DIR = "./exam_intent_model"

# --- Setup Device ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- Load Model ---
model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels
)
model.to(device) # Move model to GPU

# --- Create Dataloaders ---
train_loader = DataLoader(
    train_dataset, 
    batch_size=TRAIN_BATCH_SIZE, 
    shuffle=True
)
eval_loader = DataLoader(
    test_dataset, 
    batch_size=EVAL_BATCH_SIZE, 
    shuffle=False
)

# --- Optimizer & Scheduler ---
optimizer = AdamW(model.parameters(), lr=LR)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.06 * total_steps), # 6% warmup
    num_training_steps=total_steps
)

print("Model, Dataloaders, and Optimizer are ready.")

Using device: cpu


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model, Dataloaders, and Optimizer are ready.


In [None]:
# --- Training Loop ---
best_eval_acc = 0.0
print("Starting training...")

for epoch in range(1, EPOCHS + 1):
    # ===== Train =====
    model.train()
    train_losses = []
    for batch in train_loader:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_losses.append(loss.item())
    
    avg_train_loss = np.mean(train_losses)

    # ===== Evaluate =====
    model.eval()
    all_preds = []
    all_labels = []
    eval_losses = []
    with torch.no_grad():
        for batch in eval_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            logits = outputs.logits

            preds = torch.argmax(logits, dim=-1).cpu().numpy()
            labels = batch["labels"].cpu().numpy()

            all_preds.extend(preds.tolist())
            all_labels.extend(labels.tolist())
            eval_losses.append(loss.item())

    avg_eval_loss = np.mean(eval_losses)
    eval_acc = accuracy_score(all_labels, all_preds)

    print(f"Epoch {epoch}/{EPOCHS} â€” "
          f"train_loss: {avg_train_loss:.4f} â€” "
          f"eval_loss: {avg_eval_loss:.4f} â€” "
          f"eval_acc: {eval_acc:.4f}")

    # ===== Save Best Model =====
    if eval_acc > best_eval_acc:
        best_eval_acc = eval_acc
        model.save_pretrained(OUTPUT_DIR)
        tokenizer.save_pretrained(OUTPUT_DIR)
        
        # **FIX**: Save the label names in the correct order
        with open(os.path.join(OUTPUT_DIR, "label_names.txt"), "w") as f:
            for label in label_names:
                f.write(label + "\n")
                
        print(f" Saved best model (acc {best_eval_acc:.4f}) to {OUTPUT_DIR}")

print(f"ðŸŽ¯ Training finished. Best eval accuracy: {best_eval_acc}")

Starting training...
Epoch 1/10 â€” train_loss: 1.3652 â€” eval_loss: 1.2344 â€” eval_acc: 0.7750
âœ… Saved best model (acc 0.7750) to ./exam_intent_model
Epoch 2/10 â€” train_loss: 1.0162 â€” eval_loss: 0.6628 â€” eval_acc: 0.9500
âœ… Saved best model (acc 0.9500) to ./exam_intent_model
Epoch 3/10 â€” train_loss: 0.4820 â€” eval_loss: 0.3019 â€” eval_acc: 1.0000
âœ… Saved best model (acc 1.0000) to ./exam_intent_model
Epoch 4/10 â€” train_loss: 0.1973 â€” eval_loss: 0.1505 â€” eval_acc: 1.0000
Epoch 5/10 â€” train_loss: 0.0898 â€” eval_loss: 0.0927 â€” eval_acc: 1.0000
Epoch 6/10 â€” train_loss: 0.0555 â€” eval_loss: 0.0550 â€” eval_acc: 1.0000
Epoch 7/10 â€” train_loss: 0.0405 â€” eval_loss: 0.0457 â€” eval_acc: 1.0000
Epoch 8/10 â€” train_loss: 0.0364 â€” eval_loss: 0.0467 â€” eval_acc: 1.0000
Epoch 9/10 â€” train_loss: 0.0331 â€” eval_loss: 0.0453 â€” eval_acc: 1.0000
Epoch 10/10 â€” train_loss: 0.0315 â€” eval_loss: 0.0438 â€” eval_acc: 1.0000
ðŸŽ¯ Training finished. Best eval acc

In [18]:
# --- Load Saved Model and Tokenizer ---
model_path = "./exam_intent_model"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

try:
    inference_tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
    inference_model = DistilBertForSequenceClassification.from_pretrained(model_path)
    inference_model.to(device)
    inference_model.eval()

    # **THE FIX**: Load the labels from the file we saved during training
    # This prevents any mismatch between index and label name
    inference_labels = []
    with open(os.path.join(model_path, "label_names.txt"), "r") as f:
        inference_labels = [line.strip() for line in f.readlines()]
    
    print(f"Loaded model and labels for inference: {inference_labels}")

except OSError:
    print(f"Error: Could not load model from {model_path}. Was training successful?")
    # Stop if the model doesn't exist
    raise Exception("Model not found")


# --- Prediction Function ---
def predict_intent(text):
    inputs = inference_tokenizer(
        text, 
        return_tensors="pt", 
        truncation=True, 
        padding=True, 
        max_length=128
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = inference_model(**inputs)
        probs = F.softmax(outputs.logits, dim=-1)
        pred_idx = torch.argmax(probs, dim=-1).item()
    
    # Map the predicted index back to the *correct* label name
    return inference_labels[pred_idx]

# --- Test Examples ---
test_texts = [
    "Where is my exam hall?",
    "What is my seat number?",
    "When is my next exam?",
    "How many benches are in hall 2?",
    "Tell me my bench for physics",
    "List all available exam halls",
    "Which exam do I have today?",
    "Where will I sit for chemistry?"
]

for txt in test_texts:
    pred = predict_intent(txt)
    print(f" Text: {txt}\n Predicted intent: {pred}\n")

Loaded model and labels for inference: ['get_exam_hall', 'get_exam_schedule', 'get_hall_info', 'get_seat_location']
 Text: Where is my exam hall?
 Predicted intent: get_exam_hall

 Text: What is my seat number?
 Predicted intent: get_seat_location

 Text: When is my next exam?
 Predicted intent: get_exam_schedule

 Text: How many benches are in hall 2?
 Predicted intent: get_hall_info

 Text: Tell me my bench for physics
 Predicted intent: get_seat_location

 Text: List all available exam halls
 Predicted intent: get_hall_info

 Text: Which exam do I have today?
 Predicted intent: get_exam_schedule

 Text: Where will I sit for chemistry?
 Predicted intent: get_seat_location

