In [11]:
import json
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset
import matplotlib.pyplot as plt
import seaborn as sns


In [20]:
# --- Model ---------------------------------------------------------------
# Checkpoint identifier handed to from_pretrained() for the tokenizer and model.
MODEL_NAME: str = "roberta-base"

# Root directory for Trainer checkpoints (passed as TrainingArguments.output_dir).
OUTPUT_DIR: str = "../models/intent_classifier"

# --- Data ----------------------------------------------------------------
# Pre-split train / test sets, read with pd.read_json.
TRAIN_PATH: str = "../data/train_split.json"
TEST_PATH: str = "../data/test_split.json"

# NOTE(review): not referenced by any cell visible here; presumably a
# label-mapping file kept next to the data splits. Confirm before relying on it.
LABEL_MAP_PATH: str = "../data/label_mapping.json"

In [21]:
# Load the train / test splits (JSON) into DataFrames
train_df = pd.read_json(TRAIN_PATH)
test_df = pd.read_json(TEST_PATH)

# Build the label <-> id mapping from the training labels only. Sorting makes
# the id assignment deterministic regardless of row order.
all_labels = sorted(train_df["label"].unique())
label2id = {label: idx for idx, label in enumerate(all_labels)}
id2label = {idx: label for label, idx in label2id.items()}

# Series.map() turns any label missing from label2id into NaN without raising,
# so a test label never seen in training would only surface later as an obscure
# failure. Fail fast here with an explicit message instead.
unknown_test_labels = (
    test_df.loc[~test_df["label"].isin(all_labels), "label"].unique().tolist()
)
if unknown_test_labels:
    raise ValueError(
        f"Test set contains labels not present in the training set: {unknown_test_labels}"
    )

# Integer class ids consumed by the model
train_df["label_id"] = train_df["label"].map(label2id)
test_df["label_id"] = test_df["label"].map(label2id)

print("Label mapping:", label2id)
train_df.head()


Label mapping: {'needs_rag': 0, 'no_rag': 1}


Unnamed: 0,text,label,label_id
0,What's the process for reporting harassment?,needs_rag,0
1,I’m feeling really motivated by my team!,no_rag,1
2,How do I request a reference from HR?,needs_rag,0
3,I'm struggling to balance work and personal life,no_rag,1
4,You make the workplace feel like a second family,no_rag,1


In [23]:
from transformers import AutoTokenizer
from datasets import Dataset

# MODEL_NAME comes from the configuration cell. It is deliberately not
# redefined here, so the tokenizer and the model always use the same checkpoint.

# Fixed sequence length: shorter inputs are padded to it, longer ones truncated.
MAX_SEQ_LENGTH = 128

# Load the tokenizer that belongs to the MODEL_NAME checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize(batch, max_length=MAX_SEQ_LENGTH):
    """Tokenize the ``"text"`` entries of a batch to a fixed length.

    Args:
        batch: Mapping with a ``"text"`` key holding the raw input text(s),
            as supplied by ``Dataset.map(..., batched=True)``.
        max_length: Number of tokens every example is padded or truncated to.
            Defaults to ``MAX_SEQ_LENGTH``.

    Returns:
        The tokenizer output for ``batch["text"]``.
    """
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

def _frame_to_dataset(frame):
    """Wrap the text / label_id columns of ``frame`` in a Hugging Face Dataset.

    The ``label_id`` column is exposed under the name ``label``.
    """
    selected = frame[["text", "label_id"]]
    return Dataset.from_pandas(selected.rename(columns={"label_id": "label"}))


# Build each split and tokenize it in batches
train_ds = _frame_to_dataset(train_df).map(tokenize, batched=True)
test_ds = _frame_to_dataset(test_df).map(tokenize, batched=True)


Map: 100%|██████████| 990/990 [00:00<00:00, 10460.80 examples/s]
Map: 100%|██████████| 248/248 [00:00<00:00, 16532.69 examples/s]


In [24]:
# Label metadata handed to the model: number of classes plus both directions
# of the label <-> id mapping.
label_config = {
    "num_labels": len(label2id),
    "id2label": id2label,
    "label2id": label2id,
}

# Sequence-classification model loaded from the MODEL_NAME checkpoint
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **label_config)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    """Compute accuracy and weighted precision / recall / F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (gold class ids) and ``predictions``
            (per-class scores, or a tuple whose first element is those scores).

    Returns:
        dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``. Precision, recall and F1 use ``average="weighted"``.
    """
    labels = pred.label_ids

    scores = pred.predictions
    # Predictions may arrive as a tuple of arrays (scores first, extras after);
    # calling .argmax on the tuple itself would raise AttributeError.
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = scores.argmax(-1)

    # zero_division=0 reports 0 for a class that is never predicted, without
    # emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }


In [26]:
import numpy as np
from sklearn.metrics import accuracy_score , f1_score

In [27]:
# Fine-tuning configuration. Evaluation and checkpointing both run once per
# epoch so the best checkpoint can be restored when training ends.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # the "f1" key returned by compute_metrics
    greater_is_better=True,      # higher F1 means a better checkpoint
)

# NOTE(review): eval_dataset is the test split, and load_best_model_at_end
# selects the checkpoint by F1 on that same split. Metrics reported on test_ds
# are therefore optimistic; consider holding out a validation split from
# train_ds for checkpoint selection and scoring test_ds once at the end.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    # `tokenizer=` is deprecated on Trainer.__init__ (it triggers a warning);
    # `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0106,0.022392,0.995968,0.995967,0.995999,0.995968
2,0.0006,0.001382,1.0,1.0,1.0,1.0
3,0.0004,0.000198,1.0,1.0,1.0,1.0
4,0.0004,0.000178,1.0,1.0,1.0,1.0




TrainOutput(global_step=248, training_loss=0.051174977773635257, metrics={'train_runtime': 1859.9393, 'train_samples_per_second': 2.129, 'train_steps_per_second': 0.133, 'total_flos': 260479944806400.0, 'train_loss': 0.051174977773635257, 'epoch': 4.0})

In [29]:
# Save the fine-tuned model and its tokenizer into the same directory.
# The path is derived from OUTPUT_DIR (it resolves to the same location as the
# previously hard-coded string) so it follows any change to the config cell.
FINAL_MODEL_DIR = f"{OUTPUT_DIR}/final"

model.save_pretrained(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)


('../models/intent_classifier/final\\tokenizer_config.json',
 '../models/intent_classifier/final\\special_tokens_map.json',
 '../models/intent_classifier/final\\vocab.json',
 '../models/intent_classifier/final\\merges.txt',
 '../models/intent_classifier/final\\added_tokens.json',
 '../models/intent_classifier/final\\tokenizer.json')