In [1]:
import json
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
# from datasets import Dataset, load_metric
from peft import LoraConfig, get_peft_model
with open("../dataset/telecom_intent_dataset.json", "r") as file:
    dataset = json.load(file)

  from .autonotebook import tqdm as notebook_tqdm
2026-01-20 15:54:21.982420: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Preprocessing 

In [2]:
# Flatten dataset
examples = []

for dict in dataset:
        tag = dict['tag']
        for  pattern in dict['patterns']:
            examples.append({
                "input": pattern,
                "output": tag
            })

In [3]:
examples

[{'input': 'Why was I charged extra this month?', 'output': 'billing_issue'},
 {'input': "I don't understand the roaming fees on my bill.",
  'output': 'billing_issue'},
 {'input': 'Can you explain the late payment penalty?',
  'output': 'billing_issue'},
 {'input': 'My bill shows a charge I never made.', 'output': 'billing_issue'},
 {'input': "I'm being billed for services I cancelled.",
  'output': 'billing_issue'},
 {'input': 'There’s a mysterious $10 fee—what is it for?',
  'output': 'billing_issue'},
 {'input': 'Why is my data overage cost so high?', 'output': 'billing_issue'},
 {'input': 'I was promised a discount but it’s not on my invoice.',
  'output': 'billing_issue'},
 {'input': 'Please remove the international call charge from my bill.',
  'output': 'billing_issue'},
 {'input': 'My bill is higher than usual, can you check it?',
  'output': 'billing_issue'},
 {'input': 'I was double‑charged for my last payment.',
  'output': 'billing_issue'},
 {'input': 'The tax amount on my

In [4]:
import pandas as pd
dataset_pd = pd.DataFrame(examples)
dataset_pd.head()

Unnamed: 0,input,output
0,Why was I charged extra this month?,billing_issue
1,I don't understand the roaming fees on my bill.,billing_issue
2,Can you explain the late payment penalty?,billing_issue
3,My bill shows a charge I never made.,billing_issue
4,I'm being billed for services I cancelled.,billing_issue


In [5]:
dataset_pd['output'].value_counts()

output
billing_issue      50
internet_issue     50
calls_issue        50
bundle_change      50
balance_inquiry    50
recharge           50
account_support    50
complaint          50
Name: count, dtype: int64

In [6]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
dataset_pd['label'] = label_encoder.fit_transform(dataset_pd['output'])
dataset_pd.head()

Unnamed: 0,input,output,label
0,Why was I charged extra this month?,billing_issue,2
1,I don't understand the roaming fees on my bill.,billing_issue,2
2,Can you explain the late payment penalty?,billing_issue,2
3,My bill shows a charge I never made.,billing_issue,2
4,I'm being billed for services I cancelled.,billing_issue,2


In [10]:
#2. Save the encoder to a file
import pickle
with open('label_encoder.pkl', 'wb') as file:
    pickle.dump(label_encoder, file)

In [7]:
# Save label mapping for inference
del dict  # restores the built-in dict
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label mapping:", label_mapping)

Label mapping: {'account_support': np.int64(0), 'balance_inquiry': np.int64(1), 'billing_issue': np.int64(2), 'bundle_change': np.int64(3), 'calls_issue': np.int64(4), 'complaint': np.int64(5), 'internet_issue': np.int64(6), 'recharge': np.int64(7)}


In [8]:
train_df, val_df = train_test_split(dataset_pd, test_size=0.2, stratify=dataset_pd["label"], random_state=42)

In [9]:
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [10]:
def tokenize_function(df):
    return tokenizer(df["input"])

In [11]:
from datasets import Dataset
train_dataset= Dataset.from_pandas(train_df)
val_dataset= Dataset.from_pandas(val_df)

In [12]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map: 100%|██████████| 320/320 [00:00<00:00, 37306.54 examples/s]
Map: 100%|██████████| 80/80 [00:00<00:00, 15629.25 examples/s]


In [13]:
train_dataset[-1]

{'label': tensor(1),
 'input_ids': tensor([ 101, 2507, 2033, 2026, 5703, 1012,  102]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1])}

In [14]:
base_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(label_encoder.classes_))

# Apply LoRA (PEFT)
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_lin", "v_lin"],  # DistilBERT attention layers
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()  # verify only LoRA params are trainable

trainable params: 744,200 || all params: 67,703,824 || trainable%: 1.0992


In [16]:
import evaluate

# Load metric
metric = evaluate.load("accuracy")

# Example compute_metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)  # for classification
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="./results_lora",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-4,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.8926,1.566858,0.675
2,0.8675,0.623227,0.825
3,0.5267,0.490319,0.7875
4,0.3595,0.455777,0.8125
5,0.2397,0.495457,0.8125
6,0.2784,0.408877,0.8375
7,0.2223,0.462204,0.825
8,0.1063,0.44779,0.8125
9,0.134,0.439961,0.825
10,0.1069,0.442694,0.8375


TrainOutput(global_step=200, training_loss=0.4909278628230095, metrics={'train_runtime': 15.2966, 'train_samples_per_second': 209.196, 'train_steps_per_second': 13.075, 'total_flos': 14942374864896.0, 'train_loss': 0.4909278628230095, 'epoch': 10.0})

In [17]:
model.save_pretrained("./lora_finetuned_model")
tokenizer.save_pretrained("./lora_finetuned_model")

('./lora_finetuned_model/tokenizer_config.json',
 './lora_finetuned_model/special_tokens_map.json',
 './lora_finetuned_model/vocab.txt',
 './lora_finetuned_model/added_tokens.json',
 './lora_finetuned_model/tokenizer.json')

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel
import torch

# Paths
lora_path = "lora_finetuned_model"
base_model_name = "distilbert-base-uncased" 

# 1. Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# 2. Load, Merge, and Optimize
base_model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    num_labels=8  
)
model = PeftModel.from_pretrained(base_model, lora_path)
model = model.merge_and_unload() # Fuses LoRA weights into base weights

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# Inference function
def predict_intent(text):
    # .to(device) ensures tensors are on GPU if available
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    
    with torch.inference_mode(): # Faster/lighter than no_grad()
        outputs = model(**inputs)
        logits = outputs.logits
        pred_index = logits.argmax(dim=-1).item()
    
    # Correcting the inverse_transform call
    return label_encoder.inverse_transform([pred_index])[0]

# Example usage
text = "Can I pause my current bundle?"
try:
    print("Predicted intent:", predict_intent(text))
except NameError:
    print("Note: Ensure 'label_encoder' is defined/loaded before running.")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted intent: bundle_change


In [None]:
model.save_pretrained("models/intent_classifier_merged")
tokenizer.save_pretrained("models/intent_classifier_merged")