# Fine-tune Intent Classification Model
## Customer Service Chatbot - Intent Classifier Training

This notebook fine-tunes a DistilBERT model for intent classification.

**Platform:** Google Colab or Kaggle

**Steps:**
1. Install dependencies
2. Load training data
3. Prepare dataset
4. Fine-tune DistilBERT
5. Evaluate and save model

## 1. Install Dependencies

In [None]:
!pip install transformers datasets torch scikit-learn accelerate -q

## 2. Import Libraries

In [None]:
import json
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

# Report which device PyTorch will run on: CUDA when a GPU is visible, otherwise the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {torch.device(device_name)}")

## 3. Load Training Data

**Note:** Upload `train_data.json` and `val_data.json` to Colab/Kaggle before running this cell.

In Colab: Click folder icon → Upload files

In Kaggle: Add files in the Input section

In [None]:
# Load the training and validation examples from the uploaded JSON files.
# The encoding is explicit so non-ASCII utterances decode the same way on every platform
# instead of depending on the locale default.
with open('train_data.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open('val_data.json', 'r', encoding='utf-8') as f:
    val_data = json.load(f)

# Fail with a readable message instead of an IndexError on the sample print below.
if not train_data:
    raise ValueError("train_data.json contains no training examples")

print(f"Training examples: {len(train_data)}")
print(f"Validation examples: {len(val_data)}")
print(f"\nSample: {train_data[0]}")

## 4. Prepare Dataset

In [None]:
# Build the label <-> id mappings from the intents present in the training set.
# Sorting makes the id assignment deterministic from one run to the next.
unique_labels = sorted({item['label'] for item in train_data})
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}

print(f"Number of intents: {len(unique_labels)}")
print(f"Labels: {unique_labels}")

# The mapping only knows training labels, so report unseen validation intents explicitly
# instead of failing with a bare KeyError in the conversion loop below.
unknown_labels = sorted({item['label'] for item in val_data} - set(label2id))
if unknown_labels:
    raise ValueError(f"Validation labels missing from the training data: {unknown_labels}")

# Convert labels to IDs: the integer class id is stored under 'labels',
# alongside the original string under 'label'.
for item in train_data:
    item['labels'] = label2id[item['label']]

for item in val_data:
    item['labels'] = label2id[item['label']]

# Create Hugging Face datasets
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)

print("\nDataset created successfully!")

## 5. Tokenize Data

In [None]:
# Checkpoint to fine-tune, and the tokenizer that belongs to it
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

# Tokenization function
def tokenize_function(examples):
    """Tokenize one batch of examples for the classifier.

    Args:
        examples: Batch handed over by ``Dataset.map(..., batched=True)``; its
            ``'text'`` entry is passed to the tokenizer.

    Returns:
        The tokenizer output for the batch, truncated to at most 128 tokens per
        example and left unpadded.
    """
    # No padding here on purpose: the DataCollatorWithPadding given to the Trainer pads
    # each training batch to its own longest sequence. Padding at map time would stretch
    # every row to the longest example of the whole map batch and waste compute.
    return tokenizer(examples['text'], truncation=True, max_length=128)

# Run the tokenizer over both splits (training first, then validation), batch by batch
tokenized_train, tokenized_val = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

print("âœ… Tokenization complete!")

## 6. Load Model

In [None]:
# Instantiate the pretrained checkpoint with a classification head sized to the intent
# set, handing over both label mappings along with it.
num_intents = len(unique_labels)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    label2id=label2id,
    id2label=id2label,
    num_labels=num_intents,
)

print(f"âœ… Model loaded: {model_name}")
param_count = model.num_parameters()
print(f"ðŸ“Š Parameters: {param_count:,}")

## 7. Define Metrics

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds per-class
            scores with the classes on axis 1, ``labels`` the true class ids.

    Returns:
        Dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'``.
    """
    predictions, labels = eval_pred
    # Highest-scoring class per example
    predictions = np.argmax(predictions, axis=1)

    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 scores a never-predicted intent as 0 (the same value as the default)
    # without emitting an UndefinedMetricWarning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

## 8. Training Arguments

In [None]:
# Hyperparameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./intent_classifier",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same schedule so the best epoch (by F1) can be
    # reloaded once training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=False,
    logging_steps=10,
    # NOTE(review): a fixed 100 warmup steps can cover most of a 3-epoch run on a small
    # dataset - confirm against the actual number of training steps.
    warmup_steps=100,
    # Disable external experiment trackers: with the default, an installed integration
    # such as Weights & Biases can be picked up automatically and ask for a login.
    report_to="none",
)

## 9. Train Model

In [None]:
# Collator that pads the examples of each batch with the tokenizer's padding settings
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Wire the model, both tokenized splits, the collator and the metric function together
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

# Run the fine-tuning loop, announcing its start and end
print("ðŸš€ Starting training...")
trainer.train()
print("âœ… Training complete!")

## 10. Evaluate Model

In [None]:
# Score the trained model on the validation split and list every reported metric
results = trainer.evaluate()
print("\nðŸ“Š Evaluation Results:")
for metric_name in results:
    metric_value = results[metric_name]
    print(f"  {metric_name}: {metric_value:.4f}")

## 11. Test Predictions

In [None]:
# Test the model
def predict_intent(text, max_length=128):
    """Classify a single utterance with the fine-tuned model.

    Args:
        text: The utterance to classify.
        max_length: Truncation limit in tokens. Defaults to 128, the limit that
            tokenize_function applies to the training data, so inference sees inputs
            cut the same way as training did.

    Returns:
        Tuple ``(intent, confidence)``: the label name looked up in ``id2label`` and
        its softmax probability as a Python float.
    """
    # Make sure dropout is off so repeated calls give the same probabilities
    model.eval()

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Row 0 is the single input; reduce over the class dimension explicitly instead of
    # taking an argmax over the flattened tensor.
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
    predicted_class_id = probabilities.argmax(dim=-1).item()
    confidence = probabilities[predicted_class_id].item()

    return id2label[predicted_class_id], confidence

# Sample utterances to run through the classifier
test_examples = [
    "Hello, how are you?",
    "Where is my package?",
    "How much does this cost?",
    "I want to return my order",
    "What products do you have?"
]

print("ðŸ§ª Testing predictions:\n")
# map() is lazy, so each utterance is classified right before its result is printed
for example, (intent, confidence) in zip(test_examples, map(predict_intent, test_examples)):
    print(f"Text: '{example}'")
    print(f"  â†’ Intent: {intent} (confidence: {confidence:.2%})\n")

## 12. Save Model

In [None]:
# Write the fine-tuned model and the tokenizer files into one directory
output_dir = "./intent_classifier_final"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

# Store both label mappings next to the model.
# JSON object keys are always strings, so the integer ids of id2label come back as
# strings when this file is loaded again.
label_mappings = {'label2id': label2id, 'id2label': id2label}
with open(f"{output_dir}/label_mappings.json", 'w') as f:
    json.dump(label_mappings, f, indent=2)

print(f"âœ… Model saved to {output_dir}")
print("\nðŸ“¦ Download these files to use in your Flask app:")
print("  - All files in intent_classifier_final/")

## 13. Download Model (Colab only)

Run this if you're on Google Colab to download the trained model:

In [None]:
# Zip the model directory
import shutil

# make_archive is pure Python, so it does not depend on a `zip` binary being installed.
# base_dir keeps the files under an intent_classifier_final/ folder inside the archive,
# the same layout `zip -r` produced.
shutil.make_archive(
    "intent_classifier_final",
    "zip",
    root_dir=".",
    base_dir="intent_classifier_final",
)

# Download (Colab only)
try:
    # Kept local and guarded: this module exists only on Google Colab, and the notebook
    # is also meant to run end to end on Kaggle.
    from google.colab import files
except ImportError:
    print("Not running on Google Colab - intent_classifier_final.zip was written to the working directory.")
else:
    files.download('intent_classifier_final.zip')