In [1]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install transformers
!pip install scikit-learn pandas numpy

Looking in indexes: https://download.pytorch.org/whl/cu121


In [2]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, DistilBertPreTrainedModel, DistilBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from collections import defaultdict
import json
import os

In [3]:
import gdown

# Directory the CSVs are downloaded into. The training paths below are built
# from the same directory, so loading no longer depends on the notebook's
# current working directory happening to be this folder.
DATA_DIR = "/content"

url = "https://drive.google.com/uc?id=16o0OPLLfDV4qP0mfpaxy69qtrscgvqje"
url2 = "https://drive.google.com/uc?id=11AKlIs3Re9gdwba2QmQuKEM2kUFicCp0"
url3 = "https://drive.google.com/uc?id=1AwKP4vY7DOgsJyRG9rr9qQjiKdlyDODr"

TRAINING_FILES = [
    os.path.join(DATA_DIR, 'nlu_training_data.csv'),    # First labeled file
    os.path.join(DATA_DIR, 'nlu_training_data_2.csv'),  # Second labeled file
    os.path.join(DATA_DIR, 'nlu_training_data_3.csv'),  # Customer service specific intents (track_order, request_refund, report_delivery_delay, other)
]

# Each URL is written to the exact path that load_data() will read later.
for source_url, destination in zip((url, url2, url3), TRAINING_FILES):
    gdown.download(source_url, destination, quiet=False)



Downloading...
From: https://drive.google.com/uc?id=16o0OPLLfDV4qP0mfpaxy69qtrscgvqje
To: /content/nlu_training_data.csv
100%|██████████| 6.57M/6.57M [00:00<00:00, 37.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=11AKlIs3Re9gdwba2QmQuKEM2kUFicCp0
To: /content/nlu_training_data_2.csv
100%|██████████| 131k/131k [00:00<00:00, 3.61MB/s]
Downloading...
From: https://drive.google.com/uc?id=1AwKP4vY7DOgsJyRG9rr9qQjiKdlyDODr
To: /content/nlu_training_data_3.csv
100%|██████████| 22.8k/22.8k [00:00<00:00, 50.9MB/s]


In [4]:
# --- Configuration ---
MODEL_NAME = 'distilbert-base-uncased'
MODEL_SAVE_PATH = '/content/nlu_model'

# Hyperparameters for the ~100K-sample combined dataset
# (the recorded run reports 10 intents and 3 sentiments).
NUM_EPOCHS = 4           # Maximum passes over the training split; early stopping in train() may end sooner
BATCH_SIZE = 16          # Samples per batch for both the training and validation loaders
LEARNING_RATE = 2e-5     # Base learning rate passed to AdamW

# Learning-rate schedule and regularization
USE_SCHEDULER = True     # When True, train() uses a linear schedule with warmup
WARMUP_STEPS = 1000       # Number of warmup steps given to the scheduler
WEIGHT_DECAY = 0.02      # Weight-decay coefficient passed to AdamW

# For very large datasets (100K+), you may also consider:
# - BATCH_SIZE = 64 (if you have sufficient GPU memory)
# - Gradient accumulation if GPU memory is limited
# Note: train() already stops early when validation loss stops improving.

In [5]:
# --- 1. Define the Custom Multi-Task Model ---
# This is the core of the DL model
# It has a shared DistilBERT base and two separate output layers.
class MultiTaskDistilBert(DistilBertPreTrainedModel):
    """DistilBERT encoder shared by two linear classification heads.

    One head produces intent logits and the other sentiment logits. Both heads
    read the encoder's hidden state at sequence position 0 (the [CLS] token
    when the tokenizer adds special tokens).
    """

    def __init__(self, config, num_intent_labels, num_sentiment_labels):
        """Build the shared encoder and both heads.

        Args:
            config: DistilBERT configuration; ``config.dim`` is the input
                width of both heads.
            num_intent_labels: Number of outputs of the intent head.
            num_sentiment_labels: Number of outputs of the sentiment head.
        """
        super().__init__(config)
        hidden_size = config.dim

        # Shared encoder followed by one linear head per task.
        self.distilbert = DistilBertModel(config)
        self.intent_classifier = nn.Linear(hidden_size, num_intent_labels)
        self.sentiment_classifier = nn.Linear(hidden_size, num_sentiment_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,  # Accepted for API compatibility; losses are computed by the caller
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Run the encoder once and score both tasks.

        Returns:
            A tuple ``(intent_logits, sentiment_logits)``.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_out = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 0 is the last hidden state; keep only sequence position 0.
        first_token_state = encoder_out[0][:, 0]

        return (
            self.intent_classifier(first_token_state),
            self.sentiment_classifier(first_token_state),
        )

In [6]:
# --- 2. Create a Custom PyTorch Dataset ---
class NLU_Dataset(Dataset):
    """Map-style dataset pairing each text with an intent and a sentiment label.

    Tokenization happens lazily in ``__getitem__``; every item is padded or
    truncated to ``max_len`` tokens.
    """

    def __init__(self, texts, intent_labels, sentiment_labels, tokenizer, max_len=128):
        """Store the parallel sequences and the tokenizer.

        Args:
            texts: Indexable collection of input texts.
            intent_labels: Indexable collection of integer intent ids,
                aligned with ``texts``.
            sentiment_labels: Indexable collection of integer sentiment ids,
                aligned with ``texts``.
            tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer).
            max_len: Fixed token length every example is padded/truncated to.
        """
        self.texts = texts
        self.intent_labels = intent_labels
        self.sentiment_labels = sentiment_labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return it as a dict of tensors.

        Returns:
            dict with keys ``text`` (str), ``input_ids`` and
            ``attention_mask`` (1-D tensors after flattening), and
            ``intent_label`` / ``sentiment_label`` (scalar long tensors).
        """
        text = str(self.texts[idx])

        # Call the tokenizer directly: ``encode_plus`` is deprecated in favor
        # of ``__call__``, which accepts the same arguments for a single text.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        return {
            'text': text,
            # return_tensors='pt' adds a leading batch axis; flatten removes it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'intent_label': torch.tensor(self.intent_labels[idx], dtype=torch.long),
            'sentiment_label': torch.tensor(self.sentiment_labels[idx], dtype=torch.long)
        }

In [7]:
# --- 3. Helper Functions ---
def load_data(csv_path_list):
    """Load, combine and label-encode the training CSVs.

    Each path is read with ``pd.read_csv``. Files that are missing or fail to
    load are reported and skipped. Rows lacking a text, intent or sentiment
    value are dropped so that a missing label can never become its own class.

    Args:
        csv_path_list: Iterable of CSV paths. The combined data must provide
            ``text``, ``intent`` and ``sentiment`` columns.

    Returns:
        ``(df, label_info, num_intents, num_sentiments)`` where ``df`` has the
        extra integer columns ``intent_label`` and ``sentiment_label`` and
        ``label_info`` holds the two label-to-id dictionaries. Returns
        ``(None, None, None, None)`` when no file could be loaded.

    Raises:
        KeyError: If the combined data lacks a required column.
    """
    required_columns = ['text', 'intent', 'sentiment']

    df_list = []
    print("Loading multiple data files...")
    for path in csv_path_list:
        try:
            df_part = pd.read_csv(path)
        except FileNotFoundError:
            print(f"Warning: Training data file '{path}' not found. Skipping.")
        except Exception as e:
            # Best effort: one unreadable file should not abort the others.
            print(f"Error loading '{path}': {e}. Skipping.")
        else:
            df_list.append(df_part)
            print(f"Successfully loaded {len(df_part)} samples from '{path}'")

    if not df_list:
        print("Error: No training data could be loaded. Exiting.")
        return None, None, None, None

    # Combine all dataframes into one
    df = pd.concat(df_list, ignore_index=True)

    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Training data is missing required column(s): {missing_columns}")

    # Drop rows missing the text OR either label; a NaN label would otherwise
    # be enumerated below as an extra class.
    rows_before = len(df)
    df = df.dropna(subset=required_columns)
    dropped_rows = rows_before - len(df)
    if dropped_rows:
        print(f"Dropped {dropped_rows} row(s) with missing text, intent or sentiment.")

    print(f"\nTotal combined training samples: {len(df)}")

    # Label ids follow order of first appearance in the combined data.
    intent_labels = {label: i for i, label in enumerate(df['intent'].unique())}
    sentiment_labels = {label: i for i, label in enumerate(df['sentiment'].unique())}

    # Returned to the caller so the mappings can be saved with the model.
    label_info = {
        'intent_labels': intent_labels,
        'sentiment_labels': sentiment_labels
    }

    # assign() builds a new frame instead of writing into the dropna() result.
    df = df.assign(
        intent_label=df['intent'].map(intent_labels),
        sentiment_label=df['sentiment'].map(sentiment_labels),
    )

    return df, label_info, len(intent_labels), len(sentiment_labels)

def compute_metrics(preds, labels):
    """Score predictions against the true labels.

    Args:
        preds: Predicted class ids.
        labels: True class ids, aligned with ``preds``.

    Returns:
        dict with ``accuracy`` and ``f1_weighted`` (weighted-average F1).
    """
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1_weighted': f1_score(labels, preds, average='weighted'),
    }

In [8]:
# --- 4. Main Training Function ---
def train():
    """Fine-tune the multi-task model and save the best-performing weights.

    Steps:
      1. Load the CSVs listed in ``TRAINING_FILES`` and split them 80/20,
         stratified on intent.
      2. Build the tokenizer, model, datasets and data loaders.
      3. Train for up to ``NUM_EPOCHS`` epochs, validating after each one and
         stopping early once validation loss fails to improve for ``patience``
         consecutive epochs.
      4. Restore the weights from the epoch with the lowest validation loss
         and save the model, tokenizer and label mappings to
         ``MODEL_SAVE_PATH``.

    Returns:
        None. Returns early, saving nothing, when no training data loads.
    """
    # Function-scope import, as in the original cell.
    from transformers import get_linear_schedule_with_warmup

    # Seed torch so classifier-head initialisation and DataLoader shuffling
    # start from the same random state on every run; the same value seeds the
    # train/validation split.
    seed = 42
    torch.manual_seed(seed)

    print("Step 1: Loading and preprocessing combined data...")
    df, label_info, num_intent_labels, num_sentiment_labels = load_data(TRAINING_FILES)
    if df is None:
        return

    # Split the *combined* data into training and validation sets
    df_train, df_val = train_test_split(df, test_size=0.2, random_state=seed, stratify=df['intent'])

    print(f"Total training samples: {len(df_train)}, Total validation samples: {len(df_val)}")
    print(f"Found {num_intent_labels} total intents and {num_sentiment_labels} total sentiments.")
    print("Intents:", list(label_info['intent_labels'].keys()))

    print("\nStep 2: Initializing tokenizer and model...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Load our custom model
    model = MultiTaskDistilBert.from_pretrained(
        MODEL_NAME,
        num_intent_labels=num_intent_labels,
        num_sentiment_labels=num_sentiment_labels
    )

    # Set up datasets and dataloaders
    train_dataset = NLU_Dataset(
        texts=df_train.text.to_numpy(),
        intent_labels=df_train.intent_label.to_numpy(),
        sentiment_labels=df_train.sentiment_label.to_numpy(),
        tokenizer=tokenizer
    )
    val_dataset = NLU_Dataset(
        texts=df_val.text.to_numpy(),
        intent_labels=df_val.intent_label.to_numpy(),
        sentiment_labels=df_val.sentiment_label.to_numpy(),
        tokenizer=tokenizer
    )

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Set up optimizer and loss functions
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

    # Linear schedule with warmup, stepped once per batch.
    total_steps = len(train_loader) * NUM_EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=WARMUP_STEPS,
        num_training_steps=total_steps
    ) if USE_SCHEDULER else None

    intent_loss_fn = nn.CrossEntropyLoss().to(device)
    sentiment_loss_fn = nn.CrossEntropyLoss().to(device)

    print(f"\nStep 3: Starting training for {NUM_EPOCHS} epochs on {device}...")
    print(f"Batch size: {BATCH_SIZE}, Learning rate: {LEARNING_RATE}")
    print(f"Total training steps: {total_steps}")
    if scheduler is not None:
        print(f"Using warmup steps: {WARMUP_STEPS}")

    # Early-stopping state. best_state keeps a CPU copy of the weights from
    # the best epoch so the saved model matches the reported best loss.
    best_val_loss = float('inf')
    best_state = None
    patience = 2
    patience_counter = 0

    for epoch in range(NUM_EPOCHS):
        # --- Training ---
        model.train()
        total_train_loss = 0

        for batch_idx, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            intent_labels = batch['intent_label'].to(device)
            sentiment_labels = batch['sentiment_label'].to(device)

            optimizer.zero_grad()
            intent_logits, sentiment_logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # The two task losses are summed with equal weight.
            loss_intent = intent_loss_fn(intent_logits, intent_labels)
            loss_sentiment = sentiment_loss_fn(sentiment_logits, sentiment_labels)
            total_loss = loss_intent + loss_sentiment

            total_train_loss += total_loss.item()
            total_loss.backward()
            optimizer.step()

            # Update learning rate scheduler
            if scheduler is not None:
                scheduler.step()

            # Progress logging every 1000 batches
            if (batch_idx + 1) % 1000 == 0:
                current_lr = scheduler.get_last_lr()[0] if scheduler is not None else LEARNING_RATE
                print(f"  Batch {batch_idx + 1}/{len(train_loader)}, Loss: {total_loss.item():.4f}, LR: {current_lr:.2e}")

        avg_train_loss = total_train_loss / len(train_loader)

        # --- Validation ---
        model.eval()
        total_val_loss = 0
        all_intent_preds, all_intent_labels = [], []
        all_sentiment_preds, all_sentiment_labels = [], []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                intent_labels = batch['intent_label'].to(device)
                sentiment_labels = batch['sentiment_label'].to(device)

                intent_logits, sentiment_logits = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )

                loss_intent = intent_loss_fn(intent_logits, intent_labels)
                loss_sentiment = sentiment_loss_fn(sentiment_logits, sentiment_labels)
                total_loss = loss_intent + loss_sentiment
                total_val_loss += total_loss.item()

                intent_preds = torch.argmax(intent_logits, dim=1).cpu().numpy()
                sentiment_preds = torch.argmax(sentiment_logits, dim=1).cpu().numpy()

                all_intent_preds.extend(intent_preds)
                all_intent_labels.extend(intent_labels.cpu().numpy())
                all_sentiment_preds.extend(sentiment_preds)
                all_sentiment_labels.extend(sentiment_labels.cpu().numpy())

        avg_val_loss = total_val_loss / len(val_loader)

        intent_metrics = compute_metrics(all_intent_preds, all_intent_labels)
        sentiment_metrics = compute_metrics(all_sentiment_preds, all_sentiment_labels)

        print(f"\n--- Epoch {epoch + 1}/{NUM_EPOCHS} ---")
        print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
        print(f"Intent Metrics:     {intent_metrics}")
        print(f"Sentiment Metrics:  {sentiment_metrics}")

        # Early stopping check
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            # Snapshot on CPU so the copy does not consume GPU memory.
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
            print(f"✓ New best validation loss: {best_val_loss:.4f}")
        else:
            patience_counter += 1
            print(f"⚠ No improvement for {patience_counter} epoch(s)")
            if patience_counter >= patience:
                print(f"Early stopping triggered after {epoch + 1} epochs")
                break

    print("\nStep 4: Training complete. Saving model...")

    # Without this, the weights on disk would be those of the last epoch run,
    # which may be worse than the best epoch reported below.
    if best_state is not None:
        model.load_state_dict(best_state)
        print("Restored model weights from the epoch with the best validation loss.")

    os.makedirs(MODEL_SAVE_PATH, exist_ok=True)
    model.save_pretrained(MODEL_SAVE_PATH)
    tokenizer.save_pretrained(MODEL_SAVE_PATH)

    with open(os.path.join(MODEL_SAVE_PATH, 'label_info.json'), 'w') as f:
        json.dump(label_info, f, indent=4)

    print(f"Model, tokenizer, and label info saved to '{MODEL_SAVE_PATH}'")
    print(f"Best validation loss achieved: {best_val_loss:.4f}")
    print("Your NLU model is now trained on the combined dataset!")

In [9]:
train()

Step 1: Loading and preprocessing combined data...
Loading multiple data files...
Successfully loaded 100000 samples from 'nlu_training_data.csv'
Successfully loaded 1995 samples from 'nlu_training_data_2.csv'
Successfully loaded 450 samples from 'nlu_training_data_3.csv'

Total combined training samples: 102445
Total training samples: 81956, Total validation samples: 20489
Found 10 total intents and 3 total sentiments.
Intents: ['provide_feedback_on_service', 'report_order_content_issue', 'generic_unspecified_feedback', 'comment_on_platform_experience', 'comment_on_product_quality', 'manage_order', 'track_order', 'report_delivery_delay', 'request_refund', 'other']

Step 2: Initializing tokenizer and model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of MultiTaskDistilBert were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['intent_classifier.bias', 'intent_classifier.weight', 'sentiment_classifier.bias', 'sentiment_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Step 3: Starting training for 4 epochs on cuda...
Batch size: 16, Learning rate: 2e-05
Total training steps: 20492
Using warmup steps: 1000
  Batch 1000/5123, Loss: 0.0094, LR: 2.00e-05
  Batch 2000/5123, Loss: 0.0063, LR: 1.90e-05
  Batch 3000/5123, Loss: 0.0026, LR: 1.79e-05
  Batch 4000/5123, Loss: 0.0018, LR: 1.69e-05
  Batch 5000/5123, Loss: 0.0018, LR: 1.59e-05

--- Epoch 1/4 ---
Train Loss: 0.1167 | Val Loss: 0.0075
Intent Metrics:     {'accuracy': 0.9990238664649324, 'f1_weighted': 0.9988389847601868}
Sentiment Metrics:  {'accuracy': 0.9994631265557128, 'f1_weighted': 0.999463520141371}
✓ New best validation loss: 0.0075
  Batch 1000/5123, Loss: 0.0009, LR: 1.47e-05
  Batch 2000/5123, Loss: 0.0006, LR: 1.37e-05
  Batch 3000/5123, Loss: 0.0005, LR: 1.27e-05
  Batch 4000/5123, Loss: 0.0004, LR: 1.17e-05
  Batch 5000/5123, Loss: 0.0003, LR: 1.06e-05

--- Epoch 2/4 ---
Train Loss: 0.0044 | Val Loss: 0.0023
Intent Metrics:     {'accuracy': 0.9995607399092196, 'f1_weighted': 0.99956