In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install transformers
!pip install scikit-learn pandas numpy

In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, DistilBertPreTrainedModel, DistilBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from collections import defaultdict
import json
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Configuration ---
# Checkpoint identifier; train() passes it to both AutoTokenizer.from_pretrained
# and MultiTaskDistilBert.from_pretrained.
MODEL_NAME = 'distilbert-base-uncased'
# CSV file read by load_data(); the notebook accesses its 'text', 'intent' and
# 'sentiment' columns.
TRAIN_DATA_CSV = '../data/nlu_training_data.csv'
# Directory where train() writes the model, the tokenizer and label_info.json.
MODEL_SAVE_PATH = '../model/nlu_model'
# Number of full passes over the training split.
NUM_EPOCHS = 4
# Batch size used for both the training and the validation DataLoader.
BATCH_SIZE = 16
# Learning rate handed to torch.optim.AdamW.
LEARNING_RATE = 5e-5

In [3]:
# --- 1. Define the Custom Multi-Task Model ---
# This is the core of the DL model
# It has a shared DistilBERT base and two separate output layers.
class MultiTaskDistilBert(DistilBertPreTrainedModel):
    """DistilBERT encoder shared by two linear heads: one for intent, one for sentiment.

    Args:
        config: DistilBERT configuration; ``config.dim`` sets the input width of both heads.
        num_intent_labels: Number of output classes of the intent head.
        num_sentiment_labels: Number of output classes of the sentiment head.
    """

    def __init__(self, config, num_intent_labels, num_sentiment_labels):
        super().__init__(config)

        # Shared encoder whose output feeds both task heads.
        self.distilbert = DistilBertModel(config)

        # One linear projection per task (intent first, then sentiment).
        hidden_size = config.dim
        self.intent_classifier = nn.Linear(hidden_size, num_intent_labels)
        self.sentiment_classifier = nn.Linear(hidden_size, num_sentiment_labels)

        # Initialize weights
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,  # Accepted but unused: the loss is computed by the training loop
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Encode the inputs and return ``(intent_logits, sentiment_logits)``.

        The result is always a plain 2-tuple of logits, regardless of ``return_dict``;
        that flag is only forwarded to the underlying encoder.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_out = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 0 of the encoder output is the last hidden state; the vector at
        # sequence position 0 (the [CLS] token) is used as the sentence representation.
        last_hidden_state = encoder_out[0]
        cls_vector = last_hidden_state[:, 0]  # [batch_size, hidden_dim]

        # Both heads consume the same sentence vector.
        return (
            self.intent_classifier(cls_vector),
            self.sentiment_classifier(cls_vector),
        )

In [4]:
# --- 2. Create a Custom PyTorch Dataset ---
class NLU_Dataset(Dataset):
    """Map-style dataset yielding a tokenized text with its intent and sentiment labels.

    Args:
        texts: Indexable sequence of raw texts (each item is passed through ``str``).
        intent_labels: Indexable sequence of integer intent ids, aligned with ``texts``.
        sentiment_labels: Indexable sequence of integer sentiment ids, aligned with ``texts``.
        tokenizer: Callable tokenizer returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length every sequence is padded or truncated to.

    Raises:
        ValueError: If the three input sequences do not have the same length.
    """

    def __init__(self, texts, intent_labels, sentiment_labels, tokenizer, max_len=128):
        # Fail early: misaligned inputs would otherwise pair texts with the wrong
        # labels or only surface later as an IndexError inside a DataLoader worker.
        if not (len(texts) == len(intent_labels) == len(sentiment_labels)):
            raise ValueError(
                "texts, intent_labels and sentiment_labels must have the same length "
                f"(got {len(texts)}, {len(intent_labels)}, {len(sentiment_labels)})."
            )
        self.texts = texts
        self.intent_labels = intent_labels
        self.sentiment_labels = sentiment_labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with the raw text, token ids, attention mask and both labels."""
        text = str(self.texts[idx])

        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which accepts the same arguments for a single text.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # return_tensors='pt' adds a leading batch axis; flatten it away so the
            # DataLoader can stack samples into a batch itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'intent_label': torch.tensor(self.intent_labels[idx], dtype=torch.long),
            'sentiment_label': torch.tensor(self.sentiment_labels[idx], dtype=torch.long)
        }

In [5]:
# --- 3. Helper Functions ---
def load_data(csv_path):
    """Load the training CSV, drop incomplete rows and encode labels as integer ids.

    Rows with a missing 'text', 'intent' or 'sentiment' value are removed, because a
    missing label would otherwise become a class of its own and a missing text would
    be fed to the tokenizer as the literal string 'nan'. Label ids are assigned in
    sorted order so the mapping depends on the label set, not on CSV row order.

    Args:
        csv_path: Path to a CSV file with 'text', 'intent' and 'sentiment' columns.

    Returns:
        A tuple ``(df, label_info, num_intent_labels, num_sentiment_labels)`` where
        ``df`` has the added integer columns 'intent_label' and 'sentiment_label' and
        ``label_info`` holds the two label-to-id dictionaries. If the file does not
        exist, a message is printed and ``(None, None, None, None)`` is returned.

    Raises:
        KeyError: If one of the required columns is absent from the CSV.
    """
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"Error: Training data file '{csv_path}' not found.")
        print("Please run 'label_data.py' first to create it.")
        return None, None, None, None

    # Remove rows that cannot be used for training.
    rows_before = len(df)
    df = df.dropna(subset=['text', 'intent', 'sentiment']).reset_index(drop=True)
    rows_dropped = rows_before - len(df)
    if rows_dropped:
        print(f"Warning: dropped {rows_dropped} row(s) with missing text, intent or sentiment.")

    # Create mappings for our labels to convert them to numbers.
    # Sorting (by string form, so mixed-type labels cannot break the sort) makes the
    # ids stable across runs even if the CSV rows are reordered.
    intent_labels = {
        label: i for i, label in enumerate(sorted(df['intent'].unique(), key=str))
    }
    sentiment_labels = {
        label: i for i, label in enumerate(sorted(df['sentiment'].unique(), key=str))
    }

    # Bundle the mappings so the caller can persist them next to the model.
    label_info = {
        'intent_labels': intent_labels,
        'sentiment_labels': sentiment_labels
    }

    # Apply the mappings to the dataframe; every remaining label has an id, so the
    # result can safely be stored as integers.
    df['intent_label'] = df['intent'].map(intent_labels).astype(int)
    df['sentiment_label'] = df['sentiment'].map(sentiment_labels).astype(int)

    return df, label_info, len(intent_labels), len(sentiment_labels)

def compute_metrics(preds, labels):
    """Return accuracy and weighted-average F1 of ``preds`` against ``labels``.

    The result is a dict with the keys 'accuracy' and 'f1_weighted'.
    """
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1_weighted': f1_score(labels, preds, average='weighted'),
    }

In [6]:
# --- 4. Main Training Function ---
def _build_dataset(frame, tokenizer):
    """Wrap one dataframe split in an NLU_Dataset."""
    return NLU_Dataset(
        texts=frame['text'].to_numpy(),
        intent_labels=frame['intent_label'].to_numpy(),
        sentiment_labels=frame['sentiment_label'].to_numpy(),
        tokenizer=tokenizer
    )


def _forward_with_loss(model, batch, device, intent_loss_fn, sentiment_loss_fn):
    """Run one forward pass; return the summed loss plus (logits, labels) per task."""
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    intent_labels = batch['intent_label'].to(device)
    sentiment_labels = batch['sentiment_label'].to(device)

    intent_logits, sentiment_logits = model(
        input_ids=input_ids,
        attention_mask=attention_mask
    )

    # Both tasks contribute equally; weigh the terms here if one matters more.
    loss = (
        intent_loss_fn(intent_logits, intent_labels)
        + sentiment_loss_fn(sentiment_logits, sentiment_labels)
    )
    return loss, (intent_logits, intent_labels), (sentiment_logits, sentiment_labels)


def _train_one_epoch(model, loader, optimizer, device, intent_loss_fn, sentiment_loss_fn):
    """Update the model on every batch of ``loader``; return the mean batch loss."""
    model.train()
    running_loss = 0.0
    for batch in loader:
        optimizer.zero_grad()
        loss, _, _ = _forward_with_loss(
            model, batch, device, intent_loss_fn, sentiment_loss_fn
        )
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(loader)


def _evaluate(model, loader, device, intent_loss_fn, sentiment_loss_fn):
    """Score the model on ``loader``; return (mean loss, intent metrics, sentiment metrics)."""
    model.eval()
    running_loss = 0.0
    intent_preds, intent_targets = [], []
    sentiment_preds, sentiment_targets = [], []

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in loader:
            loss, (intent_logits, intent_labels), (sentiment_logits, sentiment_labels) = (
                _forward_with_loss(model, batch, device, intent_loss_fn, sentiment_loss_fn)
            )
            running_loss += loss.item()

            # The predicted class is the index of the largest logit.
            intent_preds.extend(torch.argmax(intent_logits, dim=1).cpu().numpy())
            intent_targets.extend(intent_labels.cpu().numpy())
            sentiment_preds.extend(torch.argmax(sentiment_logits, dim=1).cpu().numpy())
            sentiment_targets.extend(sentiment_labels.cpu().numpy())

    return (
        running_loss / len(loader),
        compute_metrics(intent_preds, intent_targets),
        compute_metrics(sentiment_preds, sentiment_targets),
    )


def train():
    """Fine-tune the multi-task DistilBERT model and save it to MODEL_SAVE_PATH.

    Steps: load and label-encode the CSV, make a stratified 80/20 train/validation
    split, train for NUM_EPOCHS epochs while printing validation metrics after each
    epoch, then save the model, the tokenizer and the label mappings.

    Returns early (None) when the training CSV cannot be loaded.

    Raises:
        RuntimeError: If no CUDA device is available.
    """
    # Seed torch so that the random initialisation of the classifier heads and the
    # shuffling of the training DataLoader are reproducible between runs. The same
    # value is used for the train/validation split.
    seed = 42
    torch.manual_seed(seed)

    print("Step 1: Loading and preprocessing data...")
    df, label_info, num_intent_labels, num_sentiment_labels = load_data(TRAIN_DATA_CSV)
    if df is None:
        return

    # Split the data into training and validation sets, keeping the intent
    # distribution the same in both.
    df_train, df_val = train_test_split(
        df, test_size=0.2, random_state=seed, stratify=df['intent']
    )

    print(f"Training samples: {len(df_train)}, Validation samples: {len(df_val)}")
    print(f"Found {num_intent_labels} intents and {num_sentiment_labels} sentiments.")

    print("\nStep 2: Initializing tokenizer and model...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Load our custom model: the encoder comes from the checkpoint, the two heads
    # are newly initialised.
    model = MultiTaskDistilBert.from_pretrained(
        MODEL_NAME,
        num_intent_labels=num_intent_labels,
        num_sentiment_labels=num_sentiment_labels
    )

    # Set up datasets and dataloaders (only the training data is shuffled).
    train_loader = DataLoader(
        _build_dataset(df_train, tokenizer), batch_size=BATCH_SIZE, shuffle=True
    )
    val_loader = DataLoader(_build_dataset(df_val, tokenizer), batch_size=BATCH_SIZE)

    # Force GPU usage - raise error if not available
    if not torch.cuda.is_available():
        raise RuntimeError("GPU (CUDA) is not available. This training requires a GPU.")

    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

    # We use CrossEntropyLoss for both classification tasks
    intent_loss_fn = nn.CrossEntropyLoss().to(device)
    sentiment_loss_fn = nn.CrossEntropyLoss().to(device)

    print(f"\nStep 3: Starting training for {NUM_EPOCHS} epochs on {device}...")

    for epoch in range(NUM_EPOCHS):
        avg_train_loss = _train_one_epoch(
            model, train_loader, optimizer, device, intent_loss_fn, sentiment_loss_fn
        )
        avg_val_loss, intent_metrics, sentiment_metrics = _evaluate(
            model, val_loader, device, intent_loss_fn, sentiment_loss_fn
        )

        print(f"\n--- Epoch {epoch + 1}/{NUM_EPOCHS} ---")
        print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
        print(f"Intent Metrics:     {intent_metrics}")
        print(f"Sentiment Metrics:  {sentiment_metrics}")

    print("\nStep 4: Training complete. Saving model...")

    # Ensure save directory exists
    os.makedirs(MODEL_SAVE_PATH, exist_ok=True)

    # Save the model
    model.save_pretrained(MODEL_SAVE_PATH)
    # Save the tokenizer
    tokenizer.save_pretrained(MODEL_SAVE_PATH)
    # Save the label info so predictions can be mapped back to label names
    with open(os.path.join(MODEL_SAVE_PATH, 'label_info.json'), 'w') as f:
        json.dump(label_info, f, indent=4)

    print(f"Model, tokenizer, and label info saved to '{MODEL_SAVE_PATH}'")

In [None]:
# Run the full pipeline: load data, fine-tune, evaluate each epoch and save.
# train() raises RuntimeError when no CUDA device is available.
train()

Step 1: Loading and preprocessing data...
Training samples: 80000, Validation samples: 20000
Found 5 intents and 3 sentiments.

Step 2: Initializing tokenizer and model...
Training samples: 80000, Validation samples: 20000
Found 5 intents and 3 sentiments.

Step 2: Initializing tokenizer and model...


Some weights of MultiTaskDistilBert were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['intent_classifier.bias', 'intent_classifier.weight', 'sentiment_classifier.bias', 'sentiment_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using GPU: NVIDIA GeForce RTX 3050 Laptop GPU
GPU Memory: 4.00 GB

Step 3: Starting training for 4 epochs on cuda...

Step 3: Starting training for 4 epochs on cuda...
