In [40]:
# --- Common Utilities and Setup ---
import os
import json
import torch
import transformers
import accelerate
import huggingface_hub
import peft
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.preprocessing import LabelEncoder
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support, classification_report, accuracy_score
from transformers import T5Tokenizer, T5ForConditionalGeneration, get_linear_schedule_with_warmup
from sklearn.utils import resample
from collections import Counter
import time
import onnxruntime as ort
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List
import pickle

print("peft:", peft.__version__)
print("Torch:", torch.__version__)
print("Transformers:", transformers.__version__)
print("Accelerate:", accelerate.__version__)
print("Huggingface Hub:", huggingface_hub.__version__)

# Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

peft: 0.14.0
Torch: 2.2.2
Transformers: 4.49.0
Accelerate: 1.4.0
Huggingface Hub: 0.29.1


In [41]:
# Device Selection Function
def get_device():
    if torch.backends.mps.is_available():
        device = torch.device("mps")
        print("Using Apple MPS GPU")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using NVIDIA CUDA GPU")
    else:
        device = torch.device("cpu")
        print("Using CPU")
    return device

In [42]:
def update_model_dict(model_alias, MODEL_NAME):
    if not os.path.exists('model_dict.json'):
        model_dict = {}
    else:
        with open('model_dict.json', 'r') as file:
            model_dict = json.load(file)

    model_dict[model_alias] = MODEL_NAME

    with open('model_dict.json', 'w') as file:
        json.dump(model_dict, file)

In [43]:
def load_and_preprocess_data(filepath="./data/train-00000-of-00001-a5a7c6e4bb30b016.parquet"):
    """Loads and preprocesses the dataset."""
    df = pd.read_parquet(filepath)
    df = df[['conversation', 'issue_area']]
    print("Original distribution:\n", df['issue_area'].value_counts())
    label_encoder = LabelEncoder()
    df["labels"] = label_encoder.fit_transform(df["issue_area"])

    #saving Label-encoder
    label_encoder_path = f"model-metric/{model_alias}/label_encoder.pkl"
    os.makedirs(os.path.dirname(label_encoder_path), exist_ok=True)
    with open(label_encoder_path, "wb") as f:
        pickle.dump(label_encoder, f)
        
    return df, label_encoder

In [44]:
def balance_dataset(df, max_count=100, random_state=42):
    """Balances the dataset using oversampling."""
    balanced_df = pd.DataFrame()
    for issue in df['issue_area'].unique():
        subset = df[df['issue_area'] == issue]
        balanced_subset = resample(subset, replace=True, n_samples=max_count, random_state=random_state)
        balanced_df = pd.concat([balanced_df, balanced_subset])
    return balanced_df.sample(frac=1, random_state=random_state).reset_index(drop=True)


In [45]:
def preprocess_conversation(conversation):
    """Preprocesses a conversation."""
    if isinstance(conversation, list):
        return " ".join([turn.get('text', '') for turn in conversation if isinstance(turn, dict)])
    return str(conversation) #.lower()

In [46]:
# Define PyTorch Dataset for T5
class T5Dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512, prefix="classify: "):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.prefix = prefix
        self.label_names = dataframe['issue_area'].unique()
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        row = self.data.iloc[index]
        
        # Prepare input with prefix
        text = self.prefix + str(row["conversation"])
        
        # For T5, we need to prepare decoder input
        target_text = str(row["issue_area"])  # Use actual label text instead of numeric index
        
        # Tokenize inputs
        inputs = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        
        # Tokenize targets - using text_target parameter instead of as_target_tokenizer
        labels = self.tokenizer(
            text_target=target_text,
            truncation=True,
            padding="max_length",
            max_length=64,  # Shorter for labels
            return_tensors="pt"
        )
        
        input_ids = inputs["input_ids"].squeeze(0)
        attention_mask = inputs["attention_mask"].squeeze(0)
        labels = labels["input_ids"].squeeze(0)
        
        # Replace tokenizer padding token id with -100 so it's ignored in loss calculation
        labels[labels == self.tokenizer.pad_token_id] = -100
        
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

In [47]:
def create_dataloaders(df, tokenizer, batch_size=8, train_ratio=0.75):
    """Creates train and test DataLoaders."""
    train_size = int(train_ratio * len(df))
    train_df, test_df = df[:train_size], df[train_size:]
    
    train_dataset = T5Dataset(train_df, tokenizer)
    test_dataset = T5Dataset(test_df, tokenizer)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    return train_loader, test_loader, test_df

In [79]:
class T5SmallWithLoRA(nn.Module):
    def __init__(self, lora_r=8, lora_alpha=16, lora_dropout=0.1):
        super(T5SmallWithLoRA, self).__init__()
        # Load the base T5 model
        self.t5 = T5ForConditionalGeneration.from_pretrained(
            "google-t5/t5-small"
        )
        
        # LoRA Configuration
        lora_config = LoraConfig(
            task_type=TaskType.SEQ_2_SEQ_LM,  # For T5
            r=lora_r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            target_modules=["q", "v"]  # T5 attention modules to apply LoRA
        )
        self.t5 = get_peft_model(self.t5, lora_config)
    
    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.t5(
            input_ids=input_ids, 
            attention_mask=attention_mask, 
            labels=labels
        )
        # For classification tasks, you might need to process the output logits further
        # but for training with T5, the loss is calculated correctly out of the box
        return outputs
        
    # Add a prediction method for better control over generation
    def predict(self, input_ids, attention_mask, tokenizer, max_length=32):
        """Generate text predictions with more control"""
        generated_ids = self.t5.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_beams=4,  # Using more beams for better quality
            length_penalty=0.6,  # Prefer shorter sequences
            early_stopping=True
        )
        
        # Decode the generated IDs
        predictions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        return predictions
    def print_trainable_parameters(self):
                """
                Prints the number of trainable parameters in the model.
                """
                trainable_params = 0
                all_param = 0
                for _, param in self.named_parameters():
                    all_param += param.numel()
                    if param.requires_grad:
                        trainable_params += param.numel()
                print(
                    f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}%"
                )        

In [129]:
# from torchinfo import summary

# # Load models
# tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
# model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

# # Create proper input format
# # T5 expects input_ids and attention_mask
# batch_size = 1
# seq_length = 512

# # Both model1 and model2 should be identical since they're loading the same model
# # just using different import functions

# # Create dummy input data within valid token range
# input_ids = torch.randint(0, tokenizer.vocab_size, (batch_size, seq_length))
# attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long)

# # For T5, we need decoder_input_ids for the complete picture
# # Usually the first token is the pad token or bos token
# decoder_input_ids = torch.ones(batch_size, 1, dtype=torch.long) * model.config.decoder_start_token_id

# # Proper summary call with appropriate input shapes
# summary(
#     model,
#     input_data=[input_ids, attention_mask, decoder_input_ids],
#     depth=4,  # Reduce depth for cleaner output
#     col_names=["input_size", "output_size", "num_params", "trainable"],
# )

In [130]:
# model2

In [52]:
# Function to compute class weights
def compute_class_weights(labels, num_classes):
    counter = Counter(labels)
    total_samples = len(labels)
    weights = [total_samples / (num_classes * counter[i]) for i in range(num_classes)]
    return torch.tensor(weights, dtype=torch.float)

In [65]:
import os
import time
import torch
import numpy as np
import warnings
from collections import Counter
from torch.cuda.amp import autocast, GradScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def train_model(model, train_loader, model_alias, epochs=3, learning_rate=5e-5, tokenizer=None):
    """Trains the model and saves logs, metrics, and model weights with mixed precision training."""
    
    # Suppress warnings
    warnings.filterwarnings("ignore")
    
    # Device setup with proper fallback
    if torch.backends.mps.is_available():
        device = torch.device("mps")
        print("Using Apple MPS GPU")
        os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'  # Prevent OOM errors
    elif torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using NVIDIA CUDA GPU")
    else:
        device = torch.device("cpu")
        print("Using CPU")
    
    model.to(device)
    
    # Create directory for storing model metrics
    model_dir = f"model-metric/{model_alias}"
    os.makedirs(model_dir, exist_ok=True)
    
    # TensorBoard writer in the model directory
    from torch.utils.tensorboard import SummaryWriter
    writer = SummaryWriter(log_dir=model_dir)
    
    # Training hyperparameters
    batch_size = 1  # Adjusted for memory constraints
    gradient_accumulation_steps = 4
    max_norm = 1.0  # Gradient clipping
    
    # Override forward method to save memory
    def forward_no_hidden_states(self, input_ids, attention_mask, labels):
        outputs = self.t5(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            output_hidden_states=False  # Disable hidden states to save memory
        )
        return outputs.logits, outputs
    
    # Replace the original forward with memory-optimized version
    original_forward = model.forward
    model.forward = forward_no_hidden_states.__get__(model, type(model))
    
    # Set up optimizer with weight decay
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    
    # Mixed precision setup
    scaler = GradScaler()
    
    # Make sure all parameters require gradients
    for param in model.parameters():
        param.requires_grad = True
    
    # Metrics tracking function
    def calculate_metrics(logits, labels):
        """Calculate accuracy, precision, recall, F1, and perplexity."""
        preds = torch.argmax(logits, dim=-1).detach().cpu().numpy()
        labels = labels.detach().cpu().numpy()
        
        # Filter out padding tokens (-100)
        valid_indices = labels != -100
        if np.any(valid_indices):
            labels = labels[valid_indices]
            preds = preds[valid_indices]
            
            if len(labels) > 0:
                accuracy = accuracy_score(labels, preds)
                precision = precision_score(labels, preds, average='weighted', zero_division=0)
                recall = recall_score(labels, preds, average='weighted', zero_division=0)
                f1 = f1_score(labels, preds, average='weighted', zero_division=0)
                
                # Calculate perplexity (avoiding divide by zero)
                logits_valid = logits[valid_indices].detach().cpu()
                logits_softmax = torch.softmax(logits_valid, dim=-1).numpy()
                entropy = -np.sum(logits_softmax * np.log(logits_softmax + 1e-12), axis=1)
                perplexity = np.exp(entropy.mean())
                
                return accuracy, precision, recall, f1, perplexity
        
        # Return default values if no valid predictions
        return 0.0, 0.0, 0.0, 0.0, 0.0
    
    epoch_losses = []
    metrics_data = []
    logits_store = []
    
    for epoch in range(epochs):
        start_time = time.time()
        model.train()
        running_loss = 0.0
        
        # For collecting data for metrics
        total_accuracy = 0.0
        total_precision = 0.0
        total_recall = 0.0
        total_f1 = 0.0
        total_perplexity = 0.0
        batch_latencies = []
        
        print(f"\nEpoch {epoch + 1}/{epochs}")
        
        for batch_idx, batch in enumerate(train_loader):
            batch_start_time = time.time()
            
            # Make sure data is on the correct device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            
            # Mixed precision with autocast
            with autocast():
                logits, outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                
                loss = outputs.loss / gradient_accumulation_steps
            
            # Backward pass with mixed precision
            scaler.scale(loss).backward()
            
            # Gradient accumulation to simulate larger batch size
            if (batch_idx + 1) % gradient_accumulation_steps == 0:
                # Gradient clipping for stability
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
            
            # Calculate metrics for this batch
            if logits.size(0) > 0:  # Ensure we have valid predictions
                reshaped_logits = logits.view(-1, logits.size(-1))
                reshaped_labels = labels.view(-1)
                accuracy, precision, recall, f1, perplexity = calculate_metrics(reshaped_logits, reshaped_labels)
                
                # Aggregate metrics
                total_accuracy += accuracy
                total_precision += precision
                total_recall += recall
                total_f1 += f1
                total_perplexity += perplexity
            
            # Store sample logits for analysis
            if batch_idx % 10 == 0:
                sample_logits = logits[:5].detach().cpu().numpy() if logits.size(0) >= 5 else logits.detach().cpu().numpy()
                logits_store.append(sample_logits)
            
            # Clear cache for memory efficiency
            if device.type == 'mps':
                torch.mps.empty_cache()
                torch.mps.synchronize()
            elif device.type == 'cuda':
                torch.cuda.empty_cache()
            
            # Capture latency
            batch_latency = time.time() - batch_start_time
            batch_latencies.append(batch_latency)
            running_loss += loss.item() * gradient_accumulation_steps
            
            # Log batch progress
            if batch_idx % 20 == 0:
                writer.add_scalar("BatchLoss/train", loss.item(), epoch * len(train_loader) + batch_idx)
                writer.add_scalar("BatchPerplexity/train", perplexity, epoch * len(train_loader) + batch_idx)
                print(f"Batch {batch_idx}/{len(train_loader)} | Loss: {loss.item():.4f} | "
                      f"Accuracy: {accuracy:.4f} | Recall: {recall:.4f} | Precision: {precision:.4f} | "
                      f"F1: {f1:.4f} | Perplexity: {perplexity:.4f} | Latency: {batch_latency:.3f}s")
                
                # Generate text predictions occasionally for monitoring
                if batch_idx % 60 == 0 and tokenizer is not None:
                    model.eval()
                    with torch.no_grad():
                        # Generate predictions
                        generated_ids = model.t5.generate(
                            input_ids=input_ids[:1],  # Just use first example
                            attention_mask=attention_mask[:1],
                            max_length=8,  # Short for class labels
                            num_beams=2,
                            early_stopping=True
                        )
                        
                        # Decode prediction
                        prediction = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
                        
                        # Get true label
                        label_ids = labels[0].clone()
                        label_ids[label_ids == -100] = tokenizer.pad_token_id
                        true_label = tokenizer.decode(label_ids, skip_special_tokens=True)
                        
                        print(f"Example: Predicted '{prediction}' | True '{true_label}'")
                    model.train()
        
        # Epoch-level metrics
        epoch_latency = time.time() - start_time
        avg_loss = running_loss / len(train_loader)
        avg_accuracy = total_accuracy / len(train_loader)
        avg_precision = total_precision / len(train_loader)
        avg_recall = total_recall / len(train_loader)
        avg_f1 = total_f1 / len(train_loader)
        avg_perplexity = total_perplexity / len(train_loader)
        avg_batch_latency = np.mean(batch_latencies)
        
        epoch_losses.append(avg_loss)
        
        # Store metrics for CSV logging
        metrics_data.append([
            epoch + 1, 
            avg_loss, 
            avg_perplexity,
            avg_accuracy, 
            avg_precision, 
            avg_recall, 
            avg_f1,
            epoch_latency
        ])
        
        # Print epoch summary
        print(f"Epoch {epoch+1} Summary:")
        print(f"  Loss: {avg_loss:.4f}")
        print(f"  Perplexity: {avg_perplexity:.4f}")
        print(f"  Accuracy: {avg_accuracy:.4f}")
        print(f"  Precision: {avg_precision:.4f}")
        print(f"  Recall: {avg_recall:.4f}")
        print(f"  F1 Score: {avg_f1:.4f}")
        print(f"  Avg Batch Latency: {avg_batch_latency:.3f}s")
        print(f"  Epoch Time: {epoch_latency:.2f}s")
        
        # Log metrics to TensorBoard
        writer.add_scalar("Loss/train", avg_loss, epoch)
        writer.add_scalar("Perplexity/train", avg_perplexity, epoch)
        writer.add_scalar("Accuracy/train", avg_accuracy, epoch)
        writer.add_scalar("Precision/train", avg_precision, epoch)
        writer.add_scalar("Recall/train", avg_recall, epoch)
        writer.add_scalar("F1/train", avg_f1, epoch)
        writer.add_scalar("Time/Epoch", epoch_latency, epoch)
    
    # Save model KPIs as CSV
    import pandas as pd
    metrics_df = pd.DataFrame(metrics_data, columns=[
        "Epoch", "Loss", "Perplexity", "Accuracy", "Precision", "Recall", "F1", "Time (s)"
    ])
    metrics_df.to_csv(os.path.join(model_dir, "training_metrics.csv"), index=False)
    
    # Save logits for further analysis
    np.save(os.path.join(model_dir, "logits_samples.npy"), np.array(logits_store))
    
    # Save training loss curve
    import matplotlib.pyplot as plt
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, epochs + 1), epoch_losses, marker='o', linestyle='-', color='b')
    plt.title('Training Loss Over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.grid(True)
    loss_plot_path = os.path.join(model_dir, "training_loss.png")
    plt.savefig(loss_plot_path)
    writer.add_figure("Training Loss", plt.gcf(), close=True)
    
    # Save model weights
    model_path = os.path.join(model_dir, f"{model_alias}.pth")
    torch.save(model.state_dict(), model_path)
    
    # Save PEFT model - this retains LoRA adapters
    peft_model_path = os.path.join(model_dir, "peft_model")
    model.t5.save_pretrained(peft_model_path)
    
    # Restore original forward method
    model.forward = original_forward
    
    writer.flush()
    writer.close()
    
    print(f"Training complete! Model and metrics saved to {model_dir}")
    return model

In [80]:
def evaluate_model(model, test_loader, tokenizer, label_encoder, model_alias):
    """
    Evaluates the T5 model with improved memory handling and metrics calculation.
    Saves metrics, logs, and confusion matrix to the specified model directory.
    
    Args:
        model: The T5 model with LoRA adapters
        test_loader: DataLoader for test data
        tokenizer: The T5 tokenizer
        label_encoder: Encoder to map between labels and indices
        model_alias: Identifier for the model (used for saving)
        
    Returns:
        class_metrics: DataFrame with per-class metrics
        cm_df: DataFrame with confusion matrix
    """
    import os
    import time
    import torch
    import pandas as pd
    import numpy as np
    import seaborn as sns
    import matplotlib.pyplot as plt
    from torch.utils.tensorboard import SummaryWriter
    from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
    from collections import defaultdict
    
    # Device setup with proper fallback
    if torch.backends.mps.is_available():
        device = torch.device("mps")
        print("Using Apple MPS GPU for evaluation")
        os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'  # Prevent OOM errors
    elif torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using NVIDIA CUDA GPU for evaluation")
    else:
        device = torch.device("cpu")
        print("Using CPU for evaluation")
    
    # Additional MPS-specific configurations
    if device.type == 'mps':
        model = model.to(device, dtype=torch.float32)
        torch.set_default_dtype(torch.float32)
    else:
        model = model.to(device)
    
    model.eval()
    
    # Create directory for storing model metrics
    model_dir = f"model-metric/{model_alias}"
    os.makedirs(model_dir, exist_ok=True)
    
    # Initialize TensorBoard writer
    writer = SummaryWriter(log_dir=model_dir)
    
    # Create a robust mapping function between predicted text and actual labels
    # This helps in handling slightly different text predictions
    def robust_label_matching(prediction, label_encoder):
        """Maps prediction text to closest label class"""
        classes = label_encoder.classes_
        prediction = prediction.lower().strip()
        
        # Direct match
        for idx, label in enumerate(classes):
            if prediction == label.lower():
                return idx
        
        # Fuzzy matching - find closest label
        closest_match = None
        highest_similarity = 0
        
        for idx, label in enumerate(classes):
            # Simple overlap check
            if prediction in label.lower() or label.lower() in prediction:
                similarity = len(set(prediction) & set(label.lower())) / max(len(prediction), len(label.lower()))
                if similarity > highest_similarity:
                    highest_similarity = similarity
                    closest_match = idx
        
        if closest_match is not None and highest_similarity > 0.5:  # Threshold for similarity
            return closest_match
        
        # If no good match found, return the first label (default)
        return 0
    
    all_predictions = []
    all_true_labels = []
    all_generated_texts = []
    all_true_texts = []
    batch_latencies = []
    generation_latencies = []
    start_time = time.time()
    
    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            batch_start = time.time()
            
            # Move batch to device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            
            # Generate predictions - time this separately
            generation_start = time.time()
            generated_ids = model.t5.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=64,  # Allow for longer generations
                num_beams=4,    # Use beam search for better quality
                early_stopping=True,
                no_repeat_ngram_size=2  # Prevent repetition
            )
            generation_latency = time.time() - generation_start
            generation_latencies.append(generation_latency)
            
            # Decode generated text predictions
            generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            all_generated_texts.extend(generated_texts)
            
            # Get true label texts
            # First replace -100 padding tokens with pad token ID
            label_ids = labels.clone()
            label_ids[label_ids == -100] = tokenizer.pad_token_id
            true_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
            all_true_texts.extend(true_texts)
            
            # Convert to label indices
            for pred_text, true_text in zip(generated_texts, true_texts):
                pred_idx = robust_label_matching(pred_text, label_encoder)
                
                # Find true label index
                try:
                    true_idx = label_encoder.transform([true_text.strip()])[0]
                except:
                    # If true text not in encoder, find closest match
                    true_idx = robust_label_matching(true_text, label_encoder)
                
                all_predictions.append(pred_idx)
                all_true_labels.append(true_idx)
            
            # Clear memory cache
            if device.type == 'mps':
                torch.mps.empty_cache()
                torch.mps.synchronize()
            elif device.type == 'cuda':
                torch.cuda.empty_cache()
            
            # Track batch latency
            batch_latency = time.time() - batch_start
            batch_latencies.append(batch_latency)
            
            # Log progress
            if batch_idx % 10 == 0:
                print(f"Evaluating batch {batch_idx}/{len(test_loader)} - "
                      f"Generation latency: {generation_latency:.3f}s, "
                      f"Batch latency: {batch_latency:.3f}s")
                
                # Print a few examples
                if batch_idx % 20 == 0:
                    for i in range(min(2, len(generated_texts))):
                        print(f"  Example {i}: Predicted '{generated_texts[i]}' | True '{true_texts[i]}'")
    
    # Overall evaluation time
    eval_time = time.time() - start_time
    class_names = label_encoder.classes_
    
    # Calculate detailed metrics if we have valid predictions
    if len(all_predictions) == 0 or len(all_true_labels) == 0:
        print("No valid predictions found. Check model outputs and label mapping.")
        return None, None
    
    # Analysis of prediction texts vs. true texts
    prediction_analysis = defaultdict(int)
    for pred, true in zip(all_generated_texts, all_true_texts):
        key = f"{true} → {pred}"
        prediction_analysis[key] += 1
    
    # Get top 10 most common predictions 
    top_predictions = sorted(prediction_analysis.items(), key=lambda x: x[1], reverse=True)[:10]
    print("\nTop 10 predictions:")
    for item, count in top_predictions:
        print(f"  {item}: {count}")
    
    # Compute standard metrics
    precision, recall, f1, support = precision_recall_fscore_support(
        all_true_labels, all_predictions, average=None, labels=range(len(class_names)), zero_division=0
    )
    
    # Create per-class metrics DataFrame
    class_metrics = pd.DataFrame({
        'Class': class_names,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Support': support
    })
    
    # Calculate overall metrics
    overall_precision, overall_recall, overall_f1, _ = precision_recall_fscore_support(
        all_true_labels, all_predictions, average='weighted', zero_division=0
    )
    
    # Calculate accuracy
    from sklearn.metrics import accuracy_score
    overall_accuracy = accuracy_score(all_true_labels, all_predictions)
    
    # Print classification report
    from sklearn.metrics import classification_report
    print("\nClassification Report:\n", 
          classification_report(all_true_labels, all_predictions, target_names=class_names, zero_division=0))
    
    # Generate confusion matrix
    cm = confusion_matrix(all_true_labels, all_predictions, labels=range(len(class_names)))
    cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)
    
    # Calculate avg/max latencies
    avg_batch_latency = np.mean(batch_latencies)
    avg_generation_latency = np.mean(generation_latencies)
    max_batch_latency = np.max(batch_latencies)
    max_generation_latency = np.max(generation_latencies)
    
    # Plot confusion matrix
    plt.figure(figsize=(12, 10))
    sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.tight_layout()
    
    # Save confusion matrix plot
    confusion_matrix_path = os.path.join(model_dir, "confusion_matrix.png")
    plt.savefig(confusion_matrix_path)
    writer.add_figure("Confusion Matrix", plt.gcf(), close=True)
    
    # Plot metrics per class
    plt.figure(figsize=(14, 7))
    class_metrics[['Class', 'Precision', 'Recall', 'F1-Score']].set_index('Class').plot(kind='bar')
    plt.title('Performance Metrics by Class')
    plt.ylabel('Score')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.ylim(0, 1.0)
    plt.tight_layout()
    metrics_path = os.path.join(model_dir, "class_metrics.png")
    plt.savefig(metrics_path)
    writer.add_figure("Class Metrics", plt.gcf(), close=True)
    
    # Print overall metrics
    print("\nPer-class Metrics Summary:\n", class_metrics.to_string(index=False))
    print(f"\nOverall Metrics:")
    print(f"  Accuracy: {overall_accuracy:.4f}")
    print(f"  Precision: {overall_precision:.4f}")
    print(f"  Recall: {overall_recall:.4f}")
    print(f"  F1-score: {overall_f1:.4f}")
    print(f"  Eval Time: {eval_time:.2f}s")
    print(f"  Avg Batch Latency: {avg_batch_latency:.4f}s")
    print(f"  Avg Generation Latency: {avg_generation_latency:.4f}s")
    print(f"  Max Batch Latency: {max_batch_latency:.4f}s")
    print(f"  Max Generation Latency: {max_generation_latency:.4f}s")
    
    # Log metrics to TensorBoard
    writer.add_scalar("Accuracy/test", overall_accuracy)
    writer.add_scalar("Precision/test", overall_precision)
    writer.add_scalar("Recall/test", overall_recall)
    writer.add_scalar("F1-score/test", overall_f1)
    writer.add_scalar("EvaluationTime/test", eval_time)
    writer.add_scalar("BatchLatency/test", avg_batch_latency)
    writer.add_scalar("GenerationLatency/test", avg_generation_latency)
    
    # Log per-class metrics
    for i, class_name in enumerate(class_names):
        writer.add_scalar(f"Precision/{class_name}", precision[i])
        writer.add_scalar(f"Recall/{class_name}", recall[i])
        writer.add_scalar(f"F1-score/{class_name}", f1[i])
    
    # Save text predictions for error analysis
    predictions_df = pd.DataFrame({
        'True_Label': all_true_texts,
        'Predicted_Text': all_generated_texts,
        'True_Index': all_true_labels,
        'Predicted_Index': all_predictions,
        'Correct': [p == t for p, t in zip(all_predictions, all_true_labels)]
    })
    predictions_df.to_csv(os.path.join(model_dir, "predictions.csv"), index=False)
    
    # Save evaluation metrics
    class_metrics.to_csv(os.path.join(model_dir, "class_metrics.csv"), index=False)
    cm_df.to_csv(os.path.join(model_dir, "confusion_matrix.csv"))
    
    # Save latency metrics
    latency_df = pd.DataFrame({
        'Metric': ['Eval Time', 'Avg Batch Latency', 'Avg Generation Latency', 
                  'Max Batch Latency', 'Max Generation Latency'],
        'Value': [eval_time, avg_batch_latency, avg_generation_latency, 
                 max_batch_latency, max_generation_latency]
    })
    latency_df.to_csv(os.path.join(model_dir, "latency_metrics.csv"), index=False)
    
    # Save overall metrics
    overall_metrics_df = pd.DataFrame({
        'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score'],
        'Value': [overall_accuracy, overall_precision, overall_recall, overall_f1]
    })
    overall_metrics_df.to_csv(os.path.join(model_dir, "overall_metrics.csv"), index=False)
    
    writer.flush()
    writer.close()
    
    print(f"\nEvaluation complete! Results saved to {model_dir}")
    return class_metrics, cm_df

In [117]:
def export_to_onnx(model, tokenizer, model_alias):
    """Exports the model to ONNX format with proper handling for T5 models."""
    print("Starting ONNX export process...")
    
    try:
        # Move model to CPU for export
        model = model.eval().to("cpu")
        
        # Create sample input
        prefix = "classify: "
        text = prefix + "This is a sample test input"
        
        # Tokenize inputs
        sample_inputs = tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=512,
            return_tensors="pt"
        )
        
        # Create model directory
        model_dir = f"model-metric/{model_alias}"
        os.makedirs(model_dir, exist_ok=True)
        onnx_path = os.path.join(model_dir, f"{model_alias}.onnx")
        
        # Create a custom wrapper class to handle T5 complexity
        class T5GenerationWrapper(torch.nn.Module):
            def __init__(self, t5_model, config):
                super(T5GenerationWrapper, self).__init__()
                self.model = t5_model
                self.config = config
                
            def forward(self, input_ids, attention_mask):
                # Create decoder_input_ids
                batch_size = input_ids.shape[0]
                decoder_input_ids = torch.ones(
                    (batch_size, 1), 
                    dtype=torch.long
                ) * self.config.decoder_start_token_id
                
                # For ONNX export, we need to do a forward pass, not generation
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    decoder_input_ids=decoder_input_ids,
                    return_dict=True
                )
                
                # Return logits tensor
                return outputs.logits
        
        print("Creating T5 ONNX wrapper...")
        
        # Get the T5 config
        config = model.t5.config
        
        # Create the wrapper with the t5 model and its config
        if hasattr(model, 't5'):
            t5_model = model.t5
        else:
            t5_model = model  # If it's already the T5 model
            
        onnx_model = T5GenerationWrapper(t5_model, config)
        
        print("Beginning ONNX export...")
        
        # Define input and output names
        input_names = ["input_ids", "attention_mask"]
        output_names = ["logits"]
        
        # Export to ONNX
        torch.onnx.export(
            onnx_model,
            (sample_inputs["input_ids"], sample_inputs["attention_mask"]),
            onnx_path,
            input_names=input_names,
            output_names=output_names,
            dynamic_axes={
                "input_ids": {0: "batch", 1: "sequence"},
                "attention_mask": {0: "batch", 1: "sequence"},
                "logits": {0: "batch", 1: "sequence", 2: "vocab_size"}
            },
            opset_version=15,  # Use higher opset version for better compatibility
            do_constant_folding=True,
            verbose=True
        )
        
        print(f"ONNX model successfully exported to {onnx_path}")
        return onnx_path
        
    except Exception as e:
        print(f"ONNX export failed with error: {str(e)}")
        traceback_str = traceback.format_exc()
        print(f"Traceback: {traceback_str}")
        
        # Create a minimal dummy ONNX model if export fails
        try:
            # Create a directory for the model
            model_dir = f"model-metric/{model_alias}"
            os.makedirs(model_dir, exist_ok=True)
            
            # Path for the dummy model
            dummy_onnx_path = os.path.join(model_dir, f"{model_alias}_dummy.onnx")
            
            print(f"Creating minimal dummy ONNX model at {dummy_onnx_path}")
            
            # Simple dummy model
            class DummyModel(torch.nn.Module):
                def forward(self, input_ids, attention_mask):
                    # Simple identity operation
                    return torch.zeros((input_ids.shape[0], input_ids.shape[1], 32128), dtype=torch.float32)
            
            dummy_model = DummyModel()
            dummy_inputs = (
                torch.ones((1, 10), dtype=torch.long),
                torch.ones((1, 10), dtype=torch.long)
            )
            
            torch.onnx.export(
                dummy_model,
                dummy_inputs,
                dummy_onnx_path,
                input_names=["input_ids", "attention_mask"],
                output_names=["logits"],
                dynamic_axes={
                    "input_ids": {0: "batch", 1: "sequence"},
                    "attention_mask": {0: "batch", 1: "sequence"},
                    "logits": {0: "batch", 1: "sequence", 2: "vocab_size"}
                },
                opset_version=12
            )
            
            print(f"Created dummy ONNX model at {dummy_onnx_path}")
            return dummy_onnx_path
            
        except Exception as e2:
            print(f"Failed to create dummy ONNX model: {str(e2)}")
            return None

def run_onnx_inference(onnx_path, input_ids, attention_mask):
    """Runs inference using ONNX Runtime with proper error handling."""
    if onnx_path is None:
        print("No valid ONNX model path provided")
        return None
        
    try:
        import traceback
        import numpy as np
        
        print(f"Starting ONNX inference with model: {onnx_path}")
        
        # Verify the file exists
        if not os.path.exists(onnx_path):
            print(f"ONNX model file not found at {onnx_path}")
            return None
            
        # Get available providers
        available_providers = ort.get_available_providers()
        print(f"Available ONNX Runtime providers: {available_providers}")
        
        # Select appropriate providers
        selected_providers = []
        if 'CUDAExecutionProvider' in available_providers:
            selected_providers.append('CUDAExecutionProvider')
        selected_providers.append('CPUExecutionProvider')
        
        print(f"Using providers: {selected_providers}")
        
        # Create ONNX Runtime session
        try:
            ort_session = ort.InferenceSession(onnx_path, providers=selected_providers)
            print("Successfully created ONNX Runtime session")
        except Exception as e:
            print(f"Error creating ONNX Runtime session with specified providers: {e}")
            print("Falling back to default provider configuration")
            ort_session = ort.InferenceSession(onnx_path)
        
        # Print input names expected by the model
        input_names = [input.name for input in ort_session.get_inputs()]
        output_names = [output.name for output in ort_session.get_outputs()]
        print(f"ONNX model expects inputs: {input_names}")
        print(f"ONNX model provides outputs: {output_names}")
        
        # Convert PyTorch tensors to NumPy arrays
        input_ids_np = input_ids.cpu().numpy()
        attention_mask_np = attention_mask.cpu().numpy()
        
        print(f"Input shapes - input_ids: {input_ids_np.shape}, attention_mask: {attention_mask_np.shape}")
        
        # Create input dictionary
        ort_inputs = {
            "input_ids": input_ids_np,
            "attention_mask": attention_mask_np
        }
        
        # Run inference
        print("Running ONNX inference...")
        ort_outputs = ort_session.run(None, ort_inputs)
        
        # Check output
        if not ort_outputs or len(ort_outputs) == 0:
            print("ONNX Runtime returned empty outputs")
            return None
            
        print(f"ONNX output shape: {ort_outputs[0].shape}")
        
        # Convert output back to PyTorch tensor
        return torch.tensor(ort_outputs[0], dtype=torch.float32)
    
    except Exception as e:
        print(f"ONNX inference failed with error: {str(e)}")
        traceback_str = traceback.format_exc()
        print(f"Traceback: {traceback_str}")
        return None

In [118]:
def compare_inference_performance(model, tokenizer, test_df, label_encoder, model_alias):
    """Compares inference performance between PyTorch and ONNX Runtime."""
    model_dir = f"model-metric/{model_alias}"
    os.makedirs(model_dir, exist_ok=True)
    writer = SummaryWriter(log_dir=model_dir)

    # Ensure device is properly defined
    device = torch.device("cuda" if torch.cuda.is_available() else 
                        ("mps" if torch.backends.mps.is_available() else "cpu"))
    
    # Create a small batch for testing
    sample_batch = test_df.sample(min(50, len(test_df)))
    test_dataset_batch = T5Dataset(sample_batch, tokenizer)
    test_loader_batch = DataLoader(test_dataset_batch, batch_size=1, shuffle=False)
    
    # Get a single batch
    batch = next(iter(test_loader_batch))
    
    # Properly extract tensors from the batch dictionary
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    
    # Ensure tensors are the right shape
    print(f"Input IDs shape: {input_ids.shape}")
    print(f"Attention mask shape: {attention_mask.shape}")
    
    # PyTorch Inference
    model.eval()
    model = model.to(device)
    
    start_time_torch = time.time()
    with torch.no_grad():
        # For T5 models, we need to create decoder inputs
        # Create a decoder_input_ids tensor starting with the model's decoder_start_token_id
        # Usually for T5 this is the pad token id
        decoder_input_ids = torch.ones(
            (input_ids.shape[0], 1), 
            dtype=torch.long, 
            device=device
        ) * model.t5.config.decoder_start_token_id
        
        # Use generate() instead of forward() for inference with T5
        generated_ids = model.t5.generate(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
            max_length=32,
            num_beams=4,
            early_stopping=True
        )
        
        # Get predictions by decoding the generated ids
        predictions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        print(f"Generated predictions: {predictions}")
        
        # For analysis purposes, try to get logits from a forward pass if needed
        try:
            # Try with decoder_input_ids explicitly specified
            outputs = model.t5(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                decoder_input_ids=decoder_input_ids
            )
            logits = outputs.logits if hasattr(outputs, 'logits') else outputs[0]
            logits = logits.to('cpu')
        except Exception as e:
            print(f"Warning: Could not get logits: {e}")
            # Create dummy logits for timing purposes
            logits = torch.zeros((input_ids.shape[0], 1, model.t5.config.vocab_size), device='cpu')
        
    latency_torch = time.time() - start_time_torch
    throughput_torch = len(sample_batch) / latency_torch

    # ONNX Inference - This part needs special handling due to T5 model complexity
    try:
        onnx_path = export_to_onnx(model, tokenizer, model_alias)
        start_time_onnx = time.time()
        onnx_outputs = run_onnx_inference(onnx_path, input_ids, attention_mask)
        latency_onnx = time.time() - start_time_onnx
        throughput_onnx = len(sample_batch) / latency_onnx
        print(f"ONNX Inference - Latency: {latency_onnx:.4f}s, Throughput: {throughput_onnx:.2f} samples/s")
    except Exception as e:
        print(f"ONNX export/inference failed: {e}")
        latency_onnx = float('inf')
        throughput_onnx = 0
        onnx_outputs = None

    print(f"PyTorch Inference - Latency: {latency_torch:.4f}s, Throughput: {throughput_torch:.2f} samples/s")
    
    # Modified prediction handling to accommodate T5 model
    if onnx_outputs is not None:
        # For PyTorch predictions
        if hasattr(model, 'predict'):
            # Use custom predict method if available 
            torch_preds = model.predict(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                tokenizer=tokenizer
            )
            torch_preds = [label_encoder.transform([pred.strip()])[0] for pred in torch_preds]
        else:
            # Fall back to argmax if predict not available
            torch_preds = torch.argmax(logits, dim=-1).tolist()
            
        # For ONNX predictions (if successful)
        onnx_preds = torch.argmax(onnx_outputs, dim=-1).tolist()
        
        # Getting actual labels requires decoding the T5 label IDs
        label_ids = labels.clone()
        label_ids[label_ids == -100] = tokenizer.pad_token_id
        actual_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
        actual_labels = [label_encoder.transform([text.strip()])[0] for text in actual_texts]
        
        # Compare predictions if we have ONNX results
        try:
            torch_report = classification_report(actual_labels, torch_preds, 
                                               target_names=label_encoder.classes_, 
                                               output_dict=True)
            onnx_report = classification_report(actual_labels, onnx_preds, 
                                              target_names=label_encoder.classes_, 
                                              output_dict=True)
            
            torch_df = pd.DataFrame(torch_report).transpose()
            onnx_df = pd.DataFrame(onnx_report).transpose()
            
            torch_df.to_csv(os.path.join(model_dir, "torch_classification_report.csv"))
            onnx_df.to_csv(os.path.join(model_dir, "onnx_classification_report.csv"))
        except Exception as e:
            print(f"Error generating classification reports: {e}")
            torch_df = pd.DataFrame()
            onnx_df = pd.DataFrame()
    else:
        # Only compute PyTorch metrics if ONNX failed
        if hasattr(model, 'predict'):
            torch_preds = model.predict(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                tokenizer=tokenizer
            )
            torch_preds = [label_encoder.transform([pred.strip()])[0] for pred in torch_preds]
        else:
            torch_preds = torch.argmax(logits, dim=-1).tolist()
            
        label_ids = labels.clone()
        label_ids[label_ids == -100] = tokenizer.pad_token_id
        actual_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
        actual_labels = [label_encoder.transform([text.strip()])[0] for text in actual_texts]
        
        try:
            torch_report = classification_report(actual_labels, torch_preds, 
                                               target_names=label_encoder.classes_, 
                                               output_dict=True)
            torch_df = pd.DataFrame(torch_report).transpose()
            torch_df.to_csv(os.path.join(model_dir, "torch_classification_report.csv"))
            onnx_df = pd.DataFrame()
        except Exception as e:
            print(f"Error generating PyTorch classification report: {e}")
            torch_df = pd.DataFrame()
            onnx_df = pd.DataFrame()
    
    # Log metrics to TensorBoard
    writer.add_scalar("Latency/PyTorch", latency_torch)
    writer.add_scalar("Throughput/PyTorch", throughput_torch)
    
    if onnx_outputs is not None:
        writer.add_scalar("Latency/ONNX", latency_onnx)
        writer.add_scalar("Throughput/ONNX", throughput_onnx)
    
    writer.flush()
    writer.close()
    
    return torch_df, onnx_df

In [119]:
def compute_class_weights(labels, num_classes):
    counter = Counter(labels)
    total_samples = len(labels)
    weights = [total_samples / (num_classes * counter[i]) for i in range(num_classes)]
    return torch.tensor(weights, dtype=torch.float)

In [120]:
MODEL_NAME = "google-t5/t5-small"
model_alias = 't5-small-lora'
update_model_dict(model_alias, MODEL_NAME)

In [121]:
print(f"device: {get_device()}")

Using Apple MPS GPU
device: mps


In [122]:
df, label_encoder = load_and_preprocess_data()
balanced_df = balance_dataset(df)
balanced_df['conversation'] = balanced_df['conversation'].apply(preprocess_conversation)

Original distribution:
 issue_area
Cancellations and returns    286
Order                        270
Login and Account            151
Shopping                     116
Warranty                     105
Shipping                      72
Name: count, dtype: int64


In [123]:
df.columns

Index(['conversation', 'issue_area', 'labels'], dtype='object')

In [124]:
# Tokenization and DataLoaders
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [125]:
# Create DataLoaders
train_loader, test_loader, test_df = create_dataloaders(balanced_df, tokenizer)

In [126]:
# Model Initialization and Training
num_classes = len(label_encoder.classes_)
model = T5SmallWithLoRA()

class_weights = compute_class_weights(balanced_df['labels'], num_classes)

In [90]:
trained_model = train_model(
    model=model,
    train_loader=train_loader,
    model_alias=model_alias,
    epochs=10,
    learning_rate=5e-5,
    tokenizer=tokenizer
)

Using Apple MPS GPU

Epoch 1/10
Batch 0/57 | Loss: 2.1604 | Accuracy: 0.0294 | Recall: 0.0294 | Precision: 0.0588 | F1: 0.0392 | Perplexity: 15.3601 | Latency: 0.714s
Example: Predicted '' | True 'Shopping'
Batch 20/57 | Loss: 1.8366 | Accuracy: 0.0323 | Recall: 0.0323 | Precision: 0.0323 | F1: 0.0323 | Perplexity: 16.1005 | Latency: 0.408s
Batch 40/57 | Loss: 1.7740 | Accuracy: 0.0000 | Recall: 0.0000 | Precision: 0.0000 | F1: 0.0000 | Perplexity: 32.5937 | Latency: 0.415s
Epoch 1 Summary:
  Loss: 7.7636
  Perplexity: 27.9454
  Accuracy: 0.0517
  Precision: 0.0951
  Recall: 0.0517
  F1 Score: 0.0621
  Avg Batch Latency: 0.422s
  Epoch Time: 25.03s

Epoch 2/10
Batch 0/57 | Loss: 1.4565 | Accuracy: 0.0500 | Recall: 0.0500 | Precision: 0.0250 | F1: 0.0333 | Perplexity: 80.8076 | Latency: 0.462s
Example: Predicted '' | True 'Shipping'
Batch 20/57 | Loss: 1.1078 | Accuracy: 0.2500 | Recall: 0.2500 | Precision: 0.4500 | F1: 0.3167 | Perplexity: 33.3952 | Latency: 0.389s
Batch 40/57 | Loss: 

In [112]:
    # Evaluate model
print("Evaluating model...")
class_metrics, cm = evaluate_model(
    trained_model, 
    test_loader, 
    tokenizer, 
    label_encoder, 
    model_alias
)

Evaluating model...
Using Apple MPS GPU for evaluation
Evaluating batch 0/19 - Generation latency: 3.323s, Batch latency: 3.332s
  Example 0: Predicted 'Wendendtrutrues towstwiss-truth-to-read-in-re-ordering-where-saved-wift-all-Ge-distribution-me-first-ever-detail-' | True 'Login and Account'
  Example 1: Predicted 'Login and Account' | True 'Login and Account'
Evaluating batch 10/19 - Generation latency: 2.827s, Batch latency: 2.831s

Top 10 predictions:
  Login and Account → Login and Account: 21
  Warranty → Warranty: 17
  Shipping → Shipping: 16
  Order → Order: 13
  Shopping → Shopping: 13
  Cancellations and returns → Cancellations and returns: 10
  Order → Shopping: 6
  Cancellations and returns → Login and Account: 3
  Shopping → Shopping Shopping: 3
  Cancellations and returns → Order: 3

Classification Report:
                            precision    recall  f1-score   support

Cancellations and returns       0.26      0.70      0.38        20
        Login and Account      

<Figure size 1400x700 with 0 Axes>

In [128]:
compare_inference_performance(trained_model, tokenizer, test_df, label_encoder, model_alias=model_alias)

Input IDs shape: torch.Size([1, 512])
Attention mask shape: torch.Size([1, 512])
Generated predictions: ['']
Starting ONNX export process...
Creating T5 ONNX wrapper...
Beginning ONNX export...
ONNX model successfully exported to model-metric/t5-small-lora/t5-small-lora.onnx
Starting ONNX inference with model: model-metric/t5-small-lora/t5-small-lora.onnx
Available ONNX Runtime providers: ['CoreMLExecutionProvider', 'AzureExecutionProvider', 'CPUExecutionProvider']
Using providers: ['CPUExecutionProvider']
Successfully created ONNX Runtime session
ONNX model expects inputs: ['input_ids', 'attention_mask']
ONNX model provides outputs: ['logits']
Input shapes - input_ids: (1, 512), attention_mask: (1, 512)
Running ONNX inference...
ONNX output shape: (1, 1, 32128)
ONNX Inference - Latency: 0.4167s, Throughput: 119.99 samples/s
PyTorch Inference - Latency: 3.3246s, Throughput: 15.04 samples/s


RuntimeError: Placeholder storage has not been allocated on MPS device!

In [114]:
# Save tokenizer
tokenizer_path = f"model-metric/{model_alias}/tokenizer/"
print(f"Saving tokenizer to {tokenizer_path}")
tokenizer.save_pretrained(tokenizer_path)

Saving tokenizer to model-metric/t5-small-lora/tokenizer/


('model-metric/t5-small-lora/tokenizer/tokenizer_config.json',
 'model-metric/t5-small-lora/tokenizer/special_tokens_map.json',
 'model-metric/t5-small-lora/tokenizer/spiece.model',
 'model-metric/t5-small-lora/tokenizer/added_tokens.json',
 'model-metric/t5-small-lora/tokenizer/tokenizer.json')

In [115]:
print(f"Training and evaluation completed. Model and metrics saved in model-metric/{model_alias}/")

Training and evaluation completed. Model and metrics saved in model-metric/t5-small-lora/


In [None]:
df['conversation'].str.(

In [None]:
df['issue_area'].unique()