In [1]:
# pip install onnxruntime fastapi

In [73]:
# --- Common Utilities and Setup ---
import os
import json
import torch
import transformers
import accelerate
import huggingface_hub
import peft
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.preprocessing import LabelEncoder
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support, classification_report, accuracy_score
from transformers import T5Tokenizer, T5ForConditionalGeneration, get_linear_schedule_with_warmup
from sklearn.utils import resample
from collections import Counter
import time
import onnxruntime as ort
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List
import pickle


import torch
import torch.nn as nn
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
import time
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

import warnings

print("peft:", peft.__version__)
print("Torch:", torch.__version__)
print("Transformers:", transformers.__version__)
print("Accelerate:", accelerate.__version__)
print("Huggingface Hub:", huggingface_hub.__version__)

# Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

peft: 0.15.0
Torch: 2.6.0
Transformers: 4.50.0
Accelerate: 1.5.2
Huggingface Hub: 0.29.3


In [74]:
# Device Selection Function
def get_device():
    if torch.backends.mps.is_available():
        device = torch.device("mps")
        print("Using Apple MPS GPU")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using NVIDIA CUDA GPU")
    else:
        device = torch.device("cpu")
        print("Using CPU")
    return device

In [75]:
def update_model_dict(model_alias, MODEL_NAME):
    if not os.path.exists('model_dict.json'):
        model_dict = {}
    else:
        with open('model_dict.json', 'r') as file:
            model_dict = json.load(file)

    model_dict[model_alias] = MODEL_NAME

    with open('model_dict.json', 'w') as file:
        json.dump(model_dict, file)

In [76]:
def load_and_preprocess_data(filepath="train-00000-of-00001-a5a7c6e4bb30b016.parquet"):
    """Loads and preprocesses the dataset."""
    df = pd.read_parquet(filepath)
    df = df[['conversation', 'issue_area']]
    print("Original distribution:\n", df['issue_area'].value_counts())
    label_encoder = LabelEncoder()
    df["labels"] = label_encoder.fit_transform(df["issue_area"])

    #saving Label-encoder
    label_encoder_path = f"model-metric/{model_alias}/label_encoder.pkl"
    os.makedirs(os.path.dirname(label_encoder_path), exist_ok=True)
    with open(label_encoder_path, "wb") as f:
        pickle.dump(label_encoder, f)

    return df, label_encoder

In [77]:
def balance_dataset(df, max_count=100, random_state=42):
    """Balances the dataset using oversampling."""
    balanced_df = pd.DataFrame()
    for issue in df['issue_area'].unique():
        subset = df[df['issue_area'] == issue]
        balanced_subset = resample(subset, replace=True, n_samples=max_count, random_state=random_state)
        balanced_df = pd.concat([balanced_df, balanced_subset])
    return balanced_df.sample(frac=1, random_state=random_state).reset_index(drop=True)


In [78]:
def preprocess_conversation(conversation):
    """Preprocesses a conversation."""
    if isinstance(conversation, list):
        return " ".join([turn.get('text', '') for turn in conversation if isinstance(turn, dict)])
    return str(conversation) #.lower()

In [79]:
class CustomT5Dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_source_length=300, max_target_length=10):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        conversation = row["conversation"]
        label = row["labels"]

        # Convert label to text representation
        label_text = label_encoder.inverse_transform([label])[0]

        # Prepare source (conversation) and target (intent) encodings
        source_encoding = self.tokenizer(
            conversation,
            max_length=self.max_source_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        target_encoding = self.tokenizer(
            label_text,
            max_length=self.max_target_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": source_encoding["input_ids"].squeeze(),
            "attention_mask": source_encoding["attention_mask"].squeeze(),
            "labels": target_encoding["input_ids"].squeeze()
        }

In [80]:
def create_dataloaders(df, tokenizer, batch_size=8, train_ratio=0.75):
    """Creates train and test DataLoaders."""
    train_size = int(train_ratio * len(df))
    train_df, test_df = df[:train_size], df[train_size:]
    train_dataset = CustomT5Dataset(train_df, tokenizer)
    test_dataset = CustomT5Dataset(test_df, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, test_loader, test_df

In [81]:
class FlanT5WithLoRA(nn.Module):
    def __init__(self, num_labels, lora_r=8, lora_alpha=16, lora_dropout=0.1):
        super(FlanT5WithLoRA, self).__init__()
        # Load the base T5 model
        self.t5 = T5ForConditionalGeneration.from_pretrained(
            "google/flan-t5-base"
        )

        # LoRA Configuration
        lora_config = LoraConfig(
            task_type=TaskType.SEQ_2_SEQ_LM,
            r=lora_r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            # target_modules=["q", "k", "v", "o"]
            target_modules=["q",  "v"]
        )
        self.t5 = get_peft_model(self.t5, lora_config)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.t5(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        return outputs


In [82]:
# Function to compute class weights
def compute_class_weights(labels, num_classes):
    counter = Counter(labels)
    total_samples = len(labels)
    weights = [total_samples / (num_classes * counter[i]) for i in range(num_classes)]
    return torch.tensor(weights, dtype=torch.float)

In [83]:
def train_model(model, train_loader, tokenizer, model_alias, epochs=3, learning_rate=5e-5):
    """Trains the T5 model and saves logs, metrics, and model weights."""

    device = get_device()

        # Additional MPS-specific configurations
    if device.type == 'mps':
        # Ensure float32 dtype for MPS compatibility
        model = model.to(device, dtype=torch.float32)

        # Some tensor operations may need explicit conversion
        torch.set_default_dtype(torch.float32)
    else:
        model = model.to(device)

    model.train()

    # Create directory for storing model metrics
    model_dir = f"model-metric/{model_alias}"
    os.makedirs(model_dir, exist_ok=True)

    # TensorBoard writer in the model directory
    writer = SummaryWriter(log_dir=model_dir)

    # Set up optimizer and learning rate scheduler
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )

    epoch_losses = []
    metrics_data = []


    # Predefined mapping if needed
    label_mapping = {
        label.lower(): label for label in label_encoder.classes_
    }

    for epoch in range(epochs):
        start_time = time.time()
        total_loss = 0
        all_preds, all_labels = [], []

        for batch_idx, batch in enumerate(train_loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

            # Predict intents by generating text
            with torch.no_grad():
                generated_ids = model.t5.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_length=128
                )

                # Decode generated texts
                preds = [tokenizer.decode(g, skip_special_tokens=True).lower().strip() for g in generated_ids]
                # true_labels = [tokenizer.decode(l, skip_special_tokens=True).lower().strip() for l in labels]
                # true_labels = [tokenizer.decode(l.tolist(), skip_special_tokens=True).lower().strip() for l in labels]
                true_labels = [tokenizer.decode(l, skip_special_tokens=True).split()[0].lower().strip() for l in labels.cpu().tolist()]



                # Robust label matching
                def match_label(pred):
                    # Exact match
                    if pred in label_mapping:
                        return label_mapping[pred]

                    # Partial match
                    for label, mapped_label in label_mapping.items():
                        if pred in label or label in pred:
                            return mapped_label

                    # Fallback to the first label if no match
                    return label_encoder.classes_[0]

                # Convert predictions and true labels
                pred_indices = [label_encoder.transform([match_label(p)])[0] for p in preds]
                # true_indices = labels.cpu().tolist()
                # true_indices = [label_encoder.transform([match_label(tokenizer.decode(l[0], skip_special_tokens=True).lower().strip())])[0] for l in labels.cpu().tolist()]
                true_indices = [label_encoder.transform([match_label(t)])[0] for t in true_labels]


                all_preds.extend(pred_indices)
                all_labels.extend(true_indices)

            print(f"all_preds type: {type(all_preds)}, shape: {len(all_preds)}, example: {all_preds[:5]}")
            print(f"all_labels type: {type(all_labels)}, shape: {len(all_labels)}, example: {all_labels[:5]}")

            # Log batch loss every 10 batches
            if batch_idx % 10 == 0:
                writer.add_scalar("BatchLoss/train", loss.item(), epoch * len(train_loader) + batch_idx)

        # Rest of the function remains the same...

        # Compute epoch metrics
        avg_loss = total_loss / len(train_loader)
        epoch_losses.append(avg_loss)
        accuracy = accuracy_score(all_labels, all_preds)
        precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
        epoch_time = time.time() - start_time

        # Store metrics for CSV logging
        metrics_data.append([epoch + 1, avg_loss, accuracy, precision, recall, f1, epoch_time])

        # Print metrics
        print(f"Epoch {epoch+1}: Loss={avg_loss:.4f}, Accuracy={accuracy:.4f}, Precision={precision:.4f}, Recall={recall:.4f}, F1-score={f1:.4f}, Time={epoch_time:.2f}s")

        # Log metrics to TensorBoard
        writer.add_scalar("Loss/train", avg_loss, epoch)
        writer.add_scalar("Accuracy/train", accuracy, epoch)
        writer.add_scalar("Precision/train", precision, epoch)
        writer.add_scalar("Recall/train", recall, epoch)
        writer.add_scalar("F1-score/train", f1, epoch)
        writer.add_scalar("Time/Epoch", epoch_time, epoch)

    # Save model KPIs as CSV
    metrics_df = pd.DataFrame(metrics_data, columns=["Epoch", "Loss", "Accuracy", "Precision", "Recall", "F1-score", "Time (s)"])
    metrics_df.to_csv(os.path.join(model_dir, "training_metrics.csv"), index=False)

    # Save training loss curve
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, epochs + 1), epoch_losses, marker='o', linestyle='-', color='b')
    plt.title('Training Loss Over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.grid(True)
    loss_plot_path = os.path.join(model_dir, "training_loss.png")
    plt.savefig(loss_plot_path)
    writer.add_figure("Training Loss", plt.gcf(), close=True)

    # Save model weights
    model_path = os.path.join(model_dir, f"{model_alias}.pth")
    torch.save(model.state_dict(), model_path)

    writer.flush()
    writer.close()
    return model


In [84]:
import os
import time
import torch
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

def evaluate_model(model, test_loader, tokenizer, label_encoder, model_alias):
    """Evaluates the T5 model and saves metrics, logs, and confusion matrix."""

    device = get_device()

        # Additional MPS-specific configurations
    if device.type == 'mps':
        model = model.to(device, dtype=torch.float32)
        torch.set_default_dtype(torch.float32)
    else:
        model = model.to(device)

    model.eval()

    # Predefined mapping
    label_mapping = {
        label.lower(): label for label in label_encoder.classes_
    }

    # Robust label matching function
    def match_label(pred):
        # Exact match
        if pred in label_mapping:
            return label_mapping[pred]

        # Partial match
        for label, mapped_label in label_mapping.items():
            if pred in label or label in pred:
                return mapped_label

        # Fallback to the first label if no match
        return label_encoder.classes_[0]

    # Create directory for storing model metrics
    model_dir = f"model-metric/{model_alias}"
    os.makedirs(model_dir, exist_ok=True)

    # Initialize TensorBoard writer
    writer = SummaryWriter(log_dir=model_dir)

    all_preds, all_labels = [], []
    start_time = time.time()

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Generate predictions
            generated_ids = model.t5.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=128
            )

            # Decode generated texts
            preds = [tokenizer.decode(g, skip_special_tokens=True).lower().strip() for g in generated_ids]
            true_labels = [tokenizer.decode(l, skip_special_tokens=True).lower().strip() for l in labels]

            # Convert predicted and true intents to indices
            pred_indices = [label_encoder.transform([match_label(p)])[0] for p in preds]
            true_indices = [label_encoder.transform([match_label(t)])[0] for t in true_labels]

            all_preds.extend(pred_indices)
            all_labels.extend(true_indices)

    eval_time = time.time() - start_time
    class_names = label_encoder.classes_

    # Compute metrics
    precision, recall, f1, support = precision_recall_fscore_support(all_labels, all_preds, average=None)
    class_metrics = pd.DataFrame({
        'Class': class_names,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Support': support
    })

    overall_precision, overall_recall, overall_f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')

    # Print and save classification report
    print("\nClassification Report:\n", classification_report(all_labels, all_preds, target_names=class_names))

    # Generate confusion matrix
    cm = confusion_matrix(all_labels, all_preds)
    cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

    # Plot confusion matrix
    plt.figure(figsize=(12, 10))
    sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.tight_layout()

    # Save confusion matrix plot
    confusion_matrix_path = os.path.join(model_dir, "confusion_matrix.png")
    plt.savefig(confusion_matrix_path)
    writer.add_figure("Confusion Matrix", plt.gcf(), close=True)

    # Print overall metrics
    print("\nPer-class Metrics:\n", class_metrics.to_string(index=False))
    print(f"\nOverall Metrics:\nPrecision: {overall_precision:.4f}, Recall: {overall_recall:.4f}, F1-score: {overall_f1:.4f}, Eval Time: {eval_time:.2f}s")

    # Log metrics to TensorBoard
    writer.add_scalar("Precision/test", overall_precision)
    writer.add_scalar("Recall/test", overall_recall)
    writer.add_scalar("F1-score/test", overall_f1)
    writer.add_scalar("Evaluation Time", eval_time)

    # Log per-class metrics
    for i, class_name in enumerate(class_names):
        writer.add_scalar(f"Precision/{class_name}", precision[i])
        writer.add_scalar(f"Recall/{class_name}", recall[i])
        writer.add_scalar(f"F1-score/{class_name}", f1[i])

    writer.flush()
    writer.close()

    # Save evaluation metrics
    class_metrics.to_csv(os.path.join(model_dir, "class_metrics.csv"), index=False)
    cm_df.to_csv(os.path.join(model_dir, "confusion_matrix.csv"))

    return class_metrics, cm_df


In [85]:
import os
import time
import torch
import numpy as np
import onnxruntime as ort
import pandas as pd
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report
from torch.utils.tensorboard import SummaryWriter

def export_to_onnx(model, tokenizer, model_alias):
    """Exports the model to ONNX format."""
    model.eval().to("cpu")
    sample_input = tokenizer("test", return_tensors="pt")
    input_names = ["input_ids", "attention_mask"]
    output_names = ["output"]

    model_dir = f"model-metric/{model_alias}"
    os.makedirs(model_dir, exist_ok=True)
    onnx_path = os.path.join(model_dir, f"{model_alias}.onnx")

    torch.onnx.export(
        model,
        (sample_input["input_ids"], sample_input["attention_mask"]),
        onnx_path,
        input_names=input_names,
        output_names=output_names,
        dynamic_axes={
            "input_ids": {0: "batch", 1: "sequence"},
            "attention_mask": {0: "batch", 1: "sequence"},
            "output": {0: "batch"}
        }
    )
    print(f"ONNX model exported to {onnx_path}")
    return onnx_path

def run_onnx_inference(onnx_path, input_ids, attention_mask):
    """Runs inference using ONNX Runtime."""
    ort_session = ort.InferenceSession(onnx_path)
    ort_inputs = {"input_ids": input_ids.tolist(), "attention_mask": attention_mask.tolist()}
    ort_outputs = ort_session.run(None, ort_inputs)
    return torch.tensor(np.array(ort_outputs[0]), dtype=torch.float32)

def compare_inference_performance(model, tokenizer, test_df, label_encoder, model_alias):
    """Compares inference performance between PyTorch and ONNX Runtime."""
    model_dir = f"model-metric/{model_alias}"
    os.makedirs(model_dir, exist_ok=True)
    writer = SummaryWriter(log_dir=model_dir)

    sample_batch = test_df.sample(50)
    test_dataset_batch = CustomDataset(sample_batch, tokenizer)
    test_loader_batch = DataLoader(test_dataset_batch, batch_size=len(sample_batch), shuffle=False)
    batch = next(iter(test_loader_batch))
    input_ids, attention_mask, labels = batch

    # PyTorch Inference
    model.eval().to(device)
    start_time_torch = time.time()
    with torch.no_grad():
        torch_outputs = model(input_ids.to(device), attention_mask.to(device)).cpu()
    latency_torch = time.time() - start_time_torch
    throughput_torch = len(sample_batch) / latency_torch

    # ONNX Inference
    onnx_path = export_to_onnx(model, tokenizer, model_alias)
    start_time_onnx = time.time()
    onnx_outputs = run_onnx_inference(onnx_path, input_ids, attention_mask)
    latency_onnx = time.time() - start_time_onnx
    throughput_onnx = len(sample_batch) / latency_onnx

    print(f"PyTorch Inference - Latency: {latency_torch:.4f}s, Throughput: {throughput_torch:.2f} samples/s")
    print(f"ONNX Inference - Latency: {latency_onnx:.4f}s, Throughput: {throughput_onnx:.2f} samples/s")

    torch_preds = torch.argmax(torch_outputs, dim=1).tolist()
    onnx_preds = torch.argmax(onnx_outputs, dim=1).tolist()
    actual_labels = labels.tolist()

    # Compare predictions
    torch_report = classification_report(actual_labels, torch_preds, target_names=label_encoder.classes_, output_dict=True)
    onnx_report = classification_report(actual_labels, onnx_preds, target_names=label_encoder.classes_, output_dict=True)

    torch_df = pd.DataFrame(torch_report).transpose()
    onnx_df = pd.DataFrame(onnx_report).transpose()

    torch_df.to_csv(os.path.join(model_dir, "torch_classification_report.csv"))
    onnx_df.to_csv(os.path.join(model_dir, "onnx_classification_report.csv"))

    # Log metrics to TensorBoard
    writer.add_scalar("Latency/PyTorch", latency_torch)
    writer.add_scalar("Throughput/PyTorch", throughput_torch)
    writer.add_scalar("Latency/ONNX", latency_onnx)
    writer.add_scalar("Throughput/ONNX", throughput_onnx)

    writer.flush()
    writer.close()

    return torch_df, onnx_df


In [86]:
# def compute_class_weights(labels, num_classes):
#     counter = Counter(labels)
#     total_samples = len(labels)
#     weights = [total_samples / (num_classes * counter[i]) for i in range(num_classes)]
#     return torch.tensor(weights, dtype=torch.float)

In [87]:
# # Verify class weights
# def compute_class_weights(labels, num_classes):
#     class_counts = np.bincount(labels)
#     total_samples = len(labels)
#     weights = total_samples / (num_classes * class_counts)
#     return torch.tensor(weights, dtype=torch.float)


In [88]:
# Main Training Script
MODEL_NAME = "google/flan-t5-base"
model_alias = 'flan-t5-intent'
update_model_dict(model_alias, MODEL_NAME)

In [89]:

print(f"device: {get_device()}")

Using Apple MPS GPU
device: mps


In [90]:
df, label_encoder = load_and_preprocess_data()
balanced_df = balance_dataset(df)
balanced_df['conversation'] = balanced_df['conversation'].apply(preprocess_conversation)

Original distribution:
 issue_area
Cancellations and returns    286
Order                        270
Login and Account            151
Shopping                     116
Warranty                     105
Shipping                      72
Name: count, dtype: int64


In [91]:
# Tokenization and DataLoaders
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

In [92]:
# Create DataLoaders
train_loader, test_loader, test_df = create_dataloaders(balanced_df, tokenizer)

In [93]:
# Model Initialization and Training
num_classes = len(label_encoder.classes_)
model = FlanT5WithLoRA(num_labels=num_classes)


In [94]:
# Check class distribution
class_distribution = Counter(balanced_df['labels'])
print("Class Distribution:", class_distribution)

Class Distribution: Counter({0: 100, 5: 100, 3: 100, 1: 100, 2: 100, 4: 100})


In [95]:
class_weights = compute_class_weights(balanced_df['labels'], num_classes)
print("Class Weights:", class_weights)

Class Weights: tensor([1., 1., 1., 1., 1., 1.])


In [96]:
# Torch and nn: Core PyTorch library and neural network module.
# autocast: For mixed-precision training (automatically scales floating-point operations to reduce memory usage and increase speed).
# GradScaler: Scales the gradient values during mixed precision to prevent underflow.
# time: For measuring latency and time metrics.

In [97]:
# Clear cache and sync before training
torch.mps.empty_cache()
torch.mps.synchronize()


In [103]:


#Suppress warnings
warnings.filterwarnings("ignore")

# Device setup
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Environment variables
# The model uses Apple's Metal Performance Shaders (MPS) on your M4 Pro CPU if available, otherwise it falls back to CPU.
# The environment variable PYTORCH_MPS_HIGH_WATERMARK_RATIO is set to 0.0, which disables the high watermark limit to avoid unnecessary 
# memory consumption, helping prevent OOM (Out of Memory) errors.

os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'


# Batch size: Set to 1 to avoid memory issues on your MPS device.
# Gradient accumulation steps: Combines gradients over 4 batches before performing a backward pass to simulate a larger batch size, improving stability.
# Epochs: The model will train for 5 iterations over the entire dataset.
# Learning rate: Set to 1e-4 (a common value for fine-tuning models).
# Max gradient norm: Uses gradient clipping at 1.0 to avoid exploding gradients.
# Max length: Sequence length limited to 256 tokens.
# Logits store: A list to collect model predictions for later analysis.


# Hyperparameters
batch_size = 1  # Reduced batch size
gradient_accumulation_steps = 4
epochs = 3
learning_rate = 1e-4
max_norm = 1.0  # Gradient clipping
max_length = 256  # Reduced sequence length
logits_store = []

# Custom forward method: Overrides the FlanT5WithLoRA model's forward pass to disable hidden states output, reducing memory usage.
# This improves efficiency by only returning the logits and outputs, skipping the intermediate hidden states, which saves VRAM.

# Disable decoder hidden states
def forward_no_hidden_states(self, input_ids, attention_mask, labels):
    outputs = self.t5(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels,
        output_hidden_states=False  # No hidden states -> Less memory usage
    )
    logits = outputs.logits
    return logits, outputs

# Replace the original forward with the new one
FlanT5WithLoRA.forward = forward_no_hidden_states


# AdamW optimizer: Used for stable and efficient optimization with weight decay (0.01) to prevent overfitting.
# Cross-entropy loss: Used as the loss function since the task is likely a classification problem.

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()

# Ensures that all model parameters require gradient calculation, allowing them to be updated during backpropagation.

# Ensure all parameters require gradients
for param in model.parameters():
    param.requires_grad = True


# GradScaler: Scales gradients to avoid numerical instability during mixed-precision training.

# Mixed precision setup
scaler = GradScaler()

# Calculates accuracy, precision, recall, F1-score, and perplexity for the model's predictions.
# Perplexity: Measures how "surprised" the model is by the predictions (lower is better).
# Avoids divide-by-zero errors using 1e-12 in the log calculation.

# Metrics tracking
def calculate_metrics(logits, labels):
    """Calculate accuracy, precision, recall, F1, and perplexity."""
    preds = torch.argmax(logits, dim=-1).detach().cpu().numpy()
    labels = labels.detach().cpu().numpy()

    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted', zero_division=0)
    recall = recall_score(labels, preds, average='weighted', zero_division=0)
    f1 = f1_score(labels, preds, average='weighted')

    # Calculate perplexity (avoiding divide by zero)
    logits = torch.softmax(logits, dim=-1).detach().cpu().numpy()
    entropy = -np.sum(logits * np.log(logits + 1e-12), axis=1)
    perplexity = np.exp(entropy).mean()

    return accuracy, precision, recall, f1, perplexity

# Iterates over the dataset for the defined number of epochs.
# Tracks:
# Running loss
# Metrics: Accuracy, precision, recall, F1, and perplexity
# Batch latencies


# Training loop
model.train()
for epoch in range(epochs):
    start_time = time.time()

    running_loss = 0.0
    total_accuracy = 0.0
    total_precision = 0.0
    total_recall = 0.0
    total_f1 = 0.0
    total_perplexity = 0.0

    batch_latencies = []

    print(f"\nEpoch {epoch + 1}/{epochs}")

    for step, batch in enumerate(train_loader):
        batch_start_time = time.time()

        optimizer.zero_grad()

        # Ensure data is on the correct device
        batch['input_ids'] = batch['input_ids'].to(device)
        batch['attention_mask'] = batch['attention_mask'].to(device)
        batch['labels'] = batch['labels'].to(device)

        # Uses autocast() for mixed-precision training, reducing memory consumption and increasing performance.
        # Accumulates gradients over multiple steps to simulate larger batch sizes.

        
        # Mixed precision with autocast
        with autocast():
            logits, outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels']
            )

            logits = logits.view(-1, logits.size(-1))
            labels = batch["labels"].view(-1)

            loss = criterion(logits, labels) / gradient_accumulation_steps

        # Backward pass with mixed precision
        scaler.scale(loss).backward(retain_graph=True)

        # Gradient accumulation: Only performs backward pass and optimizer step after multiple batches.
        # Gradient clipping: Ensures stability by limiting gradients to max_norm (1.0).
        
        # Gradient accumulation
        if (step + 1) % gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        # Calculate batch metrics
        accuracy, precision, recall, f1, perplexity = calculate_metrics(logits, labels)

        # Store sample logits
        if step % 10 == 0:  # Store every 10 batches
            logits_store.append(logits[:5].detach().cpu().numpy())

        # Clear MPS cache
        torch.mps.empty_cache()
        torch.mps.synchronize()

        # Batch latency
        batch_latency = time.time() - batch_start_time
        batch_latencies.append(batch_latency)

        # Collects batch metrics and aggregates them for epoch-level evaluation.
        # Aggregate metrics
        running_loss += loss.item() * gradient_accumulation_steps
        total_accuracy += accuracy
        total_precision += precision
        total_recall += recall
        total_f1 += f1
        total_perplexity += perplexity


        # Print batch-level metrics
        if step % 20 == 0:
            print(f"Step {step + 1}/{len(train_loader)} | Loss: {loss.item():.4f} | "
                  f"Accuracy: {accuracy:.4f} | Recall: {recall:.4f} | Precision: {precision:.4f} | "
                  f"F1 Score: {f1:.4f} | Perplexity: {perplexity:.3f} | Latency: {batch_latency:.3f} sec")

    # Epoch-level metrics
    epoch_latency = time.time() - start_time
    avg_loss = running_loss / len(train_loader)
    avg_accuracy = total_accuracy / len(train_loader)
    avg_precision = total_precision / len(train_loader)
    avg_recall = total_recall / len(train_loader)
    avg_f1 = total_f1 / len(train_loader)
    avg_perplexity = total_perplexity / len(train_loader)
    avg_batch_latency = np.mean(batch_latencies)

    print(f"Epoch Summary | Average Loss: {avg_loss:.4f} | Accuracy: {avg_accuracy:.4f} | "
          f"Recall: {avg_recall:.4f} | Precision: {avg_precision:.4f} | "
          f"F1 Score: {avg_f1:.4f} | Perplexity: {avg_perplexity:.4f} | "
          f"Average Batch Latency: {avg_batch_latency:.4f} sec | Epoch Latency: {epoch_latency:.2f}sec")




# Store logits for further evaluation
np.save("logits_store.npy", np.array(logits_store))
print("\nTraining Complete! 🎉")


Using device: mps

Epoch 1/3
Step 1/57 | Loss: 0.0346 | Accuracy: 0.9500 | Recall: 0.9500 | Precision: 0.9500 | F1 Score: 0.9485 | Perplexity: 1.627 | Latency: 13.318 sec
Step 21/57 | Loss: 0.0459 | Accuracy: 0.9500 | Recall: 0.9500 | Precision: 0.9563 | F1 Score: 0.9508 | Perplexity: 1.802 | Latency: 1.551 sec
Step 41/57 | Loss: 0.0096 | Accuracy: 1.0000 | Recall: 1.0000 | Precision: 1.0000 | F1 Score: 1.0000 | Perplexity: 1.274 | Latency: 1.211 sec
Epoch Summary | Average Loss: 0.1143 | Accuracy: 0.9774 | Recall: 0.9774 | Precision: 0.9812 | F1 Score: 0.9770 | Perplexity: 1.6156 | Average Batch Latency: 2.3686 sec | Epoch Latency: 135.89sec

Epoch 2/3
Step 1/57 | Loss: 0.0294 | Accuracy: 0.9750 | Recall: 0.9750 | Precision: 0.9917 | F1 Score: 0.9750 | Perplexity: 1.295 | Latency: 0.778 sec
Step 21/57 | Loss: 0.0452 | Accuracy: 0.9500 | Recall: 0.9500 | Precision: 0.9625 | F1 Score: 0.9542 | Perplexity: 1.199 | Latency: 0.828 sec
Step 41/57 | Loss: 0.0207 | Accuracy: 0.9500 | Recall: 

In [104]:
# Model Evaluation
evaluate_model(
    model,
    test_loader,
    tokenizer,
    label_encoder,
    model_alias
)

Using Apple MPS GPU

Classification Report:
                            precision    recall  f1-score   support

Cancellations and returns       0.63      0.95      0.76        20
        Login and Account       1.00      0.96      0.98        28
                    Order       0.95      0.60      0.73        30
                 Shipping       0.85      0.96      0.90        23
                 Shopping       0.95      0.91      0.93        22
                 Warranty       1.00      1.00      1.00        27

                 accuracy                           0.89       150
                macro avg       0.90      0.90      0.88       150
             weighted avg       0.91      0.89      0.89       150


Per-class Metrics:
                     Class  Precision   Recall  F1-Score  Support
Cancellations and returns   0.633333 0.950000  0.760000       20
        Login and Account   1.000000 0.964286  0.981818       28
                    Order   0.947368 0.600000  0.734694       30
 

(                       Class  Precision    Recall  F1-Score  Support
 0  Cancellations and returns   0.633333  0.950000  0.760000       20
 1          Login and Account   1.000000  0.964286  0.981818       28
 2                      Order   0.947368  0.600000  0.734694       30
 3                   Shipping   0.846154  0.956522  0.897959       23
 4                   Shopping   0.952381  0.909091  0.930233       22
 5                   Warranty   1.000000  1.000000  1.000000       27,
                            Cancellations and returns  Login and Account  \
 Cancellations and returns                         19                  0   
 Login and Account                                  0                 27   
 Order                                             10                  0   
 Shipping                                           1                  0   
 Shopping                                           0                  0   
 Warranty                                           0

In [None]:
# train_model(model,train_loader, tokenizer,model_alias=model_alias, epochs=1, learning_rate=2e-5)

In [None]:
compare_inference_performance(model, tokenizer, test_df, label_encoder, model_alias=model_alias)

In [None]:
tokenizer.save_pretrained(f"model-metric/{model_alias}/tokenizer/")