In [1]:
# !pip install safetensors \
#     nvidia-pyindex nvidia-tensorrt \
#     tritonclient[http]

In [2]:
# !pip install onnx

In [2]:
# --- Common Utilities and Setup ---
import os
import json
import torch
import transformers
import accelerate
import huggingface_hub
import peft
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.preprocessing import LabelEncoder
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support, classification_report, accuracy_score
from sklearn.utils import resample
from collections import Counter
import time
import onnxruntime as ort
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List
import pickle

print("peft:", peft.__version__)
print("Torch:", torch.__version__)
print("Transformers:", transformers.__version__)
print("Accelerate:", accelerate.__version__)
print("Huggingface Hub:", huggingface_hub.__version__)

# Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

peft: 0.14.0
Torch: 2.2.2
Transformers: 4.49.0
Accelerate: 1.4.0
Huggingface Hub: 0.29.1


In [3]:
def update_model_dict(model_alias, MODEL_NAME):
    if not os.path.exists('model_dict.json'):
        model_dict = {}
    else:
        with open('model_dict.json', 'r') as file:
            model_dict = json.load(file)

    model_dict[model_alias] = MODEL_NAME

    with open('model_dict.json', 'w') as file:
        json.dump(model_dict, file)

In [5]:
def load_and_preprocess_data(filepath="./data/train-00000-of-00001-a5a7c6e4bb30b016.parquet"):
    """Loads and preprocesses the dataset."""
    df = pd.read_parquet(filepath)
    df = df[['conversation', 'issue_area']]
    print("Original distribution:\n", df['issue_area'].value_counts())
    label_encoder = LabelEncoder()
    df["labels"] = label_encoder.fit_transform(df["issue_area"])

    #saving Label-encoder
    label_encoder_path = f"model-metric/{model_alias}/label_encoder.pkl"
    os.makedirs(os.path.dirname(label_encoder_path), exist_ok=True)
    with open(label_encoder_path, "wb") as f:
        pickle.dump(label_encoder, f)
        
    return df, label_encoder

In [6]:
def balance_dataset(df, max_count=100, random_state=42):
    """Balances the dataset using oversampling."""
    balanced_df = pd.DataFrame()
    for issue in df['issue_area'].unique():
        subset = df[df['issue_area'] == issue]
        balanced_subset = resample(subset, replace=True, n_samples=max_count, random_state=random_state)
        balanced_df = pd.concat([balanced_df, balanced_subset])
    return balanced_df.sample(frac=1, random_state=random_state).reset_index(drop=True)


In [7]:
def preprocess_conversation(conversation):
    """Preprocesses a conversation."""
    if isinstance(conversation, list):
        return " ".join([turn.get('text', '') for turn in conversation if isinstance(turn, dict)])
    return str(conversation) #.lower()

In [8]:
# Define PyTorch Dataset
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        row = self.data.iloc[index]
        inputs = self.tokenizer(
            row["conversation"],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        input_ids = inputs["input_ids"].squeeze(0)
        attention_mask = inputs["attention_mask"].squeeze(0)
        label = torch.tensor(row["labels"], dtype=torch.long)
        return input_ids, attention_mask, label

In [9]:
def create_dataloaders(df, tokenizer, batch_size=8, train_ratio=0.75):
    """Creates train and test DataLoaders."""
    train_size = int(train_ratio * len(df))
    train_df, test_df = df[:train_size], df[train_size:]
    train_dataset = CustomDataset(train_df, tokenizer)
    test_dataset = CustomDataset(test_df, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, test_loader, test_df

In [10]:
class DistilBERTCased(nn.Module):
    def __init__(self, num_labels, lora_r=8, lora_alpha=16, lora_dropout=0.1):
        super(DistilBERTCased, self).__init__()
        # Load the base model with the correct number of labels
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            "distilbert/distilbert-base-cased",
            num_labels=num_labels  # Ensure this matches the number of classes
        )
        
        # # LoRA Configuration
        # lora_config = LoraConfig(
        #     task_type=TaskType.SEQ_CLS,
        #     r=lora_r,
        #     lora_alpha=lora_alpha,
        #     lora_dropout=lora_dropout,
        #     target_modules=["q_lin", "k_lin", "v_lin"]
        # )
        # self.bert = get_peft_model(self.bert, lora_config)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.logits  # Return the logits directly

In [11]:
# Function to compute class weights
def compute_class_weights(labels, num_classes):
    counter = Counter(labels)
    total_samples = len(labels)
    weights = [total_samples / (num_classes * counter[i]) for i in range(num_classes)]
    return torch.tensor(weights, dtype=torch.float)

In [12]:
def train_model(model, train_loader, model_alias, epochs=3, learning_rate=5e-5, class_weights=None):
    """Trains the model and saves logs, metrics, and model weights."""
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()

    # Create directory for storing model metrics
    model_dir = f"model-metric/{model_alias}"
    os.makedirs(model_dir, exist_ok=True)

    # TensorBoard writer in the model directory
    writer = SummaryWriter(log_dir=model_dir)

    # Set up loss function and optimizer
    criterion = torch.nn.CrossEntropyLoss(weight=class_weights.to(device) if class_weights is not None else None)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    epoch_losses = []
    metrics_data = []

    for epoch in range(epochs):
        start_time = time.time()
        total_loss = 0
        all_preds, all_labels = [], []

        for batch_idx, batch in enumerate(train_loader):
            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1).cpu().tolist()
            labels = labels.cpu().tolist()
            all_preds.extend(preds)
            all_labels.extend(labels)

            # Log batch loss every 10 batches
            if batch_idx % 10 == 0:
                writer.add_scalar("BatchLoss/train", loss.item(), epoch * len(train_loader) + batch_idx)

        # Compute epoch metrics
        avg_loss = total_loss / len(train_loader)
        epoch_losses.append(avg_loss)
        accuracy = accuracy_score(all_labels, all_preds)
        precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
        epoch_time = time.time() - start_time

        # Store metrics for CSV logging
        metrics_data.append([epoch + 1, avg_loss, accuracy, precision, recall, f1, epoch_time])

        # Print metrics
        print(f"Epoch {epoch+1}: Loss={avg_loss:.4f}, Accuracy={accuracy:.4f}, Precision={precision:.4f}, Recall={recall:.4f}, F1-score={f1:.4f}, Time={epoch_time:.2f}s")

        # Log metrics to TensorBoard
        writer.add_scalar("Loss/train", avg_loss, epoch)
        writer.add_scalar("Accuracy/train", accuracy, epoch)
        writer.add_scalar("Precision/train", precision, epoch)
        writer.add_scalar("Recall/train", recall, epoch)
        writer.add_scalar("F1-score/train", f1, epoch)
        writer.add_scalar("Time/Epoch", epoch_time, epoch)

    # Save model KPIs as CSV
    metrics_df = pd.DataFrame(metrics_data, columns=["Epoch", "Loss", "Accuracy", "Precision", "Recall", "F1-score", "Time (s)"])
    metrics_df.to_csv(os.path.join(model_dir, "training_metrics.csv"), index=False)

    # Save training loss curve
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, epochs + 1), epoch_losses, marker='o', linestyle='-', color='b')
    plt.title('Training Loss Over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.grid(True)
    loss_plot_path = os.path.join(model_dir, "training_loss.png")
    plt.savefig(loss_plot_path)
    writer.add_figure("Training Loss", plt.gcf(), close=True)

    # Save model weights
    model_path = os.path.join(model_dir, f"{model_alias}.pth")
    torch.save(model.state_dict(), model_path)

    writer.flush()
    writer.close()


In [13]:
import os
import time
import torch
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

def evaluate_model(model, test_loader, label_encoder, model_alias):
    """Evaluates the model and saves metrics, logs, and confusion matrix."""

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    # Create directory for storing model metrics
    model_dir = f"model-metric/{model_alias}"
    os.makedirs(model_dir, exist_ok=True)

    # Initialize TensorBoard writer
    writer = SummaryWriter(log_dir=model_dir)

    all_preds, all_labels = [], []
    start_time = time.time()

    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs, dim=1).cpu().tolist()
            labels = labels.cpu().tolist()
            all_preds.extend(preds)
            all_labels.extend(labels)

    eval_time = time.time() - start_time
    class_names = label_encoder.classes_

    # Compute metrics
    precision, recall, f1, support = precision_recall_fscore_support(all_labels, all_preds, average=None)
    class_metrics = pd.DataFrame({
        'Class': class_names,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Support': support
    })

    overall_precision, overall_recall, overall_f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')

    # Print and save classification report
    print("\nClassification Report:\n", classification_report(all_labels, all_preds, target_names=class_names))

    # Generate confusion matrix
    cm = confusion_matrix(all_labels, all_preds)
    cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)

    # Plot confusion matrix
    plt.figure(figsize=(12, 10))
    sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.tight_layout()

    # Save confusion matrix plot
    confusion_matrix_path = os.path.join(model_dir, "confusion_matrix.png")
    plt.savefig(confusion_matrix_path)
    writer.add_figure("Confusion Matrix", plt.gcf(), close=True)

    # Print overall metrics
    print("\nPer-class Metrics:\n", class_metrics.to_string(index=False))
    print(f"\nOverall Metrics:\nPrecision: {overall_precision:.4f}, Recall: {overall_recall:.4f}, F1-score: {overall_f1:.4f}, Eval Time: {eval_time:.2f}s")

    # Log metrics to TensorBoard
    writer.add_scalar("Precision/test", overall_precision)
    writer.add_scalar("Recall/test", overall_recall)
    writer.add_scalar("F1-score/test", overall_f1)
    writer.add_scalar("Evaluation Time", eval_time)

    # Log per-class metrics
    for i, class_name in enumerate(class_names):
        writer.add_scalar(f"Precision/{class_name}", precision[i])
        writer.add_scalar(f"Recall/{class_name}", recall[i])
        writer.add_scalar(f"F1-score/{class_name}", f1[i])

    writer.flush()
    writer.close()

    # Save evaluation metrics
    class_metrics.to_csv(os.path.join(model_dir, "class_metrics.csv"), index=False)
    cm_df.to_csv(os.path.join(model_dir, "confusion_matrix.csv"))

    return class_metrics, cm_df


In [14]:
import os
import time
import torch
import numpy as np
import onnxruntime as ort
import pandas as pd
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report
from torch.utils.tensorboard import SummaryWriter

def export_to_onnx(model, tokenizer, model_alias):
    """Exports the model to ONNX format."""
    model.eval().to("cpu")
    sample_input = tokenizer("test", return_tensors="pt")
    input_names = ["input_ids", "attention_mask"]
    output_names = ["output"]
    
    model_dir = f"model-metric/{model_alias}"
    os.makedirs(model_dir, exist_ok=True)
    onnx_path = os.path.join(model_dir, f"{model_alias}.onnx")
    
    torch.onnx.export(
        model, 
        (sample_input["input_ids"], sample_input["attention_mask"]), 
        onnx_path, 
        input_names=input_names, 
        output_names=output_names, 
        dynamic_axes={
            "input_ids": {0: "batch", 1: "sequence"}, 
            "attention_mask": {0: "batch", 1: "sequence"}, 
            "output": {0: "batch"}
        }
    )
    print(f"ONNX model exported to {onnx_path}")
    return onnx_path

def run_onnx_inference(onnx_path, input_ids, attention_mask):
    """Runs inference using ONNX Runtime."""
    ort_session = ort.InferenceSession(onnx_path)
    ort_inputs = {"input_ids": input_ids.tolist(), "attention_mask": attention_mask.tolist()}
    ort_outputs = ort_session.run(None, ort_inputs)
    return torch.tensor(np.array(ort_outputs[0]), dtype=torch.float32)

def compare_inference_performance(model, tokenizer, test_df, label_encoder, model_alias):
    """Compares inference performance between PyTorch and ONNX Runtime."""
    model_dir = f"model-metric/{model_alias}"
    os.makedirs(model_dir, exist_ok=True)
    writer = SummaryWriter(log_dir=model_dir)
    
    sample_batch = test_df.sample(50)
    test_dataset_batch = CustomDataset(sample_batch, tokenizer)
    test_loader_batch = DataLoader(test_dataset_batch, batch_size=len(sample_batch), shuffle=False)
    batch = next(iter(test_loader_batch))
    input_ids, attention_mask, labels = batch
    
    # PyTorch Inference
    model.eval().to(device)
    start_time_torch = time.time()
    with torch.no_grad():
        torch_outputs = model(input_ids.to(device), attention_mask.to(device)).cpu()
    latency_torch = time.time() - start_time_torch
    throughput_torch = len(sample_batch) / latency_torch
    
    # ONNX Inference
    onnx_path = export_to_onnx(model, tokenizer, model_alias)
    start_time_onnx = time.time()
    onnx_outputs = run_onnx_inference(onnx_path, input_ids, attention_mask)
    latency_onnx = time.time() - start_time_onnx
    throughput_onnx = len(sample_batch) / latency_onnx
    
    print(f"PyTorch Inference - Latency: {latency_torch:.4f}s, Throughput: {throughput_torch:.2f} samples/s")
    print(f"ONNX Inference - Latency: {latency_onnx:.4f}s, Throughput: {throughput_onnx:.2f} samples/s")
    
    torch_preds = torch.argmax(torch_outputs, dim=1).tolist()
    onnx_preds = torch.argmax(onnx_outputs, dim=1).tolist()
    actual_labels = labels.tolist()
    
    # Compare predictions
    torch_report = classification_report(actual_labels, torch_preds, target_names=label_encoder.classes_, output_dict=True)
    onnx_report = classification_report(actual_labels, onnx_preds, target_names=label_encoder.classes_, output_dict=True)
    
    torch_df = pd.DataFrame(torch_report).transpose()
    onnx_df = pd.DataFrame(onnx_report).transpose()
    
    torch_df.to_csv(os.path.join(model_dir, "torch_classification_report.csv"))
    onnx_df.to_csv(os.path.join(model_dir, "onnx_classification_report.csv"))
    
    # Log metrics to TensorBoard
    writer.add_scalar("Latency/PyTorch", latency_torch)
    writer.add_scalar("Throughput/PyTorch", throughput_torch)
    writer.add_scalar("Latency/ONNX", latency_onnx)
    writer.add_scalar("Throughput/ONNX", throughput_onnx)
    
    writer.flush()
    writer.close()
    
    return torch_df, onnx_df


In [15]:
def compute_class_weights(labels, num_classes):
    counter = Counter(labels)
    total_samples = len(labels)
    weights = [total_samples / (num_classes * counter[i]) for i in range(num_classes)]
    return torch.tensor(weights, dtype=torch.float)

In [16]:
MODEL_NAME = "distilbert/distilbert-base-cased"
model_alias = 'distilbert-cased-benchmark'
update_model_dict(model_alias, MODEL_NAME)

In [17]:
df, label_encoder = load_and_preprocess_data()
balanced_df = balance_dataset(df)
balanced_df['conversation'] = balanced_df['conversation'].apply(preprocess_conversation)

Original distribution:
 issue_area
Cancellations and returns    286
Order                        270
Login and Account            151
Shopping                     116
Warranty                     105
Shipping                      72
Name: count, dtype: int64


In [18]:
    # Tokenization and DataLoaders
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-cased")
train_loader, test_loader, test_df = create_dataloaders(balanced_df, tokenizer)

In [19]:
# Model Initialization and Training
num_classes = len(label_encoder.classes_)
model = DistilBERTCased(num_labels=num_classes)
class_weights = compute_class_weights(balanced_df['labels'], num_classes)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
train_model(model, train_loader,model_alias=model_alias, epochs=10, learning_rate=5e-5, class_weights=class_weights)

Epoch 1: Loss=1.3835, Accuracy=0.4556, Precision=0.4701, Recall=0.4556, F1-score=0.4471, Time=142.51s
Epoch 2: Loss=0.5706, Accuracy=0.8156, Precision=0.8077, Recall=0.8156, F1-score=0.8068, Time=146.39s
Epoch 3: Loss=0.2493, Accuracy=0.9200, Precision=0.9197, Recall=0.9200, F1-score=0.9195, Time=153.42s
Epoch 4: Loss=0.1277, Accuracy=0.9578, Precision=0.9576, Recall=0.9578, F1-score=0.9573, Time=141.53s
Epoch 5: Loss=0.1531, Accuracy=0.9533, Precision=0.9531, Recall=0.9533, F1-score=0.9524, Time=170.71s
Epoch 6: Loss=0.0725, Accuracy=0.9800, Precision=0.9802, Recall=0.9800, F1-score=0.9800, Time=158.56s
Epoch 7: Loss=0.0684, Accuracy=0.9844, Precision=0.9847, Recall=0.9844, F1-score=0.9845, Time=160.94s
Epoch 8: Loss=0.0134, Accuracy=1.0000, Precision=1.0000, Recall=1.0000, F1-score=1.0000, Time=164.07s
Epoch 9: Loss=0.0095, Accuracy=0.9978, Precision=0.9978, Recall=0.9978, F1-score=0.9978, Time=161.08s
Epoch 10: Loss=0.0963, Accuracy=0.9689, Precision=0.9689, Recall=0.9689, F1-score=

In [21]:
# Model Evaluation
evaluate_model(model, test_loader, label_encoder, model_alias)


Classification Report:
                            precision    recall  f1-score   support

Cancellations and returns       0.80      0.80      0.80        20
        Login and Account       1.00      1.00      1.00        28
                    Order       0.81      0.87      0.84        30
                 Shipping       0.92      1.00      0.96        23
                 Shopping       1.00      0.86      0.93        22
                 Warranty       1.00      0.96      0.98        27

                 accuracy                           0.92       150
                macro avg       0.92      0.92      0.92       150
             weighted avg       0.92      0.92      0.92       150


Per-class Metrics:
                     Class  Precision   Recall  F1-Score  Support
Cancellations and returns     0.8000 0.800000  0.800000       20
        Login and Account     1.0000 1.000000  1.000000       28
                    Order     0.8125 0.866667  0.838710       30
                 Ship

(                       Class  Precision    Recall  F1-Score  Support
 0  Cancellations and returns     0.8000  0.800000  0.800000       20
 1          Login and Account     1.0000  1.000000  1.000000       28
 2                      Order     0.8125  0.866667  0.838710       30
 3                   Shipping     0.9200  1.000000  0.958333       23
 4                   Shopping     1.0000  0.863636  0.926829       22
 5                   Warranty     1.0000  0.962963  0.981132       27,
                            Cancellations and returns  Login and Account  \
 Cancellations and returns                         16                  0   
 Login and Account                                  0                 28   
 Order                                              4                  0   
 Shipping                                           0                  0   
 Shopping                                           0                  0   
 Warranty                                           0

In [22]:
compare_inference_performance(model, tokenizer, test_df, label_encoder, model_alias=model_alias)

ONNX model exported to model-metric/distilbert-cased-benchmark/distilbert-cased-benchmark.onnx
PyTorch Inference - Latency: 3.3130s, Throughput: 15.09 samples/s
ONNX Inference - Latency: 5.7580s, Throughput: 8.68 samples/s


(                           precision    recall  f1-score  support
 Cancellations and returns      0.800  1.000000  0.888889     4.00
 Login and Account              1.000  1.000000  1.000000     9.00
 Order                          0.900  0.900000  0.900000    10.00
 Shipping                       1.000  1.000000  1.000000     7.00
 Shopping                       1.000  0.909091  0.952381    11.00
 Warranty                       1.000  1.000000  1.000000     9.00
 accuracy                       0.960  0.960000  0.960000     0.96
 macro avg                      0.950  0.968182  0.956878    50.00
 weighted avg                   0.964  0.960000  0.960635    50.00,
                            precision    recall  f1-score  support
 Cancellations and returns      0.800  1.000000  0.888889     4.00
 Login and Account              1.000  1.000000  1.000000     9.00
 Order                          0.900  0.900000  0.900000    10.00
 Shipping                       1.000  1.000000  1.000000    

In [23]:
tokenizer.save_pretrained(f"model-metric/{model_alias}/tokenizer/")

('model-metric/distilbert-cased-benchmark/tokenizer/tokenizer_config.json',
 'model-metric/distilbert-cased-benchmark/tokenizer/special_tokens_map.json',
 'model-metric/distilbert-cased-benchmark/tokenizer/vocab.txt',
 'model-metric/distilbert-cased-benchmark/tokenizer/added_tokens.json',
 'model-metric/distilbert-cased-benchmark/tokenizer/tokenizer.json')