# PyTorch model with 5 modes (for now)
*Step 1:* Collect and label data<br>
*Step 2:* Process and normalize the data<br>
*Step 3:* Convert text to numeric vectors<br>
*Step 4:* Train a lightweight classifier<br>
*Step 5:* Evaluate performance<br>
*Step 6:* Save the model and load it into the project<br>

### Modes: sql_mode, insight_mode, comparison_mode, visualization_mode, prediction_mode


In [442]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn
import torch.nn.functional as F
import json
from pathlib import Path
import os
from model import MLPClassifier
from datetime import datetime
import random
# Pin the seed of every RNG the notebook relies on (PyTorch, NumPy, Python's
# `random`) so that repeated runs start from the same random state.
for _seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    _seed_rng(42)

In [443]:
### Prepare the data and join into one df
# One CSV per mode; every file must provide a 'text' and a 'label' column.
comparison_df = pd.read_csv("training_data_modes/comparison_mode.csv")
insight_df = pd.read_csv("training_data_modes/insight_mode.csv")
prediction_df = pd.read_csv("training_data_modes/prediction_mode.csv")
sql_df = pd.read_csv("training_data_modes/sql_mode.csv")
visualization_df = pd.read_csv("training_data_modes/visualization_mode.csv")

# Mode name -> integer class id used as the training target.
label_to_int = {
  "sql_mode": 0,
  "insight_mode": 1,
  "comparison_mode": 2,
  "visualization_mode": 3,
  "prediction_mode": 4
}

# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own 0..k index.
df = pd.concat(
    [comparison_df, insight_df, prediction_df, sql_df, visualization_df],
    ignore_index=True,
)

# Encode the labels with a vectorised map. Assigning into the rows yielded by
# df.iterrows() is not guaranteed to write back to the frame (the rows may be
# copies), so the encoding must be applied to the column itself.
encoded_labels = df['label'].map(label_to_int)
unmapped = encoded_labels.isna()
if unmapped.any():
    # Same exception type a failed dict lookup would raise, but listing every
    # offending value at once.
    raise KeyError(
        f"Labels missing from label_to_int: {sorted(df.loc[unmapped, 'label'].astype(str).unique())}"
    )
df['label'] = encoded_labels.astype('int64')

# Take features/targets only after encoding so `y` is guaranteed to be integer.
X, y = df['text'], df['label']
df.head()

Unnamed: 0,text,label
0,Which period had better transactions?,2
1,Which period had better expenses?,2
2,How does conversion rate differ between regions?,2
3,How does production costs differ between regions?,2
4,Compare customers from this quarter to last qu...,2


In [444]:
#### Shuffle the data and split the train test.
# 80/20 split with a fixed random_state so the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Texts become NumPy arrays, labels become plain Python lists.
X_train, X_test = np.array(X_train), np.array(X_test)
y_train, y_test = y_train.tolist(), y_test.tolist()

In [445]:
### Get the embedding model to translate text to vector
# Bound to `embedding_model` rather than `model`: a later cell assigns the MLP
# classifier to `model`, and sharing the name means re-running this cell would
# silently replace the classifier with the sentence encoder.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode train/test text sets
# (1) Encode the training texts only once and cache the embeddings on disk.
TRAIN_EMBEDDINGS_PATH = "train_embeddings.npy"

X_train_embeddings = None
if os.path.exists(TRAIN_EMBEDDINGS_PATH):
    cached_embeddings = np.load(TRAIN_EMBEDDINGS_PATH)
    # Trust the cache only if it holds exactly one row per training text;
    # otherwise it was produced from a different dataset/split and would be
    # paired with the wrong labels.
    # NOTE(review): this catches size changes only. If the texts change while
    # the row count stays the same, delete the cache file manually.
    if len(cached_embeddings) == len(X_train):
        X_train_embeddings = cached_embeddings

if X_train_embeddings is None:
    X_train_embeddings = embedding_model.encode(X_train)
    np.save(TRAIN_EMBEDDINGS_PATH, X_train_embeddings)

# Test embeddings are not cached; they are recomputed on every run.
X_test_embeddings = embedding_model.encode(X_test)

# Convert embeddings to tensors
X_train_tensor = torch.tensor(X_train_embeddings, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_embeddings, dtype=torch.float32)

# Convert labels to tensors (important!) -- torch.long is the dtype that
# nn.CrossEntropyLoss expects for class-index targets.
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Create datasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create DataLoaders (only the training data is reshuffled each epoch)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [446]:
### Check if GPU is available
# Prefer the first CUDA device when PyTorch can see one; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [447]:
# Location of the classifier checkpoint (read here, written by the training cell).
MODEL_PATH = "models/mlp_mode_classifier.pt"

# Classifier with 384 inputs and 5 outputs (one per mode).
# NOTE(review): 384 presumably matches the width of the sentence embeddings -- confirm.
model = MLPClassifier(input_dim=384, hidden1=128, hidden2=64, output_dim=5)

# Warm-start from a previously saved checkpoint when one is present.
if Path(MODEL_PATH).exists():
    checkpoint = torch.load(MODEL_PATH)
    model.load_state_dict(checkpoint['model_state_dict'])

In [448]:

# "Balanced" per-class weights computed from the training labels and handed to
# the loss as a float32 tensor.
classes = np.unique(y_train)
weights = torch.tensor(
    compute_class_weight(class_weight="balanced", classes=classes, y=y_train),
    dtype=torch.float32,
)

criterion = nn.CrossEntropyLoss(weight=weights)
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.001,
    weight_decay=1e-4,
)
# Multiply the learning rate by 0.5 every 3 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5)

# When resuming, restore the optimizer state if the checkpoint carries one.
if os.path.exists(MODEL_PATH):
    checkpoint = torch.load(MODEL_PATH)
    if 'optimizer_state_dict' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [449]:
### Optimize the NN
# Train for NUM_EPOCHS epochs, score the model on `test_loader` after every
# epoch, and write a checkpoint to MODEL_PATH whenever the accuracy improves.
# NOTE(review): the same held-out set is used both to pick the best checkpoint
# and to report accuracy, so the reported number is optimistic; consider a
# separate validation split.
NUM_EPOCHS = 20

# Start from the accuracy recorded in an existing checkpoint (if any). With a
# baseline of 0 a resumed run would overwrite a better saved model after its
# very first epoch.
best_accuracy = 0
if os.path.exists(MODEL_PATH):
    best_accuracy = torch.load(MODEL_PATH).get('accuracy', 0)

# torch.save does not create missing parent directories.
os.makedirs(os.path.dirname(MODEL_PATH) or ".", exist_ok=True)

for epoch in range(NUM_EPOCHS):
    # ---- training pass ----
    model.train()
    total_loss = 0

    for embeddings, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(embeddings)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # The learning-rate schedule advances once per epoch.
    scheduler.step()

    # ---- evaluation pass (no gradients) ----
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for embeddings, labels in test_loader:
            outputs = model(embeddings)
            # Predicted class = index of the largest logit in each row.
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    # Mean of the per-batch training losses for this epoch.
    avg_loss = total_loss / len(train_loader)

    print(f"Epoch {epoch+1}/{NUM_EPOCHS} - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

    # Save best model (strict improvement only)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            # Plain Python float, so the checkpoint holds only tensors and
            # primitive types and can be read back in weights-only mode.
            'accuracy': float(accuracy)
        }, MODEL_PATH)

Epoch 1/20 - Loss: 1.5607, Accuracy: 0.6800
Epoch 2/20 - Loss: 1.2265, Accuracy: 0.8850
Epoch 3/20 - Loss: 0.6212, Accuracy: 0.9150
Epoch 4/20 - Loss: 0.3023, Accuracy: 0.9300
Epoch 5/20 - Loss: 0.2062, Accuracy: 0.9400
Epoch 6/20 - Loss: 0.1509, Accuracy: 0.9600
Epoch 7/20 - Loss: 0.1195, Accuracy: 0.9700
Epoch 8/20 - Loss: 0.1049, Accuracy: 0.9800
Epoch 9/20 - Loss: 0.0928, Accuracy: 0.9850
Epoch 10/20 - Loss: 0.0834, Accuracy: 0.9850
Epoch 11/20 - Loss: 0.0784, Accuracy: 0.9850
Epoch 12/20 - Loss: 0.0734, Accuracy: 0.9850
Epoch 13/20 - Loss: 0.0699, Accuracy: 0.9850
Epoch 14/20 - Loss: 0.0678, Accuracy: 0.9850
Epoch 15/20 - Loss: 0.0658, Accuracy: 0.9850
Epoch 16/20 - Loss: 0.0641, Accuracy: 0.9850
Epoch 17/20 - Loss: 0.0631, Accuracy: 0.9850
Epoch 18/20 - Loss: 0.0622, Accuracy: 0.9850
Epoch 19/20 - Loss: 0.0614, Accuracy: 0.9850
Epoch 20/20 - Loss: 0.0609, Accuracy: 0.9850


In [451]:
# Despite the .json suffix the file is used as a JSON Lines log: each run
# appends one JSON object followed by a newline.
LOG_PATH = "training_log.json"

# Metrics come from `accuracy`, `all_labels` and `all_preds` as left behind by
# the most recent evaluation pass (i.e. the last epoch, not necessarily the
# best checkpoint).
results = dict(
    timestamp=datetime.now().isoformat(),
    accuracy=float(accuracy),
    report=classification_report(all_labels, all_preds, output_dict=True, zero_division=0),
)

# Opened in append mode so earlier runs are preserved.
with open(LOG_PATH, "a") as f:
    json.dump(results, f)
    f.write("\n")


In [453]:
# Class index -> mode name. Derived as the inverse of `label_to_int` instead of
# being typed out a second time, so the exported map cannot drift from the
# encoding the model was trained with.
label_map = {index: name for name, index in label_to_int.items()}

# The target directory may not exist yet (e.g. on a fresh checkout).
os.makedirs("models", exist_ok=True)

# json.dump writes the integer keys as strings ("0", "1", ...).
with open("models/label_map.json", "w") as f:
    json.dump(label_map, f)