In [None]:
# ==========================================
# MILESTONE 2: EMAIL CATEGORIZATION ENGINE
# ==========================================
# Announce the start of the milestone: a 50-character rule above and below
# the title, with a leading blank line to separate it from earlier output.
print(
    "\n" + "="*50,
    "STARTING MILESTONE 2: MODEL TRAINING & EVALUATION",
    "="*50,
    sep="\n",
)

# Necessary Imports
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
import random

# Display names for the classification reports. They must be listed in the
# same order as the integer label ids (0, 1, 2, 3); the encoding of
# y_train / y_test is defined outside this section -- verify it matches.
target_names = ['spam', 'complaint', 'request', 'feedback']

# -------------------------------------------------------
# TASK 1: TRAIN BASELINE CLASSIFIERS
# -------------------------------------------------------
# Each baseline is fitted on (x_train, y_train) and scored on (x_test, y_test).

# --- A. Logistic Regression ---
print("\n--- 1. Training Logistic Regression ---")
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(x_train, y_train)
lr_pred = lr_model.predict(x_test)

lr_accuracy = accuracy_score(y_test, lr_pred)
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")
print("Classification Report (LR):")
lr_report = classification_report(y_test, lr_pred, target_names=target_names)
print(lr_report)

# --- B. Naive Bayes ---
print("\n--- 2. Training Naive Bayes ---")
nb_model = MultinomialNB()
nb_model.fit(x_train, y_train)
nb_pred = nb_model.predict(x_test)

nb_accuracy = accuracy_score(y_test, nb_pred)
print(f"Naive Bayes Accuracy: {nb_accuracy:.4f}")
print("Classification Report (NB):")
nb_report = classification_report(y_test, nb_pred, target_names=target_names)
print(nb_report)


# -------------------------------------------------------
# TASK 2: FINE-TUNE TRANSFORMER MODEL (DistilBERT)
# -------------------------------------------------------
print("\n--- 3. Fine-tuning DistilBERT (Transformer) ---")

# 1. Prepare Data for BERT
# Category name -> integer class id used as the training label.
label_map = {'spam': 0, 'complaint': 1, 'request': 2, 'feedback': 3}

bert_texts = df_final['Cleaned_Text'].tolist()

# Series.map() yields NaN for every category that is not a key of label_map,
# which would silently turn the labels into floats instead of class ids.
# Fail fast and name the offending values rather than training on bad labels.
mapped_labels = df_final['category'].map(label_map)
unmapped_mask = mapped_labels.isna().to_numpy()
if unmapped_mask.any():
    unknown_categories = sorted(
        {str(value) for value in df_final['category'].to_numpy()[unmapped_mask]}
    )
    raise ValueError(
        "df_final['category'] contains values missing from label_map: "
        f"{unknown_categories}"
    )
bert_labels = mapped_labels.astype(int).tolist()

# Hold out 20% of the samples for validation; random_state makes the
# shuffle reproducible between runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    bert_texts, bert_labels, test_size=0.2, random_state=3
)

# 2. Tokenization
print("Loading Tokenizer...")
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Sequences longer than 128 tokens are truncated; shorter ones are padded so
# every encoded sequence within a split has the same length.
print("Tokenizing data...")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

# 3. Create Dataset Class
class EmailDataset(torch.utils.data.Dataset):
    """Pair tokenizer encodings with integer labels as a torch Dataset.

    Args:
        encodings: Mapping from field name to a per-sample sequence; each
            value is indexed by sample position.
        labels: Sequence of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field plus a ``"labels"``
        entry carrying the sample's label.
        """
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits so the Trainer can index them sample by sample.
train_dataset = EmailDataset(train_encodings, train_labels)
val_dataset = EmailDataset(val_encodings, val_labels)

# 4. Load Model
# The size of the classification head and the id <-> name tables are derived
# from label_map so they cannot drift from the labels the model is trained on,
# and so the model config reports category names instead of LABEL_0..LABEL_3.
print("Loading Model...")
model_bert = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_map),
    id2label={class_id: name for name, class_id in label_map.items()},
    label2id=dict(label_map),
)

# 5. Training Setup
# Two epochs, evaluation at the end of each epoch, no checkpoints written.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="no"
)

trainer = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# 6. Train
print("Starting Training (Fine-tuning)...")
trainer.train()

# 7. Evaluate DistilBERT (Detailed Report)
print("\n--- DistilBERT Evaluation ---")

# Predict on the validation set
predictions = trainer.predict(val_dataset)
# Convert logits to class IDs (index of the largest logit per sample)
preds = np.argmax(predictions.predictions, axis=-1)

# Pass the full list of class ids explicitly. Without `labels`,
# classification_report infers the classes from the data and raises when a
# class is missing from both val_labels and preds, because the inferred
# class count no longer matches len(target_names).
report_labels = list(range(len(target_names)))

# Print the full Classification Report
print(f"DistilBERT Accuracy: {accuracy_score(val_labels, preds):.4f}")
print("Classification Report (DistilBERT):")
print(
    classification_report(
        val_labels, preds, labels=report_labels, target_names=target_names
    )
)


# -------------------------------------------------------
# TASK 3: PREDICTION TEST (REAL DATA)
# -------------------------------------------------------
# Spot-check: classify one randomly chosen row of df_final with the
# fine-tuned model and compare the result with the row's stored category.
# NOTE(review): the row is drawn from all of df_final, so it may be a sample
# the model was trained on -- confirm whether that is intended.
print("\n" + "="*50, "PREDICTION TEST ON RANDOM DATASET SAMPLE", "="*50, sep="\n")

# 1. Pick a random row
last_position = len(df_final) - 1
random_index = random.randint(0, last_position)
sample_row = df_final.iloc[random_index]
input_text, true_label = sample_row['text'], sample_row['category']

# 2. Preprocess
cleaned_input = clean_email(input_text)

# 3. Predict with DistilBERT
encoded = tokenizer(cleaned_input, return_tensors="pt", truncation=True, padding=True, max_length=128)
# Move every input tensor to the device the model lives on.
inputs = {}
for field, tensor in encoded.items():
    inputs[field] = tensor.to(model_bert.device)

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model_bert(**inputs)

predicted_class_id = outputs.logits.argmax(dim=1).item()
# Invert label_map (name -> id) to translate the predicted id back to a name.
reverse_label_map = dict(zip(label_map.values(), label_map.keys()))
predicted_result = reverse_label_map[predicted_class_id]

# 4. Display Results
print(f"Input Email :\n'{input_text}'\n")
print(f"Predicted Label : {predicted_result.upper()}")
print(f"Actual Label : {true_label.upper()}")

print("\n Result: ACCURATE" if true_label == predicted_result else "\n Result: INCORRECT")


--- Logistic Regression Report ---


NameError: name 'model' is not defined