# In [None]:
# --- Step 0: Install necessary libraries ---
# !pip install transformers[torch] pandas scikit-learn accelerate

import pandas as pd
import numpy as np
import torch
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer
)
import warnings
# Keep Weights & Biases logging switched off for this run.
os.environ["WANDB_DISABLED"] = "true"

# Suppress every warning category so the step-by-step output stays readable.
warnings.simplefilter('ignore')

# --- Step 1: Load and Prepare the Kaggle Dataset ---
print("--- Step 1: Loading data from emails.csv ---")

try:
    df = pd.read_csv('emails.csv')
except FileNotFoundError:
    print("Error: 'emails.csv' not found. Please download it from Kaggle and place it in the same directory.")
    # `exit()` is a helper the `site` module injects for interactive sessions
    # and, called without arguments, reports success (status 0). Raising
    # SystemExit directly works in every runtime and signals the failure.
    raise SystemExit(1) from None

# The rest of the script refers to the target column as 'label'.
df = df.rename(columns={'spam': 'label'})

# Fail early with a readable message instead of a KeyError further down.
missing_columns = [name for name in ('text', 'label') if name not in df.columns]
if missing_columns:
    raise SystemExit(
        f"Error: 'emails.csv' is missing required column(s): {missing_columns}"
    )

# Discard rows containing any missing value, then renumber the index from 0.
df = df.dropna().reset_index(drop=True)

print("Dataset loaded successfully.")
print(f"Number of emails: {len(df)}")
print("Class distribution:")
print(df['label'].value_counts())
print("\n" + "="*50 + "\n")


# --- Step 2: Preprocess Data for Fine-Tuning ---
print("--- Step 2: Preprocessing Data for BERT ---")

# Encode the labels with the same encoder that classify_email() later uses to
# decode predictions. Fitting without transforming only works by coincidence
# when the raw labels already happen to be the integers 0..n-1.
le = LabelEncoder()
encoded_labels = le.fit_transform(df['label'])
num_labels = len(le.classes_)

print(f"Found {num_labels} unique labels: {le.classes_}")

# Split the data into training and validation sets (80/20, stratified so both
# splits keep the original class balance; fixed seed for reproducibility).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    # The tokenizer only accepts strings; coerce in case the CSV parser
    # inferred a non-string type for some cells.
    df['text'].astype(str).tolist(),
    encoded_labels.tolist(),
    test_size=0.2,
    random_state=42,
    stratify=encoded_labels,
)

print(f"\nTraining samples: {len(train_texts)}, Validation samples: {len(val_texts)}")
print("\n" + "="*50 + "\n")


# --- Step 3: Load BERT Model and Tokenizer ---
print("--- Step 3: Loading BERT Model and Tokenizer ---")

# Tokenizer and classifier are both loaded from this one checkpoint name.
model_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(model_name)

# The classification head is sized to the number of label classes found above.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

print("Model and tokenizer loaded successfully.")
print("\n" + "="*50 + "\n")


# --- Step 4: Convert Data to PyTorch Tensors ---
print("--- Step 4: Converting datasets manually to PyTorch tensors ---")

def convert_dataset_manually(texts, labels, tokenizer, max_length=128):
    """Tokenize texts and convert to a dictionary of torch tensors.

    Args:
        texts: Sequence of input strings, one per example.
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=..., padding=..., max_length=...)``
            that returns a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries.
        max_length: Length every sequence is truncated or padded to.

    Returns:
        A dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``
        tensors; ``'labels'`` has dtype ``torch.long``.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """
    # A length mismatch would otherwise yield a dataset whose labels are
    # silently misaligned with (or shorter than) the encoded texts.
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length "
            f"(got {len(texts)} and {len(labels)})"
        )

    # Padding to a fixed max_length keeps every row the same width, so the
    # nested lists convert directly into rectangular tensors.
    encodings = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )
    return {
        'input_ids': torch.tensor(encodings['input_ids']),
        'attention_mask': torch.tensor(encodings['attention_mask']),
        'labels': torch.tensor(labels, dtype=torch.long),
    }

# Tokenize each split with the shared tokenizer (default max_length).
train_data = convert_dataset_manually(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
)
val_data = convert_dataset_manually(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer,
)

class ManualDataset(torch.utils.data.Dataset):
    """Dataset view over a dict of index-aligned columns.

    Each item is a dict holding the ``idx``-th entry of every column; the
    dataset length is taken from the ``'input_ids'`` column.
    """

    def __init__(self, encodings):
        # Stored by reference; nothing is copied.
        self.encodings = encodings

    def __len__(self):
        input_ids = self.encodings['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        # Pull the same position out of every column.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = column[idx]
        return sample

# Wrap both tensor dicts so the Trainer can index them example by example.
train_dataset, val_dataset = ManualDataset(train_data), ManualDataset(val_data)

print("PyTorch datasets created.")
print("\n" + "="*50 + "\n")


# --- Step 5: Fine-Tune the Model ---
print("--- Step 5: Fine-Tuning the BERT Model ---")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; ``predictions`` holds
            per-class scores with the classes along axis 1.

    Returns:
        A dict with a single ``'accuracy'`` entry.
    """
    class_scores, references = eval_pred
    # The highest-scoring class along axis 1 is the predicted class id.
    predicted_ids = np.argmax(class_scores, axis=1)
    accuracy = accuracy_score(references, predicted_ids)
    return {'accuracy': accuracy}

# Hyper-parameters and bookkeeping options, gathered in one place.
training_kwargs = {
    # Where checkpoints and logs are written.
    'output_dir': './results',
    'logging_dir': './logs',
    'logging_steps': 50,
    # Optimisation settings.
    'num_train_epochs': 2,
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 8,
    'learning_rate': 5e-5,
    'weight_decay': 0.01,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint.
    'eval_strategy': "epoch",
    'save_strategy': "epoch",
    'load_best_model_at_end': True,
    # No external experiment trackers.
    'report_to': "none",
}
training_args = TrainingArguments(**training_kwargs)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()
print("Training complete.")
print("\n" + "="*50 + "\n")


# --- Step 6: Evaluate and Test the Fine-Tuned Model ---
print("--- Step 6: Evaluating and Testing the Model ---")

# Run one evaluation pass on the validation set and list every metric.
eval_results = trainer.evaluate()
print("Final validation results:")
for key in eval_results:
    value = eval_results[key]
    print(f"{key}: {value:.4f}")

# Store weights and tokenizer files side by side in the same directory
# (model first, then tokenizer).
for artifact in (model, tokenizer):
    artifact.save_pretrained('./spam_classifier_model')
print("Model saved to './spam_classifier_model'")

def classify_email(text, return_confidence=False):
    """Classify a single email as "Spam" or "Not Spam".

    Uses the module-level ``tokenizer``, ``model`` and ``le``.

    Args:
        text: The email text to classify; it is truncated to 128 tokens.
        return_confidence: When true, also return the softmax probability
            of the predicted class.

    Returns:
        The label string, or a ``(label, confidence)`` tuple when
        ``return_confidence`` is true.
    """
    # Tokenizer output is created on the CPU, while the Trainer may have
    # moved the model to an accelerator; put the inputs on the model's
    # device so the forward pass does not fail with a device mismatch.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    ).to(model.device)

    # Inference only: switch off dropout and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # Row 0 is the only example in the batch. Taking the argmax over that
    # row, rather than over the flattened logits, keeps the class index
    # independent of the batch dimension.
    logits = outputs.logits[0]
    probabilities = torch.softmax(logits, dim=-1)
    predicted_class_id = torch.argmax(logits, dim=-1).item()
    confidence = probabilities[predicted_class_id].item()

    # Map the model's class id back to the original dataset label.
    predicted_label_num = le.inverse_transform([predicted_class_id])[0]
    predicted_label = "Spam" if predicted_label_num == 1 else "Not Spam"

    if return_confidence:
        return predicted_label, confidence
    return predicted_label

print("\n--- Testing Classifier on New Emails ---")

# Hand-written sample emails for a quick sanity check of the classifier.
test_examples = [
    "Subject: Your invoice is ready\nHi there, we've attached the invoice for your recent order. Payment is due in 30 days.",
    "WINNER! you have been selected to receive a $1000 gift card. click here now",
    "Let's schedule a meeting for next week to discuss the project proposal.",
    "Limited time offer: Get 50% off all our products today only!"
]

# Examples are numbered from 1 in the printed report.
for i, example in enumerate(test_examples, start=1):
    prediction, confidence = classify_email(example, return_confidence=True)
    print(
        f"\nExample {i}:",
        f"Text: '{example}'",
        f"-> Predicted Category: {prediction} (Confidence: {confidence:.4f})",
        sep="\n",
    )
