In [None]:
# Complete workaround for the numpy copy issue
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import torch
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import warnings
# Install a blanket 'ignore' filter: no Python warning is shown from here on.
# NOTE(review): this also hides deprecation warnings from the libraries imported
# above - confirm that silencing everything is intended.
warnings.filterwarnings('ignore')

# Disable wandb
# The variable is set before the Trainer is constructed further down in this cell.
import os
os.environ["WANDB_DISABLED"] = "true"

# Use your existing preprocessing code up to tokenization
# ... (your data loading and preprocessing code) ...
#
# The rest of this cell reads the following names without defining them, so the
# preprocessing code must have created them before this point:
#   train_texts, train_labels, val_texts, val_labels - passed to convert_dataset_manually
#   tokenizer - used to encode texts and saved next to the model
#   model     - handed to Trainer and called inside classify_email
#   le        - its inverse_transform maps a class id back to a label in classify_email

# FIXED: Manual conversion that avoids the datasets library issue
def convert_dataset_manually(texts, labels, tokenizer, max_length=128):
    """Convert texts and labels to torch tensors manually.

    Every text is tokenized on its own, padded/truncated to ``max_length``, and
    the resulting rows are stacked into 2-D tensors.

    Args:
        texts: Iterable of strings to encode.
        labels: Sequence of integer class ids, one per text.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors carrying a leading batch axis of size 1.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        dict with ``'input_ids'`` and ``'attention_mask'`` of shape
        ``(num_texts, max_length)`` and ``'labels'`` as a ``torch.long`` tensor.
        For empty input the first two are empty ``(0, max_length)`` tensors.

    Raises:
        ValueError: If the number of labels differs from the number of texts.
    """
    input_ids_list = []
    attention_mask_list = []

    for text in texts:
        # Tokenize each text individually
        encoded = tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt'
        )

        # squeeze(0) removes only the batch axis. A bare squeeze() would also
        # drop the sequence axis when max_length == 1, producing 0-d rows.
        input_ids_list.append(encoded['input_ids'].squeeze(0))
        attention_mask_list.append(encoded['attention_mask'].squeeze(0))

    labels_tensor = torch.tensor(labels, dtype=torch.long)
    num_texts = len(input_ids_list)
    if labels_tensor.ndim == 0 or labels_tensor.shape[0] != num_texts:
        num_labels = 1 if labels_tensor.ndim == 0 else labels_tensor.shape[0]
        raise ValueError(
            f"texts and labels must have the same length, "
            f"got {num_texts} texts and {num_labels} labels"
        )

    if num_texts == 0:
        # torch.stack refuses an empty list, so build the empty result directly.
        return {
            'input_ids': torch.empty((0, max_length), dtype=torch.long),
            'attention_mask': torch.empty((0, max_length), dtype=torch.long),
            'labels': labels_tensor
        }

    return {
        'input_ids': torch.stack(input_ids_list),
        'attention_mask': torch.stack(attention_mask_list),
        'labels': labels_tensor
    }

# Convert datasets manually (this bypasses the datasets library entirely)
# train_texts / train_labels / val_texts / val_labels and tokenizer are not
# defined in this cell; they must already exist in the kernel.
# max_length is left at convert_dataset_manually's default (128).
print("Converting datasets manually...")
train_data = convert_dataset_manually(train_texts, train_labels, tokenizer)
val_data = convert_dataset_manually(val_texts, val_labels, tokenizer)

# Sanity check: print the shape of the stacked input_ids tensor of each split.
print(f"Train data shape: {train_data['input_ids'].shape}")
print(f"Val data shape: {val_data['input_ids'].shape}")

# Custom Dataset class
class ManualDataset(torch.utils.data.Dataset):
    """Map-style dataset over three pre-built, index-aligned containers.

    Indexing returns a dict with the keys ``'input_ids'``, ``'attention_mask'``
    and ``'labels'``, each holding the element at that index. The length is
    taken from ``input_ids``.
    """

    # Keys of every returned example; each is also the name of an attribute.
    _FIELDS = ('input_ids', 'attention_mask', 'labels')

    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids, self.attention_mask, self.labels = (
            input_ids,
            attention_mask,
            labels,
        )

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {field: getattr(self, field)[idx] for field in self._FIELDS}

# Create datasets
# The dicts built by convert_dataset_manually carry exactly the keys
# 'input_ids', 'attention_mask' and 'labels' - the same names as the
# ManualDataset constructor parameters - so they are unpacked as keywords.
train_dataset_manual = ManualDataset(**train_data)
val_dataset_manual = ManualDataset(**val_data)

# Compute metrics function
def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max predictions.

    ``eval_pred`` is unpacked into ``(predictions, labels)``. The index of the
    largest score along axis 1 is taken as the predicted class and compared
    with ``labels`` via ``accuracy_score``.

    Returns:
        dict with the single key ``'accuracy'``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return {'accuracy': accuracy_score(references, predicted_ids)}

# Training arguments
training_args = TrainingArguments(
    # --- Output locations and run label ---
    output_dir='/content/email_classifier_results',
    logging_dir='/content/logs',
    run_name='email_classifier_training',

    # --- Evaluation / checkpointing: both happen once per epoch ---
    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,

    # --- Optimisation ---
    num_train_epochs=2,
    learning_rate=5e-5,
    weight_decay=0.01,

    # --- Batch sizes (per device) ---
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,

    # --- Logging and runtime ---
    logging_steps=10,
    report_to='none',
    fp16=False,
    dataloader_num_workers=0,  # Critical: No multiprocessing
)

# Simple data collator (since our data is already properly formatted)
def simple_data_collator(features):
    """Stack per-example tensors into one batch tensor per field.

    Args:
        features: Sequence of dicts, each holding tensors under the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.

    Returns:
        dict with the same three keys, each value being ``torch.stack`` of the
        corresponding per-example tensors (new leading batch axis).
    """
    return {
        key: torch.stack([example[key] for example in features])
        for key in ('input_ids', 'attention_mask', 'labels')
    }

# Initialize trainer
# `model` is not created in this cell; it must already exist in the kernel.
# The custom collator is passed because every example is already padded to a
# fixed length, so batching only needs to stack tensors.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_manual,
    eval_dataset=val_dataset_manual,
    data_collator=simple_data_collator,
    compute_metrics=compute_metrics
)

# Train the model
print("\nStarting training...")
trainer.train()

# Evaluate
# Runs after training has finished; the returned mapping is printed key by key.
print("\nEvaluating model...")
eval_results = trainer.evaluate()
print("Evaluation results:")
for key, value in eval_results.items():
    # The '.4f' format spec assumes every value is numeric; a non-numeric
    # entry would make this line raise.
    print(f"{key}: {value:.4f}")

# Save model
# Model and tokenizer are written to the same directory.
# NOTE(review): the path is an absolute '/content/...' location (presumably
# Colab) - confirm before running elsewhere.
model.save_pretrained('/content/email_classifier_model')
tokenizer.save_pretrained('/content/email_classifier_model')
print("Model saved successfully!")

# Test the classifier
def classify_email(text, return_confidence=False):
    """Classify one email text with the fine-tuned model.

    Relies on the module-level names ``tokenizer``, ``model`` and ``le``.

    Args:
        text: The email text to classify (a single string).
        return_confidence: When true, also return the softmax probability of
            the predicted class.

    Returns:
        The predicted label as produced by ``le.inverse_transform``, or a
        ``(label, confidence)`` tuple when ``return_confidence`` is true.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128
    )

    # The tokenizer builds CPU tensors, but the model may have been moved to
    # another device during training; feed it tensors on its own device.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # Work on row 0 (the single input text) and take the arg-max along the
    # class axis explicitly instead of over the flattened logits.
    logits = outputs.logits[0]
    probabilities = torch.softmax(logits, dim=-1)
    predicted_class_id = int(torch.argmax(logits, dim=-1).item())
    confidence = probabilities[predicted_class_id].item()

    predicted_label = le.inverse_transform([predicted_class_id])[0]

    if return_confidence:
        return predicted_label, confidence
    return predicted_label

# Test examples
# Smoke test: run classify_email on a few hand-written emails and print the
# predicted label together with its confidence.
print("\n" + "="*50)
print("TESTING THE EMAIL CLASSIFIER")
print("="*50)

# Hand-written sample emails used only for this demonstration.
test_examples = [
    "Subject: Software Engineer Interview\nWe would like to schedule an interview.",
    "Team meeting tomorrow at 10 AM in conference room.",
    "Invoice #2024-001 attached. Payment due within 15 days.",
    "WINNER! You've won $50,000! Click this link NOW!",
    "Thank you for your inquiry. We'll respond soon."
]

# Examples are numbered from 1 in the printed output.
for i, example in enumerate(test_examples, 1):
    prediction, confidence = classify_email(example, return_confidence=True)
    print(f"\nExample {i}:")
    print(f"Text: {example}")
    print(f"Predicted: {prediction}")
    print(f"Confidence: {confidence:.4f}")

# NOTE(review): this message says "Training" but is printed after the
# inference demo above, not directly after trainer.train().
print("\nTraining completed successfully!")


Converting datasets manually...
Train data shape: torch.Size([240, 128])
Val data shape: torch.Size([60, 128])

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6677,0.457629,1.0
2,0.1988,0.152612,1.0



Evaluating model...


Evaluation results:
eval_loss: 0.1526
eval_accuracy: 1.0000
eval_runtime: 29.7226
eval_samples_per_second: 2.0190
eval_steps_per_second: 0.2690
epoch: 2.0000
Model saved successfully!

TESTING THE EMAIL CLASSIFIER

Example 1:
Text: Subject: Software Engineer Interview
We would like to schedule an interview.
Predicted: Job Opportunity
Confidence: 0.8345

Example 2:
Text: Team meeting tomorrow at 10 AM in conference room.
Predicted: Meeting Request
Confidence: 0.8525

Example 3:
Text: Invoice #2024-001 attached. Payment due within 15 days.
Predicted: Invoice
Confidence: 0.9177

Example 4:
Text: WINNER! You've won $50,000! Click this link NOW!
Predicted: Spam
Confidence: 0.5813

Example 5:
Text: Thank you for your inquiry. We'll respond soon.
Predicted: Others
Confidence: 0.8931

Training completed successfully!


In [15]:
# Test individual emails
# Depends on classify_email (and the tokenizer/model/le it uses) from the
# training cell having been run in this kernel.
# NOTE(review): "Subject: nWe" looks like a lost "\n" escape, and "an meeting"
# like a typo - confirm whether this exact string is the intended input.
test_email = "Subject: nWe would like to schedule an meeting for tomorrow."
prediction = classify_email(test_email)
print(f"Predicted category: {prediction}")

# Get prediction with confidence score
# This runs the model a second time on the same text; the first call above
# discards the confidence.
prediction, confidence = classify_email(test_email, return_confidence=True)
print(f"Predicted: {prediction}, Confidence: {confidence:.4f}")


Predicted category: Meeting Request
Predicted: Meeting Request, Confidence: 0.3984
