In [1]:
!pip install transformers datasets scikit-learn torch pandas evaluate



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
import numpy as np
import evaluate
import logging
import os

os.makedirs("./logs", exist_ok=True)


# Configure Python logging to write a plain text file inside ./logs/
logging.basicConfig(
filename="./logs/training.log", 
filemode="w", 
format="%(asctime)s - %(levelname)s - %(message)s",
level=logging.INFO
)


logger = logging.getLogger(__name__)

# --- (Your data loading and tokenizing code would be here) ---

# Define Metrics
metric = evaluate.load("accuracy") # <-- CHANGE 2: New way to load metric

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# --- 1. LOAD YOUR DATASET ---
try:
    df = pd.read_csv('labeled_emails.csv')
    print("File loaded successfully!")
except FileNotFoundError:
    print("Error: Make sure 'labeled_emails.csv' is in the same directory as your notebook, or provide the full path.")
    # Create a dummy dataframe to prevent further errors if the file isn't found
    df = pd.DataFrame({
        'subject': ['test subject 1', 'test subject 2'],
        'from': ['sender@example.com', 'another@example.com'],
        'body': ['This is the body of the first email.', 'This is the second email body.'],
        'label_true_if_any': [0, 1]
    })
    print("A dummy dataframe has been created to allow the code to run.")


# --- 2. SELECT, COMBINE, AND RENAME COLUMNS ---
# Keep only the columns we need
df_filtered = df[['subject', 'from', 'body', 'label_true_if_any']]

# Combine the text columns into a single 'text' column for BERT
df_filtered['text'] = df_filtered['subject'].fillna('') + ' [SEP] ' + df_filtered['from'].fillna('') + ' [SEP] ' + df_filtered['body'].fillna('')

# Rename the label column to 'label', which is expected by the Hugging Face Trainer
df_filtered = df_filtered.rename(columns={'label_true_if_any': 'label'})

# Create a final, clean dataframe with only the 'text' and 'label' columns
final_df = df_filtered[['text', 'label']]

print("\nData after combining and cleaning:")
print(final_df.head())


# --- 3. SPLIT THE DATA (80% train, 10% validation, 10% test) ---
train_df, temp_df = train_test_split(
    final_df,
    test_size=0.2,         
    random_state=42,        
    stratify=final_df['label'] 
)

# Step B: Split the temporary set (20%) into validation (10%) and testing (10%)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,         
    random_state=42,
    stratify=temp_df['label']
)

# --- 4. VERIFY THE SPLIT ---
print(f"\nTraining set size: {len(train_df)} ({len(train_df) / len(final_df):.0%})")
print(f"Validation set size: {len(val_df)} ({len(val_df) / len(final_df):.0%})")
print(f"Testing set size: {len(test_df)} ({len(test_df) / len(final_df):.0%})")

File loaded successfully!

Data after combining and cleaning:
                                                text  label
0  Re: Accurate Background Invites You to Partici...   True
1  Re: Invitation to Walkathon – Swachhata Pakhwa...  False
2  Re: Seven Pages - Cozy Book Reading Session [S...  False
3  Seven Pages - Cozy Book Reading Session [SEP] ...  False
4  Re: Intro to GPU Design!! [SEP] "'Elektronica'...  False

Training set size: 400 (80%)
Validation set size: 50 (10%)
Testing set size: 50 (10%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['text'] = df_filtered['subject'].fillna('') + ' [SEP] ' + df_filtered['from'].fillna('') + ' [SEP] ' + df_filtered['body'].fillna('')


In [4]:
from transformers import AutoTokenizer
from datasets import Dataset

# --- 1. LOAD THE TOKENIZER ---
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# --- 2. CONVERT PANDAS TO HUGGING FACE DATASET OBJECT ---
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# --- 3. CREATE A TOKENIZATION FUNCTION ---
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

# --- 4. APPLY THE TOKENIZER TO ALL DATASETS ---
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

print("\nTokenization complete!")
print("Here's a look at the first training sample after tokenization:")
print(tokenized_train_dataset[0])

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]


Tokenization complete!
Here's a look at the first training sample after tokenization:
{'text': "Internship Opportunity with the Tata Group- Win INR 2.5 lakhs- Apply now! [SEP] Tata Crucible <noreply@dare2compete.news> [SEP] Tata Crucible Campus Quiz 2025 Register Today! \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏ \ufeff ͏

In [5]:
from transformers import AutoModelForSequenceClassification

# Based on your output ('True'/'False'), you have 2 labels.
num_labels = 2 

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=num_labels
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',              
    num_train_epochs=5,                  
    per_device_train_batch_size=8,       
    per_device_eval_batch_size=16,       
    warmup_steps=500,                    
    weight_decay=0.01,                  
    logging_dir='./logs',                
    logging_steps=10,
    eval_strategy="epoch",         
    save_strategy="epoch",               
    load_best_model_at_end=True,         
    report_to=["tensorboard"],           
)


In [7]:
import evaluate
import numpy as np

# Load the accuracy metric
metric = evaluate.load("accuracy")

# Define the function to compute metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

print("Model, Training Arguments, and Metrics are now defined.")

Model, Training Arguments, and Metrics are now defined.


In [8]:
from transformers import Trainer, TrainerCallback

# ---- Callback that writes logs to ./logs/training.log using your existing `logger` ----
class LogCallback(TrainerCallback):
    """Write Trainer logs (loss, lr, eval metrics) to ./logs/training.log"""
    def on_log(self, args, state, control, logs=None, **kwargs):
        step = getattr(state, "global_step", None)
        if logs:
            # logger comes from the logging setup you added in Step 1
            logger.info(f"Step {step} - {logs}")

# ---- Create the Trainer with the LogCallback attached ----
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[LogCallback]   
)


In [9]:
# Start the training process
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.6039,0.603863,0.72
2,0.4944,0.413679,0.88
3,0.255,0.439562,0.8
4,0.1587,0.453821,0.9
5,0.1401,0.352705,0.94




TrainOutput(global_step=250, training_loss=0.3756684126853943, metrics={'train_runtime': 11101.9371, 'train_samples_per_second': 0.18, 'train_steps_per_second': 0.023, 'total_flos': 526222110720000.0, 'train_loss': 0.3756684126853943, 'epoch': 5.0})

In [14]:
# In train_classifier.ipynb, at the very end:
save_directory = "Classifier Model"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('Classifier Model\\tokenizer_config.json',
 'Classifier Model\\special_tokens_map.json',
 'Classifier Model\\vocab.txt',
 'Classifier Model\\added_tokens.json',
 'Classifier Model\\tokenizer.json')

In [11]:
# Run a final evaluation on the test set
final_evaluation_results = trainer.evaluate(eval_dataset=tokenized_test_dataset)

# Print the final results
print("\n--- Final Test Set Performance ---")
print(final_evaluation_results)




--- Final Test Set Performance ---
{'eval_loss': 0.5766546726226807, 'eval_accuracy': 0.9, 'eval_runtime': 42.9622, 'eval_samples_per_second': 1.164, 'eval_steps_per_second': 0.093, 'epoch': 5.0}


In [12]:
# Evaluate at the end and log metrics
metrics = trainer.evaluate()
logger.info(f"Final evaluation metrics: {metrics}")

# Save predictions to ./logs/predictions.csv
preds_output = trainer.predict(tokenized_val_dataset)
logits, labels = preds_output.predictions, preds_output.label_ids
preds = logits.argmax(axis=-1)

import pandas as pd
pd.DataFrame({'label': labels.flatten(), 'pred': preds.flatten()}).to_csv('./logs/predictions.csv', index=False)
logger.info("Saved predictions to ./logs/predictions.csv")




In [13]:
import pandas as pd
import numpy as np

# Use the trainer to get predictions on the test set
predictions_output = trainer.predict(tokenized_test_dataset)

# The raw predictions are logits, so we take the argmax to get the predicted class (0 or 1)
predicted_labels = np.argmax(predictions_output.predictions, axis=-1)

# Get the true labels
true_labels = predictions_output.label_ids

# Create a new DataFrame for easy comparison
results_df = test_df.copy()
results_df['true_label'] = true_labels
results_df['predicted_label'] = predicted_labels

# Save the results to a CSV file
results_df.to_csv('prediction_results.csv', index=False)

print("Saved prediction results to 'prediction_results.csv'")
print(results_df.head())



Saved prediction results to 'prediction_results.csv'
                                                  text  label  true_label  \
92   Fwd: Hackathon Submission Deadline Approaching...   True           1   
399  Re: 📸 Join Camera Handling: 101 – Photography ...  False           0   
234  Re: Implementation of 24×7 Library Access as P...  False           0   
183  New assignment: "Filters" [SEP] "Syed (Classro...   True           1   
304  Fwd: TRIALS FOR INTER IIT LAWN TENNIS [SEP] "'...  False           0   

     predicted_label  
92                 1  
399                0  
234                0  
183                1  
304                0  
