In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from tqdm import tqdm

# Read the labelled e-mail corpus from disk
data = pd.read_csv("/content/emailspam.csv")

# Message bodies, plus binary targets: 1 where Category is 'spam', otherwise 0
texts = data['Message'].values
labels = np.where(data['Category'].eq('spam'), 1, 0)

# Encode the complete corpus with the uncased BERT tokenizer
# (padding enabled, sequences truncated at 512 tokens, PyTorch tensors returned)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_inputs = tokenizer(
    texts.tolist(),
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

# Targets as a torch tensor
labels = torch.tensor(labels)

# Report the dimensions of the encoded corpus and of its targets
print("Tokenized Inputs Shape:", tokenized_inputs['input_ids'].size())
print("Labels Shape:", labels.size())

# Hold out 20% of the messages for validation; the fixed random_state keeps
# the split reproducible between runs
texts_train, texts_val, labels_train, labels_val = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# The tokenizer is fed plain Python lists of strings
texts_train_list = texts_train.tolist()

# Tokenize each split on its own (padding enabled, truncated at 512 tokens)
tokenized_train = tokenizer(texts_train_list, return_tensors='pt', padding=True, truncation=True, max_length=512)
tokenized_val = tokenizer(texts_val.tolist(), return_tensors='pt', padding=True, truncation=True, max_length=512)

# Make sure the labels are tensors. torch.as_tensor accepts both NumPy arrays
# and existing tensors, so labels that are already tensors are passed through
# instead of being copy-constructed with torch.tensor(), which emits a
# UserWarning when handed a tensor.
labels_train = torch.as_tensor(labels_train)
labels_val = torch.as_tensor(labels_val)

# Bundle input ids, attention masks and int64 labels into datasets
train_data = TensorDataset(tokenized_train['input_ids'], tokenized_train['attention_mask'], labels_train.long())
val_data = TensorDataset(tokenized_val['input_ids'], tokenized_val['attention_mask'], labels_val.long())

# Batches of 8; only the training batches are shuffled
train_dataloader = DataLoader(train_data, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=8, shuffle=False)

# Load pretrained BERT with a 2-class sequence-classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define optimizer and loss function.
# torch.optim.AdamW is used in place of the deprecated transformers.AdamW.
# eps and weight_decay are set explicitly to the values the transformers
# optimizer used by default, because torch's own defaults differ.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# NOTE: loss_fn is not referenced by the training loop below, which uses the
# loss the model returns when labels are passed. The name is kept in case
# other code relies on it.
loss_fn = torch.nn.CrossEntropyLoss()

# Train the model on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
epochs = 3

for epoch in range(epochs):
    # Put the model in training mode at the start of every epoch
    model.train()
    for batch in tqdm(train_dataloader, desc="Training"):
        # Clear gradients left over from the previous step
        optimizer.zero_grad()
        inputs, attention_mask, label = [item.to(device) for item in batch]
        # Passing labels makes the model compute the loss itself
        outputs = model(inputs, attention_mask=attention_mask, labels=label)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Score the model on the validation batches
model.eval()
val_predictions = []
val_true_labels = []

# No gradients are needed for scoring, so the whole pass runs under no_grad
with torch.no_grad():
    for batch in tqdm(val_dataloader, desc="Validation"):
        inputs, attention_mask, label = (item.to(device) for item in batch)
        outputs = model(inputs, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class is the index of the largest logit in each row
        predictions = logits.argmax(dim=1).cpu().numpy()
        val_predictions.extend(predictions)
        val_true_labels.extend(label.cpu().numpy())

# Accuracy of the collected predictions against the true labels
accuracy_bert = accuracy_score(val_true_labels, val_predictions)
print('Accuracy (BERT):', accuracy_bert)

# Create a function to visualize model results
def eval(name, y_test, y_pred):
    """Print a classification report and plot the confusion matrix.

    NOTE: this function's name shadows Python's built-in ``eval``.

    Args:
        name: Label inserted into the heading printed above the report.
        y_test: True class labels.
        y_pred: Predicted class labels, compared against ``y_test``.
    """
    # Build the display first, then print the report, then draw the plot
    display = ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred))
    print(f'Classification Report for {name}: \n')
    report = classification_report(y_test, y_pred)
    print(report)
    display.plot()

# Print the report and plot the confusion matrix for the BERT predictions
eval(name='BERT Results', y_test=val_true_labels, y_pred=val_predictions)




Tokenized Inputs Shape: torch.Size([5572, 238])
Labels Shape: torch.Size([5572])


  labels_train = torch.tensor(labels_train)
  labels_val = torch.tensor(labels_val)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training:   0%|          | 1/558 [00:15<2:24:01, 15.51s/it]