In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import AutoModel, AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils import resample
import pandas as pd
import numpy as np
import re
from torchinfo import summary
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input directory tree and print the full path of every file in it.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in file_names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the email CSV into a DataFrame and preview its first rows.
DATA_PATH = "/kaggle/input/190k-spam-ham-email-dataset-for-classification/spam_Emails_data.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
def clean_text(text):
    """Normalise one email body for tokenisation.

    Strings are lower-cased and every substring that starts with ``http`` or
    ``www`` (running up to the next whitespace) is removed. Anything that is
    not a string, such as a NaN from a missing cell, is returned as ``""``.
    """
    if not isinstance(text, str):
        return ""  # Non-string values carry no usable text
    lowered = text.lower()
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', lowered, flags=re.MULTILINE)
    return without_urls
def encode_values(value):
    """Map a raw label to its integer class id: 1 for spam, 0 for anything else.

    The comparison ignores letter case and surrounding whitespace, so variants
    such as ``'spam'`` or ``' Spam '`` are not silently counted as ham.
    Non-string values (for example NaN from a missing cell) map to 0.
    """
    if isinstance(value, str) and value.strip().lower() == 'spam':
        return 1  # Spam
    return 0  # Ham
        
# Drop incomplete rows BEFORE cleaning/encoding: clean_text() turns a missing text into ""
# and encode_values() turns a missing label into 0 (ham), so a dropna() that runs after
# these conversions finds nothing to drop and rows with no label would be kept as ham.
df = df.dropna(subset=['text', 'label']).copy()  # .copy() so the column assignments below hit a real frame
df['text'] = df['text'].apply(clean_text)
df['label'] = df['label'].apply(encode_values).astype(int)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Keep only the model input and target columns, discard rows missing either one,
# then print the frame's column/dtype overview.
df = df[['text', 'label']].dropna(subset=['label', 'text'])
df.info()

In [None]:
# Word-count statistics for the emails, plus a histogram annotated with mean / max / min.
text_lengths = [len(str(entry).split()) for entry in df['text']]
max_length, min_length = max(text_lengths), min(text_lengths)
average_length = np.mean(text_lengths)
print(f"Max length: {max_length}")
print(f"Min length: {min_length}")
print(f"Average length: {average_length}")
print()

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(text_lengths, bins=20, color='skyblue', edgecolor='black')
ax.set_title("Distribution of Text Lengths", fontsize=16)
ax.set_xlabel("Number of Words", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)
# One dashed vertical marker per summary statistic (legend order: average, max, min).
markers = [
    (average_length, 'red', f'Average Length: {average_length:.2f}'),
    (max_length, 'green', f'Max Length: {max_length}'),
    (min_length, 'purple', f'Min Length: {min_length}'),
]
for position, colour, caption in markers:
    ax.axvline(position, color=colour, linestyle='dashed', linewidth=2, label=caption)
ax.legend()
plt.show()

In [None]:
# Count how often each value occurs in the target column and chart the counts.
all_text = ' '.join(map(str, df['label']))
word_counts = Counter(all_text.split())
most_common_words = word_counts.most_common(20)  # Adjust the number of words as needed

# Bar chart: one bar per distinct token, tallest first
words, counts = zip(*most_common_words)
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(words, counts)
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
ax.set_title('Most Common Words in Target Column')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Tally every whitespace-separated token across all emails and chart the 20 most frequent.
# The tokens are streamed straight into the Counter instead of being collected in a list first.
word_freq = Counter(token for entry in df['text'] for token in str(entry).split())
top_words = word_freq.most_common(20)
words, counts = zip(*top_words)
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(words, counts, color='skyblue')
ax.set_title("Top 20 Most Frequent Words", fontsize=16)
ax.set_xlabel("Words", fontsize=14)
ax.set_ylabel("Frequency", fontsize=14)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Render a single word cloud built from the text of every email.
all_text = ' '.join(df['text'].tolist())
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(all_text)
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Most Common Words in Text')
plt.show()

In [None]:
# One word cloud per class so spam and ham vocabulary can be compared.
for label in df['label'].unique():
    label_text = ' '.join(df[df['label'] == label]['text'].tolist())
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(label_text)
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    # Label 1 is spam and 0 is ham (see encode_values); name the class in the
    # title so the two figures can be told apart.
    if label == 1:
        plt.title('Most Common Words in Spam Emails')
    else:
        plt.title('Most Common Words in Ham Emails')
    plt.show()
    print()


In [None]:
def random_oversample(X, y):
    """Balance a labelled data set by randomly duplicating rows of the smaller classes.

    The largest class is found from the label counts rather than assumed to be
    a particular label. Every class with fewer rows is sampled with replacement
    (fixed ``random_state=42``) until it has as many rows as the largest one.
    Classes already at that size are passed through untouched.

    Args:
        X: Sequence of texts.
        y: Sequence of labels, aligned with ``X``.

    Returns:
        Tuple ``(texts, labels)`` of NumPy arrays. Rows are grouped by class,
        largest class first; the result is not shuffled.
    """
    # Combine the features and labels
    frame = pd.DataFrame({'text': X, 'label': y})
    class_sizes = frame['label'].value_counts()  # sorted by count, largest first
    if class_sizes.empty:
        return frame['text'].values, frame['label'].values

    target_size = int(class_sizes.iloc[0])
    balanced_parts = []
    for class_label in class_sizes.index:
        class_rows = frame[frame['label'] == class_label]
        if len(class_rows) < target_size:
            # Upsample this smaller class to match the largest class
            class_rows = resample(class_rows,
                                  replace=True,             # sample with replacement
                                  n_samples=target_size,    # to match the largest class
                                  random_state=42)          # reproducible results
        balanced_parts.append(class_rows)

    balanced = pd.concat(balanced_parts)
    return balanced['text'].values, balanced['label'].values

In [None]:
class TextDataset(Dataset):
    """Torch dataset that tokenises one text per item on demand.

    Args:
        dataframe: Frame with a ``'text'`` column and a ``'label'`` column.
        tokenizer: Callable Hugging Face style tokenizer.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.texts = dataframe['text'].values
        self.labels = dataframe['label'].values
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` tensors for row ``idx``."""
        text = self.texts[idx]
        label = self.labels[idx]

        # Call the tokenizer directly: `encode_plus` is marked deprecated in the
        # Transformers documentation in favour of `__call__`, which takes the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            # The tokenizer returns a leading batch axis of size 1; flatten it away
            # so the DataLoader can stack items into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [None]:
class EarlyStopping:
    """Signal when the validation loss has stopped improving.

    Call the instance once per epoch with that epoch's validation loss.
    ``early_stop`` becomes True after ``patience`` consecutive epochs whose
    loss is worse than ``best_loss + delta`` (or is NaN).

    Args:
        patience: Consecutive non-improving epochs tolerated before stopping.
        delta: Tolerance above the best loss that is still accepted.
    """

    def __init__(self, patience=3, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_loss = None   # lowest validation loss seen so far
        self.counter = 0        # consecutive non-improving epochs
        self.early_stop = False

    def __call__(self, val_loss):
        """Record one epoch's validation loss and update ``early_stop``."""
        # NaN is the only value that is not equal to itself. A diverged (NaN) loss
        # must count against patience rather than resetting it.
        is_nan = val_loss != val_loss

        if self.best_loss is None and not is_nan:
            self.best_loss = val_loss
            return

        if is_nan or val_loss > self.best_loss + self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            # Within tolerance: reset patience, but keep the true minimum so that
            # with delta > 0 the baseline cannot creep upward epoch after epoch.
            self.best_loss = min(self.best_loss, val_loss)
            self.counter = 0

In [None]:
def train_model(model, train_dataloader, val_dataloader, test_dataloader, device, epochs=5, lr=1e-5, weight_decay=0.01):
    """Fine-tune ``model`` with AdamW and cross-entropy, with early stopping on validation loss.

    Each epoch trains on ``train_dataloader``, then reports loss/accuracy on the
    test loader (if one is given) and on the validation loader. Training stops
    early once ``EarlyStopping(patience=3)`` fires. When training ends, the
    weights from the epoch with the lowest validation loss are loaded back into
    ``model``, so later evaluation does not use the degraded last-epoch weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` returning logits.
        train_dataloader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        val_dataloader: Same batch format; drives early stopping and best-weight selection.
        test_dataloader: Same batch format, used for per-epoch reporting only.
            Pass ``None`` to skip the per-epoch test pass.
        device: Device the batches are moved to.
        epochs: Maximum number of epochs.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.

    Returns:
        None. ``model`` is updated in place.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    early_stopping = EarlyStopping(patience=3)

    best_val_loss = float('inf')
    best_state = None  # CPU copy of the weights at the best validation epoch

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        train_correct = 0
        total = 0
        n_batches = 0

        for batch in tqdm(train_dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_correct += (outputs.argmax(dim=1) == labels).sum().item()
            total += labels.size(0)
            n_batches += 1

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        print(f'Epoch {epoch+1}/{epochs}, Loss: {train_loss/max(n_batches, 1):.4f}, Accuracy: {train_correct/max(total, 1):.4f}')

        # Testing Step (reporting only; never used to select or stop the model)
        if test_dataloader is not None:
            test_loss, test_acc = _run_eval_epoch(model, test_dataloader, criterion, device)
            print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}')

        # Validation Step
        val_loss, val_acc = _run_eval_epoch(model, val_dataloader, criterion, device)
        print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Snapshot on the CPU so the copy does not take extra accelerator memory.
            best_state = {name: tensor.detach().cpu().clone()
                          for name, tensor in model.state_dict().items()}

        early_stopping(val_loss)
        if early_stopping.early_stop:
            print("Early stopping triggered!")
            break

    # Roll back to the best validation epoch; the last epoch is `patience` epochs past it
    # whenever early stopping fired.
    if best_state is not None:
        model.load_state_dict(best_state)


def _run_eval_epoch(model, dataloader, criterion, device):
    """Score ``model`` on every batch of ``dataloader`` without updating weights.

    Returns ``(mean per-batch loss, accuracy)`` as floats; both are NaN when the
    loader yields no batches. Leaves the model in eval mode.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    n_batches = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss_sum += criterion(outputs, labels).item()
            n_correct += (outputs.argmax(dim=1) == labels).sum().item()
            n_seen += labels.size(0)
            n_batches += 1

    if n_batches == 0 or n_seen == 0:
        return float('nan'), float('nan')
    return loss_sum / n_batches, n_correct / n_seen


In [None]:
def evaluate_model(model, test_dataloader, device):
    """Print a per-class precision/recall/F1 report for ``model`` on ``test_dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` returning logits.
        test_dataloader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the batches are moved to.

    Returns:
        None. The report is printed.
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs, 1)

            all_preds.extend(predicted.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    # Generate and print the classification report.
    # `labels` is passed explicitly so the two target names always line up with
    # classes 0 and 1, even if only one class appears in the labels and predictions
    # (otherwise the report raises on the name/class count mismatch).
    print("Classification Report on Test Dataset:")
    print(classification_report(all_labels, all_preds, labels=[0, 1], target_names=['Class 0', 'Class 1']))


In [None]:
# Hold out 10% of the rows for testing, then 10% of the remainder for validation.
train_val_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.1, random_state=42)

# Oversample the training split only; validation and test keep their original class mix.
balanced_texts, balanced_labels = random_oversample(
    train_df['text'].values,
    train_df['label'].values,
)
train_df = pd.DataFrame({'text': balanced_texts, 'label': balanced_labels})
print(train_df['label'].value_counts())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class DistilBERT(nn.Module):
    """DistilBERT encoder followed by a single linear classification layer.

    Args:
        n_classes: Number of output logits.
        model_name: Checkpoint name or path passed to
            ``DistilBertModel.from_pretrained``.
    """

    def __init__(self, n_classes, model_name='distilbert-base-uncased'):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained(model_name)
        # Size the head from the loaded config rather than a literal 768, so a
        # checkpoint with a different hidden width still gets a matching layer.
        self.fc = nn.Linear(self.bert.config.dim, n_classes)  # Classification head with n_classes output

    def forward(self, input_ids, attention_mask):
        """Return logits of shape (batch, n_classes) from the first token's hidden state."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_token_state = outputs.last_hidden_state[:, 0, :]  # CLS token for classification
        output = self.fc(cls_token_state)
        return output
# Build the two-class model on the selected device and show its layer summary for
# one dummy sequence of 256 token ids plus the matching attention mask.
model_DistilBERT = DistilBERT(2)
model_DistilBERT = model_DistilBERT.to(device)
dummy_shape = (1, 256)
summary(model_DistilBERT, input_size=[dummy_shape, dummy_shape], dtypes=[torch.long, torch.long])

In [None]:
# Settings shared by all three splits
MAX_LEN = 256
BATCH_SIZE = 16

# Tokenizer for the same checkpoint the model loads
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# One dataset per split, all tokenised the same way
(
    train_distilbert_text_dataset,
    val_distilbert_text_dataset,
    test_distilbert_text_dataset,
) = (
    TextDataset(split_df, distilbert_tokenizer, max_len=MAX_LEN)
    for split_df in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep a fixed order
train_distilbert_text_dataloader = DataLoader(
    train_distilbert_text_dataset, batch_size=BATCH_SIZE, shuffle=True
)
val_distilbert_text_dataloader = DataLoader(
    val_distilbert_text_dataset, batch_size=BATCH_SIZE, shuffle=False
)
test_distilbert_text_dataloader = DataLoader(
    test_distilbert_text_dataset, batch_size=BATCH_SIZE, shuffle=False
)


In [None]:
# Fine-tune the model. 100 is only an upper bound on epochs; train_model stops
# earlier once its early-stopping check on the validation loss fires.
train_model(
    model=model_DistilBERT,
    train_dataloader=train_distilbert_text_dataloader,
    val_dataloader=val_distilbert_text_dataloader,
    test_dataloader=test_distilbert_text_dataloader,
    device=device,
    epochs=100,
)

In [None]:
# Print the classification report for the trained model on the held-out test split.
evaluate_model(
    model=model_DistilBERT,
    test_dataloader=test_distilbert_text_dataloader,
    device=device,
)