<a href="https://colab.research.google.com/github/Melancholy22/Scam-Detector-NLP/blob/main/Scam_detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is where we load the dataset from disk into a pandas DataFrame and clean it.

In [None]:
import pandas as pd
import csv
import sys

# Step 1: lift the csv module's per-field size cap so very long email bodies parse
csv.field_size_limit(sys.maxsize)

df = pd.read_csv(
    "/content/drive/MyDrive/ColabNotebooks/ScamDetector/Phishing_Email.csv",
    engine='python',
)

# Only these two (lower-cased) labels are kept; every other row is discarded
valid_labels = ['phishing email', 'safe email']

# Step 2: clean the frame in a single pass
df = (
    df
    # the 'Unnamed: 0' column is removed when present and ignored when absent
    .drop(columns=['Unnamed: 0'], errors='ignore')
    # rows missing either the text or the label are unusable
    .dropna(subset=['Email Text', 'Email Type'])
    # trim surrounding whitespace; labels are additionally lower-cased
    .assign(**{
        'Email Text': lambda frame: frame['Email Text'].str.strip(),
        'Email Type': lambda frame: frame['Email Type'].str.strip().str.lower(),
    })
    # keep only rows carrying a recognised label
    .loc[lambda frame: frame['Email Type'].isin(valid_labels)]
    # renumber rows 0..n-1 after filtering
    .reset_index(drop=True)
)

# Summary of what survived cleaning
print(f"✅ Cleaned dataset: {len(df)} rows, {df.shape[1]} columns")
print(df['Email Type'].value_counts())

✅ Cleaned dataset: 18634 rows, 2 columns
Email Type
safe email        11322
phishing email     7312
Name: count, dtype: int64


Text Pre-processing

In [None]:
import pandas as pd

# Re-apply the basic cleaning (safe to run repeatedly) and derive the numeric target
df = (
    df
    # discard rows that still lack the text or the label
    .dropna(subset=['Email Text', 'Email Type'])
    # trim whitespace on both columns; labels are also lower-cased
    .assign(**{
        'Email Text': lambda frame: frame['Email Text'].str.strip(),
        'Email Type': lambda frame: frame['Email Type'].str.strip().str.lower(),
    })
    # binary target: 1 = phishing, 0 = anything else (i.e. safe)
    .assign(label=lambda frame: frame['Email Type'].eq('phishing email').astype('int64'))
)

print(df.head())

                                          Email Text      Email Type  label
0  re : 6 . 1100 , disc : uniformitarianism , re ...      safe email      0
1  the other side of * galicismos * * galicismo *...      safe email      0
2  re : equistar deal tickets are you still avail...      safe email      0
3  Hello I am your hot lil horny toy.\n    I am t...  phishing email      1
4  software at incredibly low prices ( 86 % lower...  phishing email      1


Tokenizer: Turns the data into a readable form for the computer

In [None]:
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm.auto import tqdm


# Seed both PyTorch and NumPy with one shared value so runs are repeatable
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

In [None]:
class ScamDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Iterable of raw strings (list, tuple, pandas Series, ...).
        labels: Iterable of numeric labels, positionally aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding='max_length', truncation=True,
            max_length=..., return_tensors='pt')`` whose result exposes
            ``'input_ids'``.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=100):
        # Materialise as lists so ``idx`` is always positional. Indexing a
        # pandas Series with ``series[idx]`` is label-based and raises KeyError
        # once rows have been filtered out of the original index.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``{'text': token ids, 'label': float tensor}`` for item ``idx``."""
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize text using the tokenizer from HuggingFace
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Drop only the leading axis added by return_tensors='pt'. A bare
        # squeeze() would also collapse the sequence axis when max_length == 1,
        # yielding a 0-dim tensor.
        token_ids = encoding['input_ids'].squeeze(0)

        return {
            'text': token_ids,
            'label': torch.tensor(label, dtype=torch.float)
        }

[RNN Classifier](https://www.analyticsvidhya.com/blog/2019/01/sequence-models-deeplearning/):

*   Embedding dimension: The length of the vector that represents each word (or token) in continuous space. These embeddings capture the meaning of words in a way the model can understand
 *   This is what we are updating every pass
*   Hidden size: Controls model's capacity (more neurons = more powerful)
*   Number of layers: Adds depth and abstraction (typically 1-2)
*   Bidirectional: Allows models to view context from past and future
*   Batch first: Confirms that input/output tensors are in (batch, seq, feature) format
*   Dropout: Regularization to prevent overfitting. It randomly drops a fraction of the neurons during training in order to force the network to learn redundant representations. This way the model does not simply memorize the training data and overfit.

In [None]:
class RNNClassifier(nn.Module):
    """Bidirectional LSTM text classifier over token-id sequences.

    Args:
        vocab_size: Number of rows in the embedding table (unused when
            ``pretrained`` is given).
        embedding_dim: Width of each embedding vector. When ``pretrained`` is
            given, the width of that matrix is used instead.
        hidden_size: Hidden units per LSTM direction.
        num_classes: Size of the output layer.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied before the output layer and, when
            ``num_layers > 1``, between LSTM layers.
        pretrained: Optional 2-D float tensor of embedding weights.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes, num_layers, dropout=0.5, pretrained=None):
        super().__init__()
        if pretrained is not None:
            print("Using pretrained models")
            self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=False) # Turn freeze=True if you dont want to update embeddings
            # The pretrained matrix dictates the vector width. Sizing the LSTM
            # from the caller's embedding_dim would fail in forward() whenever
            # the two disagree.
            embedding_dim = self.embedding.embedding_dim
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # We will be using the Long Short-Term Memory (LSTM) RNN
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_size,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
            # inter-layer dropout only applies when there is more than one layer
            dropout=dropout if num_layers > 1 else 0
        )
        self.fc = nn.Linear(hidden_size * 2, num_classes) # Times 2 because both directions are concatenated
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the output-layer activations (logits) for a batch of token ids.

        Args:
            x: Integer tensor of token ids, batch-first.

        Returns:
            Tensor with ``num_classes`` values per batch row.
        """
        # Converts token ids into word vectors
        embedded = self.embedding(x)

        # Pass through the bidirectional LSTM; only the final hidden state is used
        _, (hidden, _) = self.rnn(embedded)

        # Concatenate the final hidden states of the top layer in both directions
        # -2 = last layer forward, -1 = last layer backward
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)

        # Apply dropout and final linear layer
        hidden = self.dropout(hidden)
        return self.fc(hidden)

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu


Training and validation

In [None]:
from tqdm.auto import tqdm

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    """Train ``model`` and print the mean train/validation loss after each epoch.

    Args:
        model: Module called as ``model(batch['text'])``.
        train_loader: Iterable of batches, each a dict with ``'text'`` and
            ``'label'`` tensors; iterated once per epoch with gradient updates.
        val_loader: Iterable of batches in the same format; iterated once per
            epoch under ``torch.no_grad()``.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepping the model's parameters.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        dict with keys ``'train_loss'`` and ``'val_loss'``; each maps to a list
        holding one mean loss per epoch.
    """
    # Run on whatever device the model's weights live on instead of relying on
    # a notebook-level global; parameter-less models fall back to the CPU.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    history = {'train_loss': [], 'val_loss': []}

    for epoch in range(num_epochs):
        # -----------------------
        # Training Phase
        # -----------------------
        model.train()
        train_loss = 0.0
        train_batches = 0

        train_progress = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs} [Train]')
        for batch in train_progress:
            texts = batch['text'].to(run_device)
            labels = batch['label'].to(run_device)

            outputs = model(texts)
            # Squeeze only the trailing class axis. A bare squeeze() also drops
            # the batch axis for a single-sample batch, and the loss then
            # rejects the shape mismatch with the labels.
            loss = criterion(outputs.squeeze(-1), labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_batches += 1
            train_progress.set_postfix(loss=f'{loss.item():.4f}')

        # -----------------------
        # Validation Phase
        # -----------------------
        model.eval()
        val_loss = 0.0
        val_batches = 0

        with torch.no_grad():
            val_progress = tqdm(val_loader, desc=f'Epoch {epoch+1}/{num_epochs} [Valid]')
            for batch in val_progress:
                texts = batch['text'].to(run_device)
                labels = batch['label'].to(run_device)

                outputs = model(texts)
                loss = criterion(outputs.squeeze(-1), labels)
                val_loss += loss.item()
                val_batches += 1
                val_progress.set_postfix(loss=f'{loss.item():.4f}')

        # -----------------------
        # Log Average Losses
        # -----------------------
        # Divide by the batches actually seen; max(..., 1) keeps an empty
        # loader from raising ZeroDivisionError.
        avg_train_loss = train_loss / max(train_batches, 1)
        avg_val_loss = val_loss / max(val_batches, 1)
        history['train_loss'].append(avg_train_loss)
        history['val_loss'].append(avg_val_loss)

        print(f"Epoch [{epoch+1}/{num_epochs}] - Train Loss: {avg_train_loss:.4f} - Val Loss: {avg_val_loss:.4f}")

    return history


Run the model

In [None]:
def main():
    """Train the LSTM classifier on the cleaned email frame and save its weights.

    Reads the notebook-level ``df`` (columns ``'Email Text'`` and ``'label'``),
    holds out a fixed fraction of the rows for validation, trains, and writes
    the state dict to ``'tos_classifier_model.pt'``.

    Raises:
        ValueError: If ``df`` has fewer than two rows, so no train/validation
            split is possible.
    """
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained('Snowflake/snowflake-arctic-embed-s')

    # Hyperparameters
    batch_size = 32
    vocab_size = tokenizer.vocab_size
    embedding_dim = 128
    hidden_size = 64
    num_classes = 1
    num_layers = 2
    dropout = 0.5
    epochs = 100
    learning_rate = 0.001
    val_fraction = 0.2  # share of rows held out for validation
    split_seed = 42     # fixed so the split is the same on every run

    # Prepare datasets.
    # Validation must use rows the model never trains on; building both
    # datasets from the full frame makes the reported validation loss a second
    # measurement of training loss.
    texts = df['Email Text'].tolist()
    labels = df['label'].tolist()
    if len(texts) < 2:
        raise ValueError("Need at least two rows to build a train/validation split")

    split_rng = torch.Generator().manual_seed(split_seed)
    order = torch.randperm(len(texts), generator=split_rng).tolist()
    # keep at least one row on each side of the split
    n_val = min(max(1, int(len(order) * val_fraction)), len(order) - 1)
    val_idx, train_idx = order[:n_val], order[n_val:]

    train_dataset = ScamDataset(
        [texts[i] for i in train_idx], [labels[i] for i in train_idx], tokenizer
    )
    val_dataset = ScamDataset(
        [texts[i] for i in val_idx], [labels[i] for i in val_idx], tokenizer
    )

    # Data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Initialize Model
    model = RNNClassifier(vocab_size, embedding_dim, hidden_size, num_classes, num_layers, dropout)
    model = model.to(device)

    # Loss function (expects raw logits, so the model applies no sigmoid)
    criterion = nn.BCEWithLogitsLoss()

    # Optimizer
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Train the model
    print("Training the model")
    train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=epochs)

    # Save the model
    torch.save(model.state_dict(), 'tos_classifier_model.pt')
    print("\nModel saved to 'tos_classifier_model.pt'")


if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Training the model


Epoch 1/100 [Train]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch 1/100 [Valid]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch [1/100] - Train Loss: 0.3202 - Val Loss: 0.1618


Epoch 2/100 [Train]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch 2/100 [Valid]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch [2/100] - Train Loss: 0.1339 - Val Loss: 0.0822


Epoch 3/100 [Train]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch 3/100 [Valid]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch [3/100] - Train Loss: 0.0762 - Val Loss: 0.0465


Epoch 4/100 [Train]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch 4/100 [Valid]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch [4/100] - Train Loss: 0.0565 - Val Loss: 0.0367


Epoch 5/100 [Train]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch 5/100 [Valid]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch [5/100] - Train Loss: 0.0426 - Val Loss: 0.0298


Epoch 6/100 [Train]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch 6/100 [Valid]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch [6/100] - Train Loss: 0.0373 - Val Loss: 0.0290


Epoch 7/100 [Train]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch 7/100 [Valid]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch [7/100] - Train Loss: 0.0334 - Val Loss: 0.0285


Epoch 8/100 [Train]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch 8/100 [Valid]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch [8/100] - Train Loss: 0.0299 - Val Loss: 0.0250


Epoch 9/100 [Train]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch 9/100 [Valid]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch [9/100] - Train Loss: 0.0365 - Val Loss: 0.0283


Epoch 10/100 [Train]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch 10/100 [Valid]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch [10/100] - Train Loss: 0.0312 - Val Loss: 0.0255


Epoch 11/100 [Train]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch 11/100 [Valid]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch [11/100] - Train Loss: 0.0262 - Val Loss: 0.0740


Epoch 12/100 [Train]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch 12/100 [Valid]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch [12/100] - Train Loss: 0.0325 - Val Loss: 0.0239


Epoch 13/100 [Train]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch 13/100 [Valid]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch [13/100] - Train Loss: 0.0237 - Val Loss: 0.0212


Epoch 14/100 [Train]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch 14/100 [Valid]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch [14/100] - Train Loss: 0.0249 - Val Loss: 0.0228


Epoch 15/100 [Train]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch 15/100 [Valid]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch [15/100] - Train Loss: 0.0249 - Val Loss: 0.0225


Epoch 16/100 [Train]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch 16/100 [Valid]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch [16/100] - Train Loss: 0.0234 - Val Loss: 0.0212


Epoch 17/100 [Train]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch 17/100 [Valid]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch [17/100] - Train Loss: 0.0215 - Val Loss: 0.0205


Epoch 18/100 [Train]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch 18/100 [Valid]:   0%|          | 0/583 [00:00<?, ?it/s]

Epoch [18/100] - Train Loss: 0.0210 - Val Loss: 0.0201


Epoch 19/100 [Train]:   0%|          | 0/583 [00:00<?, ?it/s]

Try it yourself

In [None]:
def predict_message(model, tokenizer, text, device, max_length=512):
    """Score one message and return the sigmoid of the model's output as a float.

    Args:
        model: Module called with the token-id tensor; switched to eval mode here.
        tokenizer: Callable invoked with ``text`` plus padding/truncation
            options; its result must expose ``'input_ids'``.
        text: Message to score.
        device: Device the token ids are moved to before the forward pass.
        max_length: Length the token sequence is padded/truncated to.

    Returns:
        Python float produced by ``torch.sigmoid`` on the squeezed model output.
    """
    tokenizer_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_length,
        'return_tensors': 'pt',
    }

    model.eval()
    # Inference only: no gradients are needed
    with torch.no_grad():
        token_ids = tokenizer(text, **tokenizer_options)['input_ids'].to(device)
        logit = model(token_ids).squeeze()
        return torch.sigmoid(logit).item()

def test_model():
    """Load the saved classifier and score messages typed by the user.

    Rebuilds ``RNNClassifier`` with the same hyperparameters used for training,
    loads ``'tos_classifier_model.pt'``, then loops on ``input()`` until the
    user types ``quit`` or the input stream ends.
    """
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained('Snowflake/snowflake-arctic-embed-s')

    # Model parameters (must match training)
    vocab_size = tokenizer.vocab_size
    embedding_dim = 128
    hidden_size = 64
    num_classes = 1
    num_layers = 2
    dropout = 0.5
    # Training pads/truncates to ScamDataset's default of 100 tokens. Scoring
    # with predict_message's default of 512 would feed the LSTM hundreds of
    # extra padding tokens it never saw during training.
    max_length = 100

    # Initialize model
    model = RNNClassifier(vocab_size, embedding_dim, hidden_size, num_classes, num_layers, dropout)
    # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
    model.load_state_dict(torch.load('tos_classifier_model.pt', map_location=device))
    model = model.to(device)

    while True:
        try:
            user_input = input("\nPaste an scam message (or type 'quit' to exit):\n> ")
        except EOFError:
            # no more input available (e.g. non-interactive run): stop cleanly
            break

        message = user_input.strip()
        if message.lower() == 'quit':
            break
        if not message:
            # nothing to score; ask again instead of classifying pure padding
            continue

        score = predict_message(model, tokenizer, message, device, max_length=max_length)
        if score > 0.5:
            print(f"⚠️ Scam Likely! Confidence: {score:.2%}")
        else:
            print(f"✅ Looks Safe. Confidence: {100 - score * 100:.2f}%")

test_model()