In [None]:
import pandas as pd
import numpy as np
import torch

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cpu


# Load a dataset

In [None]:
# Load the raw spam dataset. The explicit ISO-8859-1 encoding is presumably
# needed because the file is not valid UTF-8 — TODO confirm.
df = pd.read_csv("spam.csv", encoding="ISO-8859-1")

In [None]:
# Peek at the first rows to see the raw column layout.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# Give the two useful columns readable names, then keep only those two
# (every other column is dropped by the selection).
df = df.rename(columns={"v1":"target", "v2":"email"})
df = df[["email", "target"]]

In [None]:
# Confirm that only the renamed email/target columns remain.
df.head()

Unnamed: 0,email,target
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


# Text Cleaning Step

In [None]:
import re
def cleaning_text(str_):
    """Normalise raw message text for whitespace tokenisation.

    The text is lower-cased and every run of characters other than ``a-z``
    and ``0-9`` is collapsed into a single space; leading and trailing
    spaces are removed.

    Args:
        str_: Raw text of one message.

    Returns:
        The cleaned text, with words separated by exactly one space.
    """
    str_ = str_.lower()
    # Replace whole runs (not single characters) so that "point, crazy.."
    # does not leave consecutive spaces behind; those would become
    # empty-string tokens once the text is split on " ".
    str_ = re.sub(r"[^a-z0-9]+", " ", str_)
    return str_.strip()

In [None]:
# Run every raw email through cleaning_text and store the result next to it.
df["clean_email"] = df["email"].apply(cleaning_text)

In [None]:
# Compare the raw and cleaned text side by side.
df.head()

Unnamed: 0,email,target,clean_email
0,"Go until jurong point, crazy.. Available only ...",ham,go until jurong point crazy available only ...
1,Ok lar... Joking wif u oni...,ham,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam,free entry in 2 a wkly comp to win fa cup fina...
3,U dun say so early hor... U c already then say...,ham,u dun say so early hor u c already then say
4,"Nah I don't think he goes to usf, he lives aro...",ham,nah i don t think he goes to usf he lives aro...


# Tokenisation

In [None]:
# Plain Python list of the cleaned emails, one string per row.
sentences = list(df["clean_email"])

In [None]:
# Split each cleaned email on single spaces: one token list per email.
tokens = [sentence.split(" ") for sentence in sentences]

# Word2Vec on the text

In [None]:
from gensim.models import Word2Vec
# Train 100-dimensional word embeddings on the tokenised emails.
# min_count=1 keeps every token in the vocabulary, so any word seen during
# training can be looked up afterwards.
# seed + workers=1 pin the training run so the embeddings (and every metric
# derived from them) are repeatable; gensim additionally documents setting
# PYTHONHASHSEED for reproducibility across interpreter launches.
model = Word2Vec(tokens, min_count=1, vector_size=100, seed=42, workers=1)

In [None]:
# All tokens the trained Word2Vec model holds a vector for.
vocab = model.wv.index_to_key

# Create a sentence vector for every email (mean of its word vectors)

In [None]:
# Represent each email by the element-wise mean of its word vectors.
sent_vec = []
for sent in sentences:
    word_vectors = np.array([model.wv[word] for word in sent.split(" ")])
    sent_vec.append(word_vectors.mean(axis=0))

In [None]:
# Store each email's averaged embedding in a new column.
df["vectors"] = sent_vec

In [None]:
# Check that the new "vectors" column was added.
df.head()

Unnamed: 0,email,target,clean_email,vectors
0,"Go until jurong point, crazy.. Available only ...",ham,go until jurong point crazy available only ...,"[-0.121883824, 0.27640355, -0.022398645, 0.259..."
1,Ok lar... Joking wif u oni...,ham,ok lar joking wif u oni,"[-0.12388701, 0.3092525, 0.020951023, 0.400930..."
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam,free entry in 2 a wkly comp to win fa cup fina...,"[-0.116499566, 0.2428406, -0.09013266, 0.04490..."
3,U dun say so early hor... U c already then say...,ham,u dun say so early hor u c already then say,"[-0.14996241, 0.34424996, -0.035963364, 0.2941..."
4,"Nah I don't think he goes to usf, he lives aro...",ham,nah i don t think he goes to usf he lives aro...,"[-0.14659037, 0.28100678, -0.07824696, 0.05555..."


# Encode the target column

In [None]:
# Replace the string labels with integer category codes; in the rows shown
# below, "ham" becomes 0 and "spam" becomes 1.
df["target"] = df['target'].astype('category').cat.codes

In [None]:
# Confirm the target column now holds integer codes.
df.head()

Unnamed: 0,email,target,clean_email,vectors
0,"Go until jurong point, crazy.. Available only ...",0,go until jurong point crazy available only ...,"[-0.121883824, 0.27640355, -0.022398645, 0.259..."
1,Ok lar... Joking wif u oni...,0,ok lar joking wif u oni,"[-0.12388701, 0.3092525, 0.020951023, 0.400930..."
2,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry in 2 a wkly comp to win fa cup fina...,"[-0.116499566, 0.2428406, -0.09013266, 0.04490..."
3,U dun say so early hor... U c already then say...,0,u dun say so early hor u c already then say,"[-0.14996241, 0.34424996, -0.035963364, 0.2941..."
4,"Nah I don't think he goes to usf, he lives aro...",0,nah i don t think he goes to usf he lives aro...,"[-0.14659037, 0.28100678, -0.07824696, 0.05555..."


# Random Forest Classifier ML model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Feature matrix: one averaged embedding per email. Labels: the encoded target.
x = np.asarray(sent_vec)
y = df["target"]

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Fit a random forest on the averaged embeddings and predict the held-out set.
# random_state is pinned (same seed as the train/test split) so the forest,
# and therefore the reported accuracy, is repeatable between runs.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(x_train, y_train)
y_pred = rf_model.predict(x_test)

In [None]:
# scikit-learn's signature is (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but the documented order avoids confusion when other
# metrics are added.
accuracy_score(y_test, y_pred)

0.9545454545454546

In [None]:
# Hand-written sample message used to try out the trained random forest.
random_mail = "free free free, hey apply for this credit card and earn 1 million !"

In [None]:
# Clean the sample email with the same function used on the training data,
# then display the result.
random_clean_email = cleaning_text(random_mail)
random_clean_email

'free free free  hey apply for this credit card and earn 1 million  '

In [None]:
# Embed the new email the same way as the training data: the mean of its word
# vectors. Words the Word2Vec model never saw have no vector (looking them up
# raises KeyError), so they are skipped instead of crashing the cell.
random_temp_vec = []
for word in random_clean_email.split(" "):
    if word in model.wv.key_to_index:
        random_temp_vec.append(model.wv[word])

if random_temp_vec:
    arr_temp = np.array(random_temp_vec).mean(axis=0)
else:
    # No known word at all: fall back to a zero vector of the embedding size
    # so the classifier still receives a correctly shaped input.
    arr_temp = np.zeros(model.wv.vector_size, dtype=np.float32)

In [None]:
# predict expects a 2-D (n_samples, n_features) input, so present the single
# vector as one row.
rf_model.predict(arr_temp.reshape(1, -1)) # out == 1, it's spam mail

array([1], dtype=int8)

# NN based Classifier

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

### Create a custom Dataset class

In [None]:
# custom Dataset Class
class SpamDataset(Dataset):
    """Map-style dataset pairing each feature row with its label.

    Args:
        feature: Array-like of per-sample features; converted with
            ``torch.tensor``.
        label: Array-like of per-sample labels; converted with
            ``torch.tensor``.

    Raises:
        ValueError: If ``feature`` and ``label`` differ in their first
            dimension (number of samples).
    """

    def __init__(self, feature, label):
        self.feature = torch.tensor(feature)
        self.label = torch.tensor(label)

        # Fail fast on misaligned inputs; otherwise the mismatch only shows up
        # as an IndexError mid-iteration or as silently ignored labels.
        if self.feature.shape[:1] != self.label.shape[:1]:
            raise ValueError(
                "feature and label must contain the same number of samples, "
                f"got feature shape {tuple(self.feature.shape)} and "
                f"label shape {tuple(self.label.shape)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.feature)

    def __getitem__(self, index):
        """Return the ``(feature, label)`` pair stored at ``index``."""
        return self.feature[index], self.label[index]

In [None]:
# Wrap the existing train/test split in SpamDataset objects; the label Series
# are converted to NumPy arrays before being handed over.
train_dataset = SpamDataset(feature=x_train, label=np.array(y_train))
test_dataset = SpamDataset(feature=x_test, label=np.array(y_test))

In [None]:
# Shuffle only the training batches. Evaluation does not benefit from
# shuffling, and a fixed order keeps per-batch test figures repeatable.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

### create NN model

In [None]:
class SpamClassNNModel(nn.Module):
    """Feed-forward binary classifier that returns one raw logit per sample.

    Args:
        input_dim: Size of each input feature vector. Defaults to 100, which
            matches the ``vector_size`` used for the Word2Vec embeddings in
            this notebook.
    """

    def __init__(self, input_dim=100):
        super(SpamClassNNModel, self).__init__()

        self.fc1 = nn.Linear(input_dim, 200)
        self.fc2 = nn.Linear(200, 100)
        self.fc3 = nn.Linear(100, 50)
        self.fc4 = nn.Linear(50, 10)
        self.out = nn.Linear(10, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, 1)`` logits."""
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.relu(self.fc4(x))
        # No sigmoid here: BCEWithLogitsLoss applies it internally, so the
        # model must return raw logits.
        out = self.out(x)

        return out

In [None]:
# Build the network, move its parameters to the selected device, and print
# the layer summary.
nn_model = SpamClassNNModel()
nn_model = nn_model.to(device)
print(nn_model)

SpamClassNNModel(
  (fc1): Linear(in_features=100, out_features=200, bias=True)
  (fc2): Linear(in_features=200, out_features=100, bias=True)
  (fc3): Linear(in_features=100, out_features=50, bias=True)
  (fc4): Linear(in_features=50, out_features=10, bias=True)
  (out): Linear(in_features=10, out_features=1, bias=True)
  (relu): ReLU()
)


In [None]:
# Binary classification with a single output logit -> BCEWithLogitsLoss
# (binary cross-entropy computed directly on logits).
# CrossEntropyLoss is the multi-class alternative and would need one output
# per class, e.g. self.out = nn.Linear(10, 2).
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=nn_model.parameters(), lr=1e-3)

### train nn model

In [None]:
# Single source of truth for the epoch count, used by both the loop and the
# progress message so the two cannot drift apart.
num_epochs = 50

nn_model.train()

for epoc in range(num_epochs):
    # Sum of the per-batch losses over this epoch (not averaged).
    total_loss = 0

    # `features` rather than `input`: the latter would overwrite the builtin
    # input() for the rest of the notebook session.
    for features, label in train_loader:
        features = features.to(device)
        # BCEWithLogitsLoss needs float targets shaped like the model output,
        # hence the cast and the added trailing dimension.
        label = label.to(device).float().unsqueeze(1)

        optimizer.zero_grad()
        output = nn_model(features)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"epoc: {epoc+1}/{num_epochs}, total loss is: {total_loss}")

epoc: 1/20, total loss is: 49.65978699922562
epoc: 2/20, total loss is: 40.87595810741186
epoc: 3/20, total loss is: 29.006435222923756
epoc: 4/20, total loss is: 26.739424616098404
epoc: 5/20, total loss is: 26.11759103089571
epoc: 6/20, total loss is: 26.18472457677126
epoc: 7/20, total loss is: 25.51086250692606
epoc: 8/20, total loss is: 25.72706627100706
epoc: 9/20, total loss is: 25.650180645287037
epoc: 10/20, total loss is: 24.524372667074203
epoc: 11/20, total loss is: 25.360825143754482
epoc: 12/20, total loss is: 25.419970996677876
epoc: 13/20, total loss is: 24.45296859368682
epoc: 14/20, total loss is: 24.27509067580104
epoc: 15/20, total loss is: 24.00510096922517
epoc: 16/20, total loss is: 24.1177931278944
epoc: 17/20, total loss is: 24.451640129089355
epoc: 18/20, total loss is: 23.039819102734327
epoc: 19/20, total loss is: 22.92173257470131
epoc: 20/20, total loss is: 24.442713923752308
epoc: 21/20, total loss is: 24.277366876602173
epoc: 22/20, total loss is: 22.949

### test nn model

In [None]:
nn_model.eval()
avg_acc = []  # per-batch accuracies (%), kept for inspection
correct = 0
total = 0

with torch.no_grad():

    for inp, lab in test_loader:
        inp = inp.to(device)
        lab = lab.to(device)

        test_output = nn_model(inp)
        # The model emits ONE logit per sample, so argmax over dim=1 would
        # always return 0 (i.e. predict class 0 for everything). Threshold the
        # logit instead: logit > 0 is equivalent to sigmoid(logit) > 0.5.
        pred_lab = (test_output.squeeze(1) > 0).long()

        batch_correct = (pred_lab == lab).sum().item()
        correct += batch_correct
        total += lab.size(0)

        acc = batch_correct/lab.size(0)*100
        avg_acc.append(acc)

# Accuracy over all test samples. Averaging per-batch accuracies would
# over-weight a smaller final batch.
accuracy = correct/total*100 if total else float("nan")
print(f"model accuracy is: {accuracy:0.2f}%")

model accuracy is: 86.73%
