In [77]:
# Load the dataset (https://www.kaggle.com/chandramoulinaidu/spam-classification-for-basic-nlp)

import pandas as pd

df = pd.read_csv('Spam Email raw text for NLP.csv')

df.head()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",00249.5f45607c1bffe89f60ba1ec9f878039a
1,1,ATTENTION: This is a MUST for ALL Computer Use...,00373.ebe8670ac56b04125c25100a36ab0510
2,1,This is a multi-part message in MIME format.\n...,00214.1367039e50dc6b7adb0f2aa8aba83216
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,00210.050ffd105bd4e006771ee63cabc59978
4,1,This is the bottom line. If you can GIVE AWAY...,00033.9babb58d9298daa2963d4f514193d7d6


In [78]:
df.tail()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
5791,0,"I'm one of the 30,000 but it's not working ver...",00609.dd49926ce94a1ea328cce9b62825bc97
5792,0,Damien Morton quoted:\n\n>W3C approves HTML 4 ...,00957.e0b56b117f3ec5f85e432a9d2a47801f
5793,0,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\...",01127.841233b48eceb74a825417d8d918abf8
5794,0,"Once upon a time, Manfred wrote :\n\n\n\n> I w...",01178.5c977dff972cd6eef64d4173b90307f0
5795,0,"If you run Pick, and then use the ""New FTOC"" b...",00747.352d424267d36975a7b40b85ffd0885e


In [79]:
df['CATEGORY'].value_counts()

CATEGORY
0    3900
1    1896
Name: count, dtype: int64

In [80]:
import nltk

nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fuadn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fuadn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [81]:
tokenizer = nltk.RegexpTokenizer(r"\w+")
test_message = "Hey,, GGggGG feet it going? <HTML><bads> bads 'randoms' badly"

test_message_tokenized = tokenizer.tokenize(test_message)
test_message_tokenized

['Hey',
 'GGggGG',
 'feet',
 'it',
 'going',
 'HTML',
 'bads',
 'bads',
 'randoms',
 'badly']

In [82]:
test_message_lowercased = [t.lower() for t in test_message_tokenized]
test_message_lowercased

['hey',
 'gggggg',
 'feet',
 'it',
 'going',
 'html',
 'bads',
 'bads',
 'randoms',
 'badly']

In [83]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

test_message_lemmatized_tokens = [lemmatizer.lemmatize(t) for t in test_message_lowercased]
test_message_lemmatized_tokens

['hey',
 'gggggg',
 'foot',
 'it',
 'going',
 'html',
 'bad',
 'bad',
 'randoms',
 'badly']

In [84]:
from nltk.corpus import stopwords

stopwords = stopwords.words('english')

test_message_useful_tokens = [t for t in test_message_lemmatized_tokens if t not in stopwords]
test_message_useful_tokens

['hey', 'gggggg', 'foot', 'going', 'html', 'bad', 'bad', 'randoms', 'badly']

In [85]:
def message_to_token_list(s):
  tokens = tokenizer.tokenize(s)
  lowercased_tokens = [t.lower() for t in tokens]
  lemmatized_tokens = [lemmatizer.lemmatize(t) for t in lowercased_tokens]
  useful_tokens = [t for t in lemmatized_tokens if t not in stopwords]

  return useful_tokens

message_to_token_list(test_message)

['hey', 'gggggg', 'foot', 'going', 'html', 'bad', 'bad', 'randoms', 'badly']

In [86]:
df = df.sample(frac=1, random_state=1)
df = df.reset_index(drop=True)

split_index = int(len(df) * 0.8)
train_df, test_df = df[:split_index], df[split_index:]

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

train_df, test_df

(      CATEGORY                                            MESSAGE  \
 0            1  \n\n<HTML><FONT  BACK="#ffffff" style="BACKGRO...   
 1            1  <html><body bgColor="#CCCCCC" topmargin=1 onMo...   
 2            0  Quoting Paul Linehan (plinehan@yahoo.com):\n\n...   
 3            0  <a href=http://www.aaronsw.com/weblog/>\n\nAar...   
 4            0  Oh yeah, the link for more info:\n\n\n\nhttp:/...   
 ...        ...                                                ...   
 4631         0  Gregory Alan Bolcer:\n\n>I'm not sure since I ...   
 4632         1  New Account For: zzzz@spamassassin.taint.org\n...   
 4633         0  >>>>> "O" == Owen Byrne <owen@permafrost.net> ...   
 4634         0  This is an automated response to a message you...   
 4635         0  http://www.ouchytheclown.com/welcome.html\n\n\...   
 
                                    FILE_NAME  
 0     00118.141d803810acd9d4fc23db103dddfcd9  
 1     00463.0bc4e08af0529dd773d9f10f922547db  
 2     00358.8

In [87]:
token_counter = {}

for message in train_df['MESSAGE']:
  message_as_token_lst = message_to_token_list(message)

  for token in message_as_token_lst:
    if token in token_counter:
      token_counter[token] += 1
    else:
      token_counter[token] = 1

len(token_counter)

86439

In [88]:
token_counter

{'html': 4175,
 'font': 35005,
 'back': 1055,
 'ffffff': 2535,
 'style': 3349,
 'background': 789,
 'color': 9642,
 'size': 13107,
 '3': 3581,
 'ptsize': 450,
 '12': 985,
 'b': 12856,
 'viagra': 66,
 '000000': 1923,
 '2': 7993,
 '10': 2182,
 'family': 1491,
 'sansserif': 314,
 'face': 9950,
 'arial': 6187,
 'lang': 419,
 '0': 9445,
 'br': 16013,
 'breakthrough': 22,
 'medication': 50,
 'impotence': 13,
 'delivered': 79,
 'mailbox': 71,
 'without': 658,
 'leaving': 50,
 'computer': 640,
 'simply': 377,
 'click': 2144,
 'href': 3875,
 'http': 14926,
 'host': 158,
 '1bulk': 12,
 'email': 4015,
 'software': 1129,
 'com': 11675,
 'ch4': 12,
 'pharm': 12,
 'blue': 181,
 'less': 473,
 '5': 2932,
 'minute': 366,
 'complete': 403,
 'line': 1307,
 'consultation': 68,
 'many': 1004,
 'case': 681,
 '24': 575,
 'nbsp': 9732,
 'hour': 589,
 'gt': 108,
 'website': 488,
 'treatment': 33,
 'compromised': 12,
 'sexual': 120,
 'function': 202,
 'convenient': 36,
 'affordable': 55,
 'confidential': 135,
 

In [89]:
def keep_token(proccessed_token, threshold):
  if proccessed_token not in token_counter:
    return False
  else:
    return token_counter[proccessed_token] > threshold

keep_token('random', 100)

False

In [90]:
features = set()

for token in token_counter:
  if keep_token(token, 1000):
    features.add(token)

features

{'0',
 '00',
 '000',
 '000000',
 '1',
 '10',
 '100',
 '15',
 '2',
 '20',
 '2002',
 '22',
 '2e',
 '3',
 '30',
 '3c',
 '3d',
 '3d2',
 '3e',
 '4',
 '5',
 '50',
 '6',
 '7',
 '8',
 '_______________________________________________',
 'address',
 'align',
 'also',
 'arial',
 'b',
 'back',
 'bgcolor',
 'blockquote',
 'body',
 'border',
 'br',
 'business',
 'c',
 'cellpadding',
 'cellspacing',
 'center',
 'change',
 'click',
 'color',
 'colspan',
 'com',
 'company',
 'content',
 'could',
 'day',
 'div',
 'doe',
 'e',
 'email',
 'even',
 'exmh',
 'face',
 'family',
 'ff0000',
 'ffffff',
 'file',
 'first',
 'font',
 'form',
 'free',
 'get',
 'gif',
 'go',
 'group',
 'ha',
 'head',
 'height',
 'helvetica',
 'home',
 'href',
 'html',
 'http',
 'ie',
 'ilug',
 'image',
 'img',
 'information',
 'input',
 'internet',
 'know',
 'left',
 'li',
 'like',
 'line',
 'link',
 'linux',
 'list',
 'listinfo',
 'mail',
 'mailing',
 'mailman',
 'mailto',
 'make',
 'many',
 'margin',
 'may',
 'message',
 'money',


In [91]:
features = list(features)
features

['nbsp',
 'mailto',
 'td',
 'content',
 'org',
 'way',
 'face',
 '2002',
 '2e',
 'site',
 'cellpadding',
 'src',
 'would',
 'internet',
 'wrote',
 'html',
 'ilug',
 'text',
 'state',
 'system',
 '_______________________________________________',
 'value',
 'body',
 'message',
 'list',
 'back',
 'form',
 'c',
 '5',
 'email',
 '3d',
 '000000',
 'net',
 'home',
 'day',
 'href',
 'option',
 'service',
 'ffffff',
 '7',
 'ha',
 'input',
 '10',
 'using',
 'arial',
 '20',
 'height',
 'colspan',
 'need',
 'think',
 'link',
 'rpm',
 '15',
 'make',
 '30',
 'top',
 'phone',
 'company',
 'click',
 'ff0000',
 'e',
 'free',
 '8',
 'u',
 'please',
 'com',
 'world',
 'know',
 '22',
 'could',
 'span',
 'money',
 'style',
 'also',
 'group',
 '2',
 'want',
 'li',
 'may',
 'send',
 'get',
 'image',
 '3c',
 'user',
 'p',
 'file',
 'server',
 'people',
 '3e',
 '3',
 '000',
 'product',
 'size',
 'br',
 'gif',
 'receive',
 'software',
 'work',
 'exmh',
 'offer',
 'align',
 'much',
 '6',
 'r',
 '00',
 'change',

In [92]:
token_to_index_mapping = {t:i for t, i in zip(features, range(len(features)))}
token_to_index_mapping

{'nbsp': 0,
 'mailto': 1,
 'td': 2,
 'content': 3,
 'org': 4,
 'way': 5,
 'face': 6,
 '2002': 7,
 '2e': 8,
 'site': 9,
 'cellpadding': 10,
 'src': 11,
 'would': 12,
 'internet': 13,
 'wrote': 14,
 'html': 15,
 'ilug': 16,
 'text': 17,
 'state': 18,
 'system': 19,
 '_______________________________________________': 20,
 'value': 21,
 'body': 22,
 'message': 23,
 'list': 24,
 'back': 25,
 'form': 26,
 'c': 27,
 '5': 28,
 'email': 29,
 '3d': 30,
 '000000': 31,
 'net': 32,
 'home': 33,
 'day': 34,
 'href': 35,
 'option': 36,
 'service': 37,
 'ffffff': 38,
 '7': 39,
 'ha': 40,
 'input': 41,
 '10': 42,
 'using': 43,
 'arial': 44,
 '20': 45,
 'height': 46,
 'colspan': 47,
 'need': 48,
 'think': 49,
 'link': 50,
 'rpm': 51,
 '15': 52,
 'make': 53,
 '30': 54,
 'top': 55,
 'phone': 56,
 'company': 57,
 'click': 58,
 'ff0000': 59,
 'e': 60,
 'free': 61,
 '8': 62,
 'u': 63,
 'please': 64,
 'com': 65,
 'world': 66,
 'know': 67,
 '22': 68,
 'could': 69,
 'span': 70,
 'money': 71,
 'style': 72,
 'als

In [93]:
message_to_token_list('3d b <br> .com bad font font com randoms')

['3d', 'b', 'br', 'com', 'bad', 'font', 'font', 'com', 'randoms']

In [94]:
# "Bag of Words" (counts vector)

# ->  http  tr  size  3d  font  br  com  td   p   b
# ->    0    1    2    3   4    5    6    7   8   9
# ->   [0,   0,   0,   1,  2,   1,   2,   0,  0,  1]

[0.,  0.,  0.,   1., 2.,  1., 2.,  0., 0., 1.]

[0.0, 0.0, 0.0, 1.0, 2.0, 1.0, 2.0, 0.0, 0.0, 1.0]

In [95]:
import numpy as np

def message_to_count_vector(message):
  count_vector = np.zeros(len(features))

  processed_list_of_tokens = message_to_token_list(message)

  for token in processed_list_of_tokens:
    if token not in features:
      continue
    index = token_to_index_mapping[token]
    count_vector[index] += 1

  return count_vector

message_to_count_vector('3d b <br> .com bad font font com randoms')

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.])

In [96]:
message_to_count_vector(train_df['MESSAGE'].iloc[3])

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 2., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.])

In [97]:
train_df.iloc[3]

CATEGORY                                                     0
MESSAGE      <a href=http://www.aaronsw.com/weblog/>\n\nAar...
FILE_NAME               01274.0d083a2d3b30061efdc2cc73ee9e76e3
Name: 3, dtype: object

In [98]:
def df_to_X_y(dff):
  y = dff['CATEGORY'].to_numpy().astype(int)

  message_col = dff['MESSAGE']
  count_vectors = []

  for message in message_col:
    count_vector = message_to_count_vector(message)
    count_vectors.append(count_vector)

  X = np.array(count_vectors).astype(int)

  return X, y

In [99]:
X_train, y_train = df_to_X_y(train_df)

X_test, y_test = df_to_X_y(test_df)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((4636, 172), (4636,), (1160, 172), (1160,))

In [100]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

X_train

array([[0.00176367, 0.05454545, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.01346801, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.02040816,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.02040816,
        0.        ]])

In [108]:
# **TF-IDF Feature Extraction**

from sklearn.feature_extraction.text import TfidfVectorizer

def custom_tokenizer(message):
    return message_to_token_list(message)

tfidf_vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer)

X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['MESSAGE'])

X_test_tfidf = tfidf_vectorizer.transform(test_df['MESSAGE'])

y_train_tfidf = train_df['CATEGORY'].to_numpy().astype(int)
y_test_tfidf = test_df['CATEGORY'].to_numpy().astype(int)


feature_names = tfidf_vectorizer.get_feature_names_out()

X_train_tfidf_dense = X_train_tfidf.toarray()

# View the dense matrix (first 5 rows, for example)
print(X_train_tfidf_dense[:1])



[[0.01516318 0.         0.         ... 0.         0.         0.        ]]


#### Bag of Words

In [102]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression().fit(X_train, y_train)
print(classification_report(y_test, lr.predict(X_test)))


              precision    recall  f1-score   support

           0       0.88      0.99      0.93       788
           1       0.98      0.72      0.83       372

    accuracy                           0.91      1160
   macro avg       0.93      0.86      0.88      1160
weighted avg       0.91      0.91      0.90      1160



In [103]:
# Compare logistic regression to random forest

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier().fit(X_train, y_train)
print(classification_report(y_test, rf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       788
           1       0.99      0.95      0.97       372

    accuracy                           0.98      1160
   macro avg       0.98      0.97      0.98      1160
weighted avg       0.98      0.98      0.98      1160



#### TF_IDF EXTRACTION

In [104]:
# **Logistic Regression Training and Evaluation**

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Training Logistic Regression
print("Training Logistic Regression...")
lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf, y_train_tfidf)

# Predictions and evaluation
lr_predictions = lr_model.predict(X_test_tfidf)
print("Logistic Regression Classification Report:\n")
print(classification_report(y_test_tfidf, lr_predictions))


Training Logistic Regression...
Logistic Regression Classification Report:

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       788
           1       0.99      0.95      0.97       372

    accuracy                           0.98      1160
   macro avg       0.98      0.97      0.98      1160
weighted avg       0.98      0.98      0.98      1160



In [105]:
# **Random Forest Training and Evaluation**

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Training Random Forest
print("Training Random Forest...")
rf_model = RandomForestClassifier()
rf_model.fit(X_train_tfidf, y_train_tfidf)

# Predictions and evaluation
rf_predictions = rf_model.predict(X_test_tfidf)
print("Random Forest Classification Report:\n")
print(classification_report(y_test_tfidf, rf_predictions))


Training Random Forest...
Random Forest Classification Report:

              precision    recall  f1-score   support

           0       0.98      0.99      0.98       788
           1       0.99      0.95      0.97       372

    accuracy                           0.98      1160
   macro avg       0.98      0.97      0.98      1160
weighted avg       0.98      0.98      0.98      1160

