In [1]:
import pandas as pd

df = pd.read_csv('Spam Email raw text for NLP.csv')
df.head()


Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",00249.5f45607c1bffe89f60ba1ec9f878039a
1,1,ATTENTION: This is a MUST for ALL Computer Use...,00373.ebe8670ac56b04125c25100a36ab0510
2,1,This is a multi-part message in MIME format.\n...,00214.1367039e50dc6b7adb0f2aa8aba83216
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,00210.050ffd105bd4e006771ee63cabc59978
4,1,This is the bottom line. If you can GIVE AWAY...,00033.9babb58d9298daa2963d4f514193d7d6


In [2]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to C:\Users\Adhik
[nltk_data]     Puthenkattil\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to C:\Users\Adhik
[nltk_data]     Puthenkattil\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
tokenizer = nltk.RegexpTokenizer(r"\w+")
test_message = "Hey,, GGggGG feet it going? <HTML><bads> bads 'randoms' badly"
test_message_tokenized = tokenizer.tokenize(test_message)
test_message_tokenized

['Hey',
 'GGggGG',
 'feet',
 'it',
 'going',
 'HTML',
 'bads',
 'bads',
 'randoms',
 'badly']

In [4]:
test_message_lowercased = [t.lower() for t in test_message_tokenized]
test_message_lowercased

['hey',
 'gggggg',
 'feet',
 'it',
 'going',
 'html',
 'bads',
 'bads',
 'randoms',
 'badly']

In [8]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
test_message_lemmatized_tokens = [lemmatizer.lemmatize(t) for t in test_message_lowercased]
test_message_lemmatized_tokens

['hey',
 'gggggg',
 'foot',
 'it',
 'going',
 'html',
 'bad',
 'bad',
 'randoms',
 'badly']

In [9]:
from nltk.corpus import stopwords

stopwords = stopwords.words('english')
test_message_useful_tokens = [t for t in test_message_lemmatized_tokens if t not in stopwords]
test_message_useful_tokens

['hey', 'gggggg', 'foot', 'going', 'html', 'bad', 'bad', 'randoms', 'badly']

In [19]:
def message_to_token_list(s):
    tokens = tokenizer.tokenize(s)
    lowercased_tokens = [t.lower() for t in tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(t) for t in lowercased_tokens]
    useful_tokens = [t for t  in lemmatized_tokens if t not in stopwords]
    return useful_tokens

In [13]:
df = df.sample(frac=1, random_state=1)
df = df.reset_index(drop=True) 

split_index = int(len(df) * 0.8)
train_df, test_df = df[:split_index], df[split_index:]
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
train_df, test_df

(      CATEGORY                                            MESSAGE  \
 0            1  IMPORTANT INFORMATION:\n\n\n\nThe new domain n...   
 1            1  ------=_NextPart_000_00C6_24D75C3A.C0588C07\n\...   
 2            0  Any plans for rolling Nessus RPMs for RH8?  Mi...   
 3            0  \n\n\n\nHijacker High (8/30) \n\nDalal Mughrab...   
 4            0  On Mon, Nov 25, 2002 at 06:54:49PM +0000, Phil...   
 ...        ...                                                ...   
 4631         1  Get your favorite Poker action at http://www.m...   
 4632         0  \n\n----- Original Message -----\n\nFrom: "Kra...   
 4633         0  use Perl Daily Newsletter\n\n\n\nIn this issue...   
 4634         0  \n\n--]> A Green once said that if the Spotted...   
 4635         1  <html>\n\n<head>\n\n<title>FREESTORES - MAKE M...   
 
                                    FILE_NAME  
 0     00416.7fa9ccac275fe2d97517554ecde57fbe  
 1     00370.549e569ab1b84fb13a4ea7d61f98f86d  
 2     01280.0

In [20]:
token_counter = {}

for message in train_df['MESSAGE']:
    message_as_token_list = message_to_token_list(message)
    for token in message_as_token_list:
        if token in token_counter:
            token_counter[token] += 1
        else:
            token_counter[token] = 1
len(token_counter)
    

85600

In [21]:
token_counter

{'important': 309,
 'information': 2096,
 'new': 2735,
 'domain': 274,
 'name': 3612,
 'finally': 171,
 'available': 731,
 'general': 362,
 'public': 573,
 'discount': 78,
 'price': 709,
 'register': 156,
 'one': 3405,
 'exciting': 61,
 'biz': 130,
 'info': 618,
 'well': 997,
 'original': 461,
 'com': 12483,
 'net': 5435,
 '14': 512,
 '95': 433,
 'brand': 106,
 'extension': 39,
 'recently': 178,
 'approved': 103,
 'icann': 11,
 'right': 2348,
 'biggest': 72,
 'benefit': 299,
 'course': 447,
 'currently': 291,
 'e': 2822,
 'much': 1045,
 'easier': 130,
 'attractive': 29,
 'easy': 601,
 'remember': 318,
 'visit': 411,
 'http': 15773,
 'www': 8003,
 'affordable': 48,
 'today': 854,
 'registration': 50,
 'fee': 183,
 'include': 429,
 'full': 434,
 'access': 533,
 'use': 2265,
 'control': 327,
 'panel': 75,
 'manage': 90,
 'future': 565,
 'sincerely': 98,
 'administrator': 48,
 'remove': 1035,
 'email': 3835,
 'address': 1751,
 'promotional': 16,
 'mailing': 2147,
 'company': 1441,
 'click'

In [24]:
def keep_token(processed_token, threshold):
    if processed_token not in token_counter:
        return False
    else:
        return token_counter[processed_token] > threshold
keep_token('random', 10)

True

In [38]:
features = set()
for token in token_counter:
    if keep_token(token, 7000):
        features.add(token)
features

{'0',
 '1',
 '2',
 '3d',
 'b',
 'br',
 'color',
 'com',
 'face',
 'font',
 'http',
 'list',
 'nbsp',
 'p',
 'size',
 'td',
 'tr',
 'width',
 'www'}

In [39]:
features = list(features)
features

['nbsp',
 'com',
 'http',
 'face',
 'width',
 'www',
 '0',
 'tr',
 '2',
 'p',
 'list',
 'size',
 'color',
 'b',
 'br',
 '3d',
 'td',
 'font',
 '1']

In [40]:
token_to_index_mapping = {t:i for t, i in  zip(features, range(len(features)))}
token_to_index_mapping

{'nbsp': 0,
 'com': 1,
 'http': 2,
 'face': 3,
 'width': 4,
 'www': 5,
 '0': 6,
 'tr': 7,
 '2': 8,
 'p': 9,
 'list': 10,
 'size': 11,
 'color': 12,
 'b': 13,
 'br': 14,
 '3d': 15,
 'td': 16,
 'font': 17,
 '1': 18}

In [41]:
import numpy as np

def message_to_count_vector(message):
    count_vector = np.zeros(len(features))
    
    processed_list_of_tokens = message_to_token_list(message)
    for token in processed_list_of_tokens:
        if token not in features:
            continue
        index = token_to_index_mapping[token]
        count_vector[index] += 1
    return count_vector

In [42]:
def df_to_X_y(dff):
    y = dff['CATEGORY'].to_numpy().astype(int)
    
    message_col = dff['MESSAGE']
    count_vectors = []
    for message in message_col:
        count_vector = message_to_count_vector(message)
        count_vectors.append(count_vector)
    X = np.array(count_vectors).astype(int)
    
    return X, y
        

In [43]:
X_train, y_train = df_to_X_y(train_df)
X_test, y_test = df_to_X_y(test_df)


In [44]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
X_train


array([[0.        , 0.02369668, 0.00580271, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00363636],
       [0.        , 0.        , 0.00193424, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.00773694, ..., 0.        , 0.        ,
        0.00727273],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.07109005, 0.05415861, ..., 0.20875421, 0.02028273,
        0.05090909]])

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression().fit(X_train, y_train)
print(classification_report(y_test, lr.predict(X_test)))

              precision    recall  f1-score   support

           0       0.78      1.00      0.88       798
           1       1.00      0.37      0.54       362

    accuracy                           0.80      1160
   macro avg       0.89      0.69      0.71      1160
weighted avg       0.85      0.80      0.77      1160



In [46]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier().fit(X_train, y_train)
print(classification_report(y_test, rf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.89      0.96      0.92       798
           1       0.88      0.74      0.81       362

    accuracy                           0.89      1160
   macro avg       0.89      0.85      0.86      1160
weighted avg       0.89      0.89      0.89      1160

