Initial Setup:

In [21]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from matplotlib.colors import ListedColormap
from sklearn.metrics import precision_score, recall_score, plot_confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn import metrics

Read Data:

In [22]:
# First dataset (spam.csv): read the raw CSV with latin-1 decoding, print the
# column summary and preview the first rows.
data_path1 = "../Data/raw/spam.csv"
df_1 = pd.read_csv(
    filepath_or_buffer=data_path1,
    encoding="latin-1",
)

df_1.info()
df_1.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [23]:
# Second dataset (enron_spam_data.csv): read the raw CSV with latin-1 decoding,
# print the column summary and preview the first rows.
data_path2 = "../Data/raw/enron_spam_data.csv"
df_2 = pd.read_csv(
    filepath_or_buffer=data_path2,
    encoding="latin-1",
)

df_2.info()
df_2.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33716 entries, 0 to 33715
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Message ID  33716 non-null  int64 
 1   Subject     33427 non-null  object
 2   Message     33345 non-null  object
 3   Spam/Ham    33716 non-null  object
 4   Date        33716 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.3+ MB


Unnamed: 0,Message ID,Subject,Message,Spam/Ham,Date
0,0,christmas tree farm pictures,,ham,1999-12-10
1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,1999-12-13
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,1999-12-14
3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,1999-12-14
4,4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,1999-12-14


**Data Cleaning Step 1.1:**

Change label encoding to:

0 - ham

1 - spam

In [24]:
# Encode the string labels as integers. LabelEncoder numbers the classes in
# sorted order, so the two labels present here map to 'ham' -> 0, 'spam' -> 1.
# LabelEncoder is already imported in the setup cell, so it is not re-imported.
encoder = LabelEncoder()

df_1['label'] = encoder.fit_transform(df_1['label'])
df_1.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


Data Cleaning Step 1.2:

Null values analysis

In [25]:
# Count the missing entries in each column of the first dataset.
df_1.isna().sum()

label    0
text     0
dtype: int64

**Data Cleaning Step 1.3:**

Check for duplicates and remove them

In [26]:
# Report how many fully identical rows exist, keep only the first occurrence
# of each, then confirm that none are left.
print("delete ", df_1.duplicated().sum(), "duplicates...")
df_1 = df_1[~df_1.duplicated(keep='first')]
print("Remaining duplicates: ", df_1.duplicated().sum())

delete  415 duplicates...
Remaining duplicates:  0


**Data Cleaning Step 2.1:**

In [27]:
# Discard every row that has a missing value in any column, then show the
# resulting column summary and the frame's dimensions.
# NOTE(review): this also removes messages where only the subject or only the
# body is missing -- confirm that this is intended.
df_2.dropna(axis=0, how='any', inplace=True)
df_2.info()
df_2.shape

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33107 entries, 1 to 33715
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Message ID  33107 non-null  int64 
 1   Subject     33107 non-null  object
 2   Message     33107 non-null  object
 3   Spam/Ham    33107 non-null  object
 4   Date        33107 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.5+ MB


(33107, 5)

In [28]:
# Preview the first rows of the second dataset.
df_2.head(n=5)

Unnamed: 0,Message ID,Subject,Message,Spam/Ham,Date
1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,1999-12-13
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,1999-12-14
3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,1999-12-14
4,4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,1999-12-14
5,5,mcmullen gas for 11 / 99,"jackie ,\nsince the inlet to 3 river plant is ...",ham,1999-12-14


In [29]:
# drop the 'Message ID' and 'Date' columns
# Remove the identifier and date columns, then show five random rows.
df_2.drop(['Message ID', 'Date'], axis='columns', inplace=True)
df_2.sample(n=5)

Unnamed: 0,Subject,Message,Spam/Ham
26931,bacteria / virus makes your body weak,the antidote is a unique anti - microbial pept...,spam
28143,year end 2000 performance feedback,note : you will receive this message each time...,ham
7113,re : hi :,"zeigham ,\nmike roberts from my group will hel...",ham
1812,"enron / hpl actuals for october 12 , 2000 - re...",all gas nominated at iferc pricing for the mon...,ham
1737,meter 9707,daren - meter 9707 has flow for oct . 1 ( 1 . ...,ham


In [30]:
# Shorten the column names: Subject -> sub, Message -> msg, Spam/Ham -> label.
df_2.rename(
    {'Subject': 'sub', 'Message': 'msg', 'Spam/Ham': 'label'},
    axis='columns',
    inplace=True,
)
df_2.sample(n=5)

Unnamed: 0,sub,msg,label
10083,"perfect logo charset = koi 8 - r "" >",thinking of breathing new life into your busin...,spam
28009,tw weekly 9 - 22 - 00,please see attached file . call me at ( 281 ) ...,ham
15188,bait @ em . ca security alert - spyware infect...,bait h ' s erase oslo ammunition attempt lesbi...,spam
24919,introducing hgh : the most powerful anti - obe...,"hello , jm @ example . comhuone therapy\nlose ...",spam
6783,telephone interview with the houston research ...,good morning quentin :\nvince kaminski and the...,ham


In [31]:
# Turn the string labels of the second dataset into integers, refitting the
# existing encoder on this column first.
df_2['label'] = encoder.fit(df_2['label']).transform(df_2['label'])
df_2.sample(n=5)

Unnamed: 0,sub,msg,label
30814,looking for cheap high - quality software ? pi...,architects devoid perished tutoring\nimplement...,1
7233,interview with the enron research group,good morning mr . giancola :\nyour resume was ...,0
26332,you were accepted . here is your money,"dear applicant , after further review upon rec...",1
16586,"fwd : californians for renewable energy , inc ...","al . , elol - 65\n- - - - - - - - - - - - - - ...",0
5298,re : congratulations,right back at you . . . . . great job,0


In [20]:
# Find exact duplicate rows in the second dataset and keep the first of each.
# NOTE(review): the saved execution counter of this cell (20) is lower than
# that of the cell loading df_2 (23), and the merged shape shown further down
# (38264 = 5157 + 33107) matches df_2 *without* this step. Re-run the notebook
# top to bottom before trusting the saved outputs.
print("found ", df_2.duplicated().sum(), "dubs")
df_2 = df_2[~df_2.duplicated(keep='first')]
print("dubs remain: ", df_2.duplicated().sum())

found  3071 dubs
dubs remain:  0


In [33]:
# Build a single text column from subject and body (separated by one space)
# and discard the two source columns afterwards.
df_2['text'] = df_2['sub'].add(" ").add(df_2['msg'])
df_2.drop(['msg', 'sub'], axis='columns', inplace=True)
df_2.sample(n=5)

Unnamed: 0,label,text
15717,1,"stock market standouts hartman ,\nvcsc - brand..."
2250,0,tenaska iv pricing i think we need to remove s...
18117,1,new : b . a . / degree / diploma courses nifdl...
3467,0,new enrononline functionality the following ad...
19291,1,looking for a good match to take out tonight ?...


In [34]:
# Preview the first dataset again for comparison before merging.
df_1.head(n=5)

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


Merging the two datasets:

In [35]:
# Stack both datasets. ignore_index=True gives the result a fresh 0..n-1 index;
# without it the row labels of df_1 and df_2 overlap (label 1 exists in both),
# so a label-based lookup such as df.loc[1] would return more than one row.
df = pd.concat([df_1, df_2], ignore_index=True)
df.shape

(38264, 2)

# Data preprocessing:

The function below applies these preprocessing steps to a text, in order:

*   lowercase
*   tokenization (using nltk word tokenizer)
*   removing special characters, punctuation and numbers (only purely alphabetic tokens are kept)
*   removing stopwords (using nltk stopword list)
*   normalizing words using lemmatization (using nltk WordNetLemmatizer)

In [36]:
import string

# Fetch the NLTK resources the preprocessing function relies on.
# nltk.word_tokenize needs the Punkt tokenizer models, which this cell did not
# download before; newer NLTK releases look them up under the name 'punkt_tab'.
# nltk.download() only reports a failure, it does not raise, so requesting
# both names is safe regardless of the installed NLTK version.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

# stopwords and WordNetLemmatizer are already imported in the setup cell.
lemmatizer = WordNetLemmatizer()

# Build the stop-word lookup once. stopwords.words('english') was previously
# evaluated for every single token, and membership tests on the returned list
# are linear; a frozenset makes each test O(1).
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def transform_text(text):
    """Normalize one message for vectorization.

    Steps, in order:
      1. lowercase the text,
      2. tokenize it with ``nltk.word_tokenize``,
      3. keep only purely alphabetic tokens (``str.isalpha``), which drops
         punctuation, numbers and mixed tokens,
      4. drop English stop words (NLTK stop-word list),
      5. lemmatize every remaining token with the module-level
         ``lemmatizer`` (called without a part-of-speech tag).

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; an empty string when no
        token survives the filters.
    """
    tokens = nltk.word_tokenize(text.lower())

    # A token that passes isalpha() can never be a punctuation mark, so no
    # separate punctuation check is needed.
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in _ENGLISH_STOPWORDS
    )

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/elsandner/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/elsandner/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [37]:
# Run the preprocessing function over every message and store the result in a
# new column next to the raw text.
df['transformed_text'] = df['text'].map(transform_text)
df.head(n=5)

Unnamed: 0,label,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts may...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think go usf life around though


In [38]:
# Remove the raw text column; only the preprocessed version is kept.
df.drop('text', axis='columns', inplace=True)
df.head(n=5)

Unnamed: 0,label,transformed_text
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,1,free entry wkly comp win fa cup final tkts may...
3,0,u dun say early hor u c already say
4,0,nah think go usf life around though


In [40]:
# Give the preprocessed column the plain name 'text'.
df.rename({'transformed_text': 'text'}, axis='columns', inplace=True)
df.head(n=5)

Unnamed: 0,label,text
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,1,free entry wkly comp win fa cup final tkts may...
3,0,u dun say early hor u c already say
4,0,nah think go usf life around though


In [47]:
# Write the merged, preprocessed data to disk without the index column.
df.to_csv(
    path_or_buf="../Data/preprocessed/merged_cleaned_preprocessed.csv",
    index=False,
)