In [283]:
import numpy as np
import pandas as pd

## Loading the dataset checking the shape.

In [284]:
df: pd.DataFrame = pd.read_csv("Dataset/emails.csv")
# printing the shape of the dataset
print(df.shape)

(5730, 110)


## Checking heads and tails for 10 rows of dataset.

In [285]:
# Viewing the first 10 rows of the dataset.
df.head(10)

Unnamed: 0,text,spam,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 100,Unnamed: 101,Unnamed: 102,Unnamed: 103,Unnamed: 104,Unnamed: 105,Unnamed: 106,Unnamed: 107,Unnamed: 108,Unnamed: 109
0,Subject: naturally irresistible your corporate...,YES,,,,,,,,,...,,,,,,,,,,
1,Subject: the stock trading gunslinger fanny i...,YES,,,,,,,,,...,,,,,,,,,,
2,Subject: unbelievable new homes made easy im ...,YES,,,,,,,,,...,,,,,,,,,,
3,Subject: 4 color printing special request add...,YES,,,,,,,,,...,,,,,,,,,,
4,"Subject: do not have money , get software cds ...",YES,,,,,,,,,...,,,,,,,,,,
5,"Subject: great nnews hello , welcome to medzo...",YES,,,,,,,,,...,,,,,,,,,,
6,Subject: here ' s a hot play in motion homela...,YES,,,,,,,,,...,,,,,,,,,,
7,Subject: save your money buy getting this thin...,YES,,,,,,,,,...,,,,,,,,,,
8,Subject: undeliverable : home based business f...,YES,,,,,,,,,...,,,,,,,,,,
9,Subject: save your money buy getting this thin...,YES,,,,,,,,,...,,,,,,,,,,


In [286]:
# Viewing the last 10 rows of the datasets.
df.tail(10)

Unnamed: 0,text,spam,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 100,Unnamed: 101,Unnamed: 102,Unnamed: 103,Unnamed: 104,Unnamed: 105,Unnamed: 106,Unnamed: 107,Unnamed: 108,Unnamed: 109
5720,"Subject: altos na gas model kim , i know you ...",NO,,,,,,,,,...,,,,,,,,,,
5721,Subject: power market research i came across ...,NO,,,,,,,,,...,,,,,,,,,,
5722,Subject: re : visit to houston fyi - - - - -...,NO,,,,,,,,,...,,,,,,,,,,
5723,Subject: ees risk management presentations for...,NO,,,,,,,,,...,,,,,,,,,,
5724,Subject: re : vacation vince : i just found ...,NO,,,,,,,,,...,,,,,,,,,,
5725,Subject: re : research and development charges...,NO,,,,,,,,,...,,,,,,,,,,
5726,"Subject: re : receipts from visit jim , than...",NO,,,,,,,,,...,,,,,,,,,,
5727,Subject: re : enron case study update wow ! a...,NO,,,,,,,,,...,,,,,,,,,,
5728,"Subject: re : interest david , please , call...",NO,,,,,,,,,...,,,,,,,,,,
5729,Subject: news : aurora 5 . 2 update aurora ve...,NO,,,,,,,,,...,,,,,,,,,,


## Data preprocessing

In [287]:
# Checking for null values.
print(df.isnull().sum())

text               0
spam               2
Unnamed: 2      5728
Unnamed: 3      5728
Unnamed: 4      5728
                ... 
Unnamed: 105    5728
Unnamed: 106    5728
Unnamed: 107    5729
Unnamed: 108    5729
Unnamed: 109    5729
Length: 110, dtype: int64


<font size="6">A lot of Unnamed **null/NaN** columns. Need to get rid of them.</font>

In [288]:
# Getting rid of these unnecessary null columns.
df = df.iloc[:, :2]

In [289]:
# Checking the current shape of data frame
print(df.shape)
df.head(10)

(5730, 2)


Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,YES
1,Subject: the stock trading gunslinger fanny i...,YES
2,Subject: unbelievable new homes made easy im ...,YES
3,Subject: 4 color printing special request add...,YES
4,"Subject: do not have money , get software cds ...",YES
5,"Subject: great nnews hello , welcome to medzo...",YES
6,Subject: here ' s a hot play in motion homela...,YES
7,Subject: save your money buy getting this thin...,YES
8,Subject: undeliverable : home based business f...,YES
9,Subject: save your money buy getting this thin...,YES


 <font size="6">Got rid of unnecessary null columns.However, `spam` column is not yet encoded. Need to encode it.</font>

In [290]:
# Will replace `YES` -> 1 and `NO` -> 0
# But first need to check if how many **unique** values are there.
# Depending on that, will need to choose the encoding type.
df['spam'].unique()

array(['YES', 'NO', nan,
       ' its termination would not  have such a phenomenal impact on the power situation .  however ',
       ' mr suresh prabhu '], dtype=object)

In [291]:
# Need to get rid of NaN and unnecessary things.
print(df.isnull().sum())

text    0
spam    2
dtype: int64


In [292]:
df.dropna(inplace=True)
print(df.shape)
df.isnull().sum()

(5728, 2)


text    0
spam    0
dtype: int64

In [293]:
# Got rid of NaN/null values now it's time to get rid of unnecessary values.
index_of_unnecessary_values: np.ndarray = df[(df['spam'] != 'YES') & (df['spam'] != 'NO')].index.values
# Got the index. Now it's time to drop them.
df.drop(index_of_unnecessary_values, inplace=True)
# Checking the checking if this worked.
df['spam'].unique()

array(['YES', 'NO'], dtype=object)

<font size="6"> only two values! **Binary** categorical encoding </font>

In [294]:
# Using map() to make changes instead of using sk learn library.
df['spam'] = df['spam'].map({'YES': 1, 'NO': 0})
print(df.head(10))
print(df.tail(10))

                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1
5  Subject: great nnews  hello , welcome to medzo...     1
6  Subject: here ' s a hot play in motion  homela...     1
7  Subject: save your money buy getting this thin...     1
8  Subject: undeliverable : home based business f...     1
9  Subject: save your money buy getting this thin...     1
                                                   text  spam
5720  Subject: altos na gas model  kim , i know you ...     0
5721  Subject: power market research  i came across ...     0
5722  Subject: re : visit to houston  fyi  - - - - -...     0
5723  Subject: ees risk management presentations for...     0
5724  Subject: re : vacation  vince :  i 

In [295]:
# Now, need to check for the duplicate values and get rid of them as well.
df.duplicated().values.__contains__(False)

True

<font size="6">As the data set contains **duplicate** values. Need to get rid of them</font>

In [296]:
# It contains duplicate values, so need to get rid of them
df.drop_duplicates(inplace=True)
# Testing if any more duplicate values.
df.duplicated().values.__contains__(True)

False

<font size="6">It's time to tokenize the text!
First will try it with CountVectorizer and then with TfidVectorizer and then comparing the accuracies.
Later will try with different models and check as well!</font>

In [297]:
from typing import TypeVar
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

matrix = TypeVar('matrix')  # Defining type for x
pd_Series = TypeVar('pd_Series')
# Converting into a vector of word count.
x: matrix = CountVectorizer().fit_transform(df['text'])
y: pd_Series = df['spam']
print(x)
print(x.toarray())

  (0, 32261)	1
  (0, 23314)	1
  (0, 18781)	1
  (0, 36899)	7
  (0, 10021)	1
  (0, 17635)	1
  (0, 21087)	1
  (0, 18791)	3
  (0, 27928)	1
  (0, 16609)	1
  (0, 33682)	4
  (0, 28052)	1
  (0, 9259)	3
  (0, 33336)	5
  (0, 21601)	2
  (0, 15369)	1
  (0, 24203)	4
  (0, 32525)	1
  (0, 4822)	5
  (0, 18178)	1
  (0, 18827)	1
  (0, 7527)	2
  (0, 16026)	2
  (0, 8024)	1
  (0, 20900)	3
  :	:
  (5692, 24779)	2
  (5692, 21571)	1
  (5692, 5701)	9
  (5692, 30865)	1
  (5692, 2799)	3
  (5692, 13286)	1
  (5692, 13075)	1
  (5692, 17328)	1
  (5692, 14076)	1
  (5692, 20217)	1
  (5692, 31754)	1
  (5692, 13076)	1
  (5692, 20411)	1
  (5692, 35214)	1
  (5692, 8593)	1
  (5692, 30019)	1
  (5692, 13469)	5
  (5692, 36118)	1
  (5692, 939)	2
  (5692, 2768)	1
  (5692, 30216)	1
  (5692, 17529)	1
  (5692, 33853)	1
  (5692, 10328)	1
  (5692, 11336)	1
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [4 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [298]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

#Split the data into 80% training and 20% testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

Spam_model = LogisticRegression(solver='liblinear', penalty='l1')
Spam_model.fit(x_train, y_train)
pred = Spam_model.predict(x_test)
accuracy_score(y_test, pred)

0.9885864793678666

In [299]:
# Do it again with TfidfVectorizer

x: matrix = TfidfVectorizer().fit_transform(df['text'])
print(x)
print(x.toarray())

  (0, 18484)	0.05065361378044374
  (0, 26335)	0.07326325045246485
  (0, 24728)	0.03169809527667246
  (0, 5553)	0.027172568615857276
  (0, 20946)	0.04488162569698162
  (0, 16708)	0.02400981857764169
  (0, 9051)	0.08785588350130605
  (0, 33442)	0.02436760586894425
  (0, 28820)	0.07095093846641097
  (0, 21035)	0.07779396393379931
  (0, 32540)	0.10265887666677022
  (0, 14966)	0.020094268162248623
  (0, 14472)	0.08613196468782616
  (0, 14143)	0.08049521650838198
  (0, 23726)	0.04332977347771834
  (0, 36362)	0.02455920586912909
  (0, 8314)	0.062211869131624666
  (0, 4742)	0.06949561036311931
  (0, 34728)	0.08660570366581676
  (0, 27172)	0.05304587337998871
  (0, 16334)	0.07849937238737897
  (0, 29796)	0.08590029521223708
  (0, 342)	0.06509480920340016
  (0, 7379)	0.08204045463823473
  (0, 15523)	0.09760530981107943
  :	:
  (5692, 36362)	0.011458318778278548
  (5692, 33500)	0.023609653851220134
  (5692, 36886)	0.009262329316446844
  (5692, 18884)	0.025253079194168176
  (5692, 8311)	0.02892966

In [300]:
#Split the data into 80% training and 20% testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Again, checking the accuracy to see if vectorization strategy has any effect.
Spam_model = LogisticRegression(solver='liblinear', penalty='l1')
Spam_model.fit(x_train, y_train)
pred = Spam_model.predict(x_test)
accuracy_score(y_test, pred)


0.9648814749780509

<font size="6">It seems like that there is **some effect** but not a huge effect.</font>


In [None]:
# Todo: Check with another model. Maybe MultinomialNB, Binomial etc.