In [205]:
import numpy as np
import pandas as pd

## Loading the dataset and checking its shape

In [206]:
# Read the e-mail CSV into a DataFrame, then report its (rows, columns) shape.
df: pd.DataFrame = pd.read_csv(filepath_or_buffer="Dataset/emails.csv")
print(df.shape)

(5730, 110)


## Checking the first and last 10 rows of the dataset

In [207]:
# Peek at the top of the frame: the first ten rows.
df.head(n=10)

Unnamed: 0,text,spam,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 100,Unnamed: 101,Unnamed: 102,Unnamed: 103,Unnamed: 104,Unnamed: 105,Unnamed: 106,Unnamed: 107,Unnamed: 108,Unnamed: 109
0,Subject: naturally irresistible your corporate...,YES,,,,,,,,,...,,,,,,,,,,
1,Subject: the stock trading gunslinger fanny i...,YES,,,,,,,,,...,,,,,,,,,,
2,Subject: unbelievable new homes made easy im ...,YES,,,,,,,,,...,,,,,,,,,,
3,Subject: 4 color printing special request add...,YES,,,,,,,,,...,,,,,,,,,,
4,"Subject: do not have money , get software cds ...",YES,,,,,,,,,...,,,,,,,,,,
5,"Subject: great nnews hello , welcome to medzo...",YES,,,,,,,,,...,,,,,,,,,,
6,Subject: here ' s a hot play in motion homela...,YES,,,,,,,,,...,,,,,,,,,,
7,Subject: save your money buy getting this thin...,YES,,,,,,,,,...,,,,,,,,,,
8,Subject: undeliverable : home based business f...,YES,,,,,,,,,...,,,,,,,,,,
9,Subject: save your money buy getting this thin...,YES,,,,,,,,,...,,,,,,,,,,


In [208]:
# Peek at the bottom of the frame: the last ten rows.
df.tail(n=10)

Unnamed: 0,text,spam,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 100,Unnamed: 101,Unnamed: 102,Unnamed: 103,Unnamed: 104,Unnamed: 105,Unnamed: 106,Unnamed: 107,Unnamed: 108,Unnamed: 109
5720,"Subject: altos na gas model kim , i know you ...",NO,,,,,,,,,...,,,,,,,,,,
5721,Subject: power market research i came across ...,NO,,,,,,,,,...,,,,,,,,,,
5722,Subject: re : visit to houston fyi - - - - -...,NO,,,,,,,,,...,,,,,,,,,,
5723,Subject: ees risk management presentations for...,NO,,,,,,,,,...,,,,,,,,,,
5724,Subject: re : vacation vince : i just found ...,NO,,,,,,,,,...,,,,,,,,,,
5725,Subject: re : research and development charges...,NO,,,,,,,,,...,,,,,,,,,,
5726,"Subject: re : receipts from visit jim , than...",NO,,,,,,,,,...,,,,,,,,,,
5727,Subject: re : enron case study update wow ! a...,NO,,,,,,,,,...,,,,,,,,,,
5728,"Subject: re : interest david , please , call...",NO,,,,,,,,,...,,,,,,,,,,
5729,Subject: news : aurora 5 . 2 update aurora ve...,NO,,,,,,,,,...,,,,,,,,,,


## Data preprocessing

In [209]:
# Count the missing (NaN) entries in every column.
print(df.isna().sum())

text               0
spam               2
Unnamed: 2      5728
Unnamed: 3      5728
Unnamed: 4      5728
                ... 
Unnamed: 105    5728
Unnamed: 106    5728
Unnamed: 107    5729
Unnamed: 108    5729
Unnamed: 109    5729
Length: 110, dtype: int64


<font size="6">There are a lot of `Unnamed` columns that are almost entirely **null/NaN**. They need to be removed.</font>

In [210]:
# Keep only the two meaningful columns and drop the almost-empty `Unnamed: N` ones.
# Selecting by name does not depend on column order, and `.copy()` makes `df` an
# independent frame, so the in-place clean-up steps that follow do not operate on
# a slice of the loaded frame (which can raise SettingWithCopyWarning).
df = df[['text', 'spam']].copy()

In [211]:
# Report the current (rows, columns) shape and preview the first ten rows.
print(df.shape)
df.head(n=10)

(5730, 2)


Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,YES
1,Subject: the stock trading gunslinger fanny i...,YES
2,Subject: unbelievable new homes made easy im ...,YES
3,Subject: 4 color printing special request add...,YES
4,"Subject: do not have money , get software cds ...",YES
5,"Subject: great nnews hello , welcome to medzo...",YES
6,Subject: here ' s a hot play in motion homela...,YES
7,Subject: save your money buy getting this thin...,YES
8,Subject: undeliverable : home based business f...,YES
9,Subject: save your money buy getting this thin...,YES


<font size="6">The unnecessary null columns are gone. However, the `spam` column is not yet encoded, so it needs to be encoded next.</font>

In [212]:
# Goal: encode `YES` as 1 and `NO` as 0.
# Before picking an encoding scheme, list the distinct values
# that actually occur in the `spam` column.
df.loc[:, 'spam'].unique()

array(['YES', 'NO', nan,
       ' its termination would not  have such a phenomenal impact on the power situation .  however ',
       ' mr suresh prabhu '], dtype=object)

In [213]:
# NaN entries and stray non-label values still have to be removed;
# start by counting the missing values per column.
print(df.isna().sum())

text    0
spam    2
dtype: int64


In [214]:
# Discard every row that holds at least one missing value, then verify:
# print the resulting shape and show the remaining per-column NaN counts.
df.dropna(axis=0, how='any', inplace=True)
print(df.shape)
df.isna().sum()

(5728, 2)


text    0
spam    0
dtype: int64

In [215]:
# With the NaN rows gone, remove rows whose `spam` entry is neither of the two labels.
is_valid_label = df['spam'].isin(['YES', 'NO'])
index_of_unnecessary_values: np.ndarray = df.loc[~is_valid_label].index.values
# Drop the offending rows by their index labels.
df.drop(index=index_of_unnecessary_values, inplace=True)
# Confirm that only the expected labels are left.
df['spam'].unique()

array(['YES', 'NO'], dtype=object)

<font size="6">Only two values remain, so a **binary** categorical encoding is enough.</font>

In [216]:
# Encode the labels with a plain dict lookup through Series.map
# rather than pulling in an encoder from scikit-learn.
spam_codes = {'YES': 1, 'NO': 0}
df['spam'] = df['spam'].map(spam_codes)
# Print both ends of the frame to check the encoding.
for preview in (df.head(10), df.tail(10)):
    print(preview)

                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1
5  Subject: great nnews  hello , welcome to medzo...     1
6  Subject: here ' s a hot play in motion  homela...     1
7  Subject: save your money buy getting this thin...     1
8  Subject: undeliverable : home based business f...     1
9  Subject: save your money buy getting this thin...     1
                                                   text  spam
5720  Subject: altos na gas model  kim , i know you ...     0
5721  Subject: power market research  i came across ...     0
5722  Subject: re : visit to houston  fyi  - - - - -...     0
5723  Subject: ees risk management presentations for...     0
5724  Subject: re : vacation  vince :  i 

In [217]:
# Check whether the frame contains duplicated rows before removing them.
# `duplicated()` flags each repeated row with True, so `.any()` is True exactly
# when at least one duplicate exists.
# Note: testing for a `False` entry instead would only show that some row is
# NOT a duplicate, which holds for almost any frame.
bool(df.duplicated().any())

True


<font size="6">The data set contains **duplicate** rows, so they need to be removed.</font>

In [219]:
# Remove the repeated rows (by default pandas keeps the first occurrence of each).
# Reassigning instead of using `inplace=True` keeps the cell free of hidden
# in-place mutation of a frame that may be a slice of another one.
df = df.drop_duplicates()
# Verify: this must now be False, i.e. no row is flagged as a duplicate.
bool(df.duplicated().any())

False