#  Problem: Binary Classification for Email Spam Detection

Given a dataset of emails with labeled spam or non-spam, use a perceptron to classify whether a new email is spam or not.

In [1]:
# Import Libraries
import pandas as pd
import numpy as np

In [2]:
dataset = pd.read_csv('emails.csv')
dataset.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [3]:
# Statistical Analysis and data cleaning

df = dataset.copy()
df.shape

(5172, 3002)

In [4]:
df.isnull().count()

Email No.     5172
the           5172
to            5172
ect           5172
and           5172
              ... 
military      5172
allowing      5172
ff            5172
dry           5172
Prediction    5172
Length: 3002, dtype: int64

In [5]:
# Dropping null values
df = df.dropna()

In [6]:
df.info()
df.iloc[: , 2:3001]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3002 entries, Email No. to Prediction
dtypes: int64(3001), object(1)
memory usage: 118.5+ MB


Unnamed: 0,to,ect,and,for,of,a,you,hou,in,on,...,enhancements,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry
0,0,1,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,13,24,6,6,2,102,1,27,18,21,...,0,0,0,0,0,0,0,0,1,0
2,0,1,0,0,0,8,0,0,4,2,...,0,0,0,0,0,0,0,0,0,0
3,5,22,0,5,1,51,2,10,1,5,...,0,0,0,0,0,0,0,0,0,0
4,6,17,1,5,2,57,0,9,3,12,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,2,2,3,0,0,32,0,0,5,6,...,0,0,0,0,0,0,0,0,0,0
5168,27,11,2,6,5,151,4,3,23,18,...,0,0,0,0,0,0,0,0,1,0
5169,0,1,1,0,0,11,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
5170,7,1,0,2,1,28,2,0,8,11,...,0,0,0,0,0,0,0,0,1,0


In [7]:
# Splliting into training and testing

from sklearn.model_selection import train_test_split

features = df.iloc[ : , 1 : 3001]
target = df[['Prediction']]

In [8]:
features.head()

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,enhancements,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,0,0,1,0
2,0,0,1,0,0,0,8,0,0,4,...,0,0,0,0,0,0,0,0,0,0
3,0,5,22,0,5,1,51,2,10,1,...,0,0,0,0,0,0,0,0,0,0
4,7,6,17,1,5,2,57,0,9,3,...,0,0,0,0,0,0,0,0,1,0


In [9]:
target.head()

Unnamed: 0,Prediction
0,0
1,0
2,0
3,0
4,0


In [10]:
X = features
X.head()

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,enhancements,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,0,0,1,0
2,0,0,1,0,0,0,8,0,0,4,...,0,0,0,0,0,0,0,0,0,0
3,0,5,22,0,5,1,51,2,10,1,...,0,0,0,0,0,0,0,0,0,0
4,7,6,17,1,5,2,57,0,9,3,...,0,0,0,0,0,0,0,0,1,0


In [11]:
Y = target
Y.head()

Unnamed: 0,Prediction
0,0
1,0
2,0
3,0
4,0


In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, train_size = 0.8, test_size = 0.2, random_state = 42)

In [13]:
# Apply perceptron

from sklearn.linear_model import Perceptron

perceptron = Perceptron()
perceptron.fit(X_train,Y_train)
y_pred = perceptron.predict(X_test)
y_pred = pd.DataFrame(y_pred)

  y = column_or_1d(y, warn=True)


In [14]:
# finding accuracy:
from sklearn import metrics
mse = metrics.mean_squared_error(Y_test,y_pred)
print(f'Mean Squared Error: {mse}')
accuracy_score = metrics.accuracy_score(Y_test,y_pred)
print(f'Accuracy: {accuracy_score}')

Mean Squared Error: 0.0966183574879227
Accuracy: 0.9033816425120773


In [15]:
Y_test.shape

(1035, 1)

In [16]:
y_pred.shape

(1035, 1)

In [17]:
Y_test

Unnamed: 0,Prediction
1566,0
1988,0
1235,1
3276,0
3438,0
...,...
1175,0
2594,0
3377,0
5065,1


In [18]:
y_pred

Unnamed: 0,0
0,0
1,0
2,1
3,0
4,0
...,...
1030,1
1031,1
1032,0
1033,1
