In [None]:
#import the libraries
import re
import seaborn as sns
import numpy.random as rnd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from sklearn import preprocessing
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
%matplotlib inline

In [None]:
# Quick raw load for a first look at the file.
# header=None: the CSV has no header row (the labelled reload further down also
# reads it with header=None), so pandas must not consume the first record as
# column labels. index_col is left unset because column 0 is a feature
# (word_freq_make), not a row identifier.
df = pd.read_csv('spambase 2.csv', header=None)

In [None]:
# Preview the first rows of the freshly loaded frame (DataFrame.head shows 5 rows by default).
df.head()

In [None]:
# Column labels for the spambase data, in file order: 48 word-frequency
# features, 6 character-frequency features, 3 capital-run-length statistics
# and finally the class label.
_tracked_words = (
    'make address all 3d our over remove internet order mail receive will '
    'people report addresses free business email you credit your font 000 '
    'money hp hpl george 650 lab labs telnet 857 data 415 85 technology '
    '1999 parts pm direct cs meeting original project re edu table conference'
).split()
_tracked_chars = [';', '(', '[', '!', '$', '#']
_run_length_stats = ['average', 'longest', 'total']

column_names = (
    ['word_freq_' + word for word in _tracked_words]
    + ['char_freq_' + char for char in _tracked_chars]
    + ['capital_run_length_' + stat for stat in _run_length_stats]
    + ['is_spam']
)

# Re-read the CSV with labelled columns: header=None tells pandas the file has
# no header row, and names= supplies the labels from column_names.
df = pd.read_csv(
    'spambase 2.csv',
    names=column_names,
    header=None,
)

# Show the leading rows to confirm the labels line up with the values.
df.head()

In [None]:
# Preview the first rows of df (DataFrame.head shows 5 rows by default).
df.head()

In [None]:
# Structural summary of df: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Count the missing entries in each column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for df's numeric columns.
df.describe()

In [None]:
# Keep only the first occurrence of each fully identical row.
is_repeat = df.duplicated()
data = df[~is_repeat]

In [None]:
# Preview the first rows of the de-duplicated frame.
data.head()

In [None]:
# Heatmap of the pairwise correlations between the columns of df.
correlations = df.corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlations, cmap='PuBuGn', ax=ax)
ax.set_title('Feature Correlation', color='r')
plt.show()

In [None]:
# Box plot of every column of df, to eyeball the outliers per attribute.
fig, ax = plt.subplots(figsize=(14, 6))
chart = df.boxplot(ax=ax)
# Tilt the attribute names so they do not overlap.
chart.set_xticklabels(chart.get_xticklabels(), rotation=80)
chart.set_title("Spam message outlier", color='red')
chart.set_xlabel('Attributes')
chart.set_ylabel('Count')
plt.show()

In [None]:
# Class balance: how many rows carry each is_spam value.
label_counts = df['is_spam'].value_counts()
label_counts

In [None]:
# Relabel the target for readability: 1 -> 'spam', 0 -> 'No spam'.
# The previous form used chained indexing (df.is_spam[mask] = ...), which raises
# SettingWithCopyWarning and does not write through to df under pandas'
# copy-on-write mode. A single Series.replace on the column avoids that, and
# leaves values that are already strings untouched if the cell is run again.
df['is_spam'] = df['is_spam'].replace({1: 'spam', 0: 'No spam'})
df.head(10)

In [None]:
# IQR-based outlier masking on the capital-run-length columns: any value outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] is replaced with NaN so that the following
# dropna() discards the row.
for outlier in ["capital_run_length_average", "capital_run_length_longest", "capital_run_length_total"]:
    quart75, quart25 = np.percentile(data.loc[:, outlier], [75, 25])
    IQR = quart75 - quart25
    # Named *_fence rather than max/min so the builtins are not shadowed for
    # every later cell in the kernel.
    upper_fence = quart75 + (1.5 * IQR)
    lower_fence = quart25 - (1.5 * IQR)
    # Same strict comparisons as before. If the fences are NaN (e.g. the cell is
    # re-run after masking) both comparisons are False and nothing changes.
    out_of_range = (data[outlier] < lower_fence) | (data[outlier] > upper_fence)
    # Build the masked column and attach it with assign() instead of writing NaN
    # through data.loc[...] in place: this avoids the chained-assignment warning
    # on the filtered frame and the in-place int -> float upcast.
    data = data.assign(**{outlier: data[outlier].mask(out_of_range)})

In [None]:
# Discard every row that still contains a missing value
# (this removes the rows whose outliers were blanked to NaN).
newData = data.dropna(axis='index')

In [None]:
# Drop the incomplete rows, then renumber the remaining rows from 0 so the index
# has no gaps (drop=True discards the old index instead of keeping it as a column).
complete_rows = data.dropna()
newData = complete_rows.reset_index(drop=True)

In [None]:
# spamData is a second name for newData (no copy is made); the bare expression
# on the last line displays the frame as the cell output.
spamData = newData
spamData

In [None]:
# Min-max scaling: learn each column's minimum and maximum from spamData, then
# rescale every column of spamData into the range [0, 1].
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
scaler.fit(spamData)
scaledData = scaler.transform(spamData)

In [None]:
# Wrap the scaled array back into a DataFrame that carries spamData's row index
# and column labels.
spamScale = pd.DataFrame(scaledData, columns=spamData.columns, index=spamData.index)

In [None]:
# Summary statistics of spamData.
# NOTE(review): this is the unscaled frame; spamScale may have been intended here — confirm.
spamData.describe()

In [None]:
# Applying PCA: project the scaled data onto its first three principal components.
n_pca_components = 3
pca = PCA(n_components=n_pca_components)
pcaComp = pca.fit_transform(scaledData)

In [None]:
# Put the projected values in a DataFrame with one labelled column per component.
pca_labels = ['PCA' + str(i) for i in range(1, 4)]
PCAComp = pd.DataFrame(pcaComp, columns=pca_labels)

In [None]:
# Preview the first rows of the principal-component frame.
PCAComp.head()

In [None]:
# Preview spamData (the unscaled frame after outlier removal) before it is split.
spamData.head()

In [None]:
# Separate the features from the target and hold out 20% of the rows for testing.
X = spamData.drop('is_spam', axis=1)
y = spamData['is_spam']
# random_state pins the shuffle so a Restart & Run All reproduces the same split
# (and therefore the same scores). stratify=y keeps the spam / non-spam ratio
# the same in the train and test partitions.
XTrain, XTest, yTrain, yTest = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Feature extraction: reduce the features to principal components.
# The projection is learned from the training rows only and then applied
# unchanged to the test rows. Calling fit_transform on XTest (as before) learned
# a second, unrelated set of axes from the test data, so the two matrices did
# not share a feature space and the test scores were not meaningful.
# NOTE(review): X comes from the unscaled spamData, so the components are driven
# by the large-valued columns — consider scaling before PCA; confirm intent.
XTrain = pca.fit_transform(XTrain)
XTest = pca.transform(XTest)

In [None]:
# Candidate k values (1 through 8) plus one accuracy slot per k for the train
# and test scores. np.empty leaves the slots uninitialised until they are filled.
neighbors = np.arange(1, 9)
train_accuracy = np.empty(neighbors.shape)
test_accuracy = np.empty(neighbors.shape)

In [None]:
# Building the KNN model: sweep k over `neighbors`, recording the train and test
# accuracy obtained for each value.
for i, k in enumerate(neighbors):
    # Use the loop's k here. The previous hard-coded n_neighbors=3 refitted the
    # same model on every pass, so both accuracy arrays held one repeated value.
    knn_k = KNeighborsClassifier(n_neighbors=int(k))
    knn_k.fit(XTrain, yTrain)
    train_accuracy[i] = knn_k.score(XTrain, yTrain)
    test_accuracy[i] = knn_k.score(XTest, yTest)

# Later cells predict with `knn`. Before this fix that name always ended up as a
# fitted 3-neighbour model, so keep providing exactly that rather than whichever
# k happened to come last in the sweep.
knn = KNeighborsClassifier(n_neighbors=3).fit(XTrain, yTrain)

In [None]:
# Decision tree capped at depth 3 and splitting on Gini impurity
# (only constructed here; it is fitted in a later cell).
tree_settings = {'criterion': 'gini', 'max_depth': 3}
DT = DecisionTreeClassifier(**tree_settings)

In [None]:
# Predict labels for the held-out test features with the fitted `knn` classifier.
yPrediction = knn.predict(XTest)

In [None]:
# Confusion matrix of the KNN predictions (rows = actual class, columns = predicted class).
# confusion_matrix orders the classes by sorted label value, so position 0 is
# the non-spam class (is_spam == 0) and position 1 is spam (is_spam == 1).
# The names were previously listed as ('Spam', 'Not spam'), which attached each
# name to the opposite class.
class_names = ['Not spam', 'Spam']
matrix = pd.DataFrame(
    metrics.confusion_matrix(yTest, yPrediction),
    index=class_names,
    columns=class_names,
)
print(matrix)

In [None]:
# Draw the confusion matrix as a heatmap annotated with the integer counts.
heatmap = sns.heatmap(matrix, cmap='Spectral', annot=True, fmt='d')
# Re-apply the current tick labels on both axes.
for axis in (heatmap.yaxis, heatmap.xaxis):
    axis.set_ticklabels(axis.get_ticklabels())

heatmap.set_title('Confusion Matrix', color='darkblue')
heatmap.set_ylabel('True count')
heatmap.set_xlabel('Predicted count')
plt.show()

In [None]:
# Classifiers to evaluate in the report cell: a single 5-nearest-neighbours model.
five_nn = KNeighborsClassifier(n_neighbors=5)
model = [five_nn]

In [None]:
# Report model: fit each classifier in `model` on the training split and print
# its accuracy on the test split.
for knn in model:
    knn.fit(XTrain, yTrain)
    # Was `knn.__class__, __name__`: the comma built a (class, module __name__)
    # tuple instead of reading the classifier's class name.
    name = knn.__class__.__name__
    # Despite the variable's name, these are predictions for the test features.
    trainPred = knn.predict(XTest)
    accuracy = metrics.accuracy_score(yTest, trainPred)

    print("***KNN accuracy***: {:.4%}".format(accuracy))

In [None]:
# Train the decision tree on the training split, then print its mean accuracy
# on the test split (fit returns the estimator, so the calls can be chained).
dt_test_accuracy = DT.fit(XTrain, yTrain).score(XTest, yTest)
print('Decision Tree accuarcy :', dt_test_accuracy)