### Making the neccesary imports

In [1]:
import csv
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

### For purpose of this problem, we load the data given as spambase.data into a pandas dataframe

In [2]:
file=pd.read_csv("spambase.data",header=None)

Just In case we need the above file for future computations, we create a copy and actually perform every computation on this file

In [3]:
df=file.copy(deep=True)

#### Let us check our data

In [4]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [5]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
count,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,...,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0
mean,0.104553,0.213015,0.280656,0.065425,0.312223,0.095901,0.114208,0.105295,0.090067,0.239413,...,0.038575,0.13903,0.016976,0.269071,0.075811,0.044238,5.191515,52.172789,283.289285,0.394045
std,0.305358,1.290575,0.504143,1.395151,0.672513,0.273824,0.391441,0.401071,0.278616,0.644755,...,0.243471,0.270355,0.109394,0.815672,0.245882,0.429342,31.729449,194.89131,606.347851,0.488698
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.588,6.0,35.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.065,0.0,0.0,0.0,0.0,2.276,15.0,95.0,0.0
75%,0.0,0.0,0.42,0.0,0.38,0.0,0.0,0.0,0.0,0.16,...,0.0,0.188,0.0,0.315,0.052,0.0,3.706,43.0,266.0,1.0
max,4.54,14.28,5.1,42.81,10.0,5.88,7.27,11.11,5.26,18.18,...,4.385,9.752,4.081,32.478,6.003,19.829,1102.5,9989.0,15841.0,1.0


### We know that our dataset has total 4601 entries for emails and 57 features, and our label is 0/1(Not Spam/Spam)
From the above we can see that we have no missing value

Let's dive deeper to verify for each column if there are any missing values

In [6]:
total = df.isnull().sum().sort_values(ascending=False)
percent_1 = df.isnull().sum()/df.isnull().count()*100
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total_Missing_value', 'Missing_value_%'])
missing_data.head()

Unnamed: 0,Total_Missing_value,Missing_value_%
57,0,0.0
14,0,0.0
26,0,0.0
25,0,0.0
24,0,0.0


### Here the data doesn't have any missing values and it is already pre-processed(No categorical/nominal to numeric conversion required)

In [7]:
len(df.columns)

58

#### Let us check if we can select features for our model and reduce amount of features while not compromisng accuracy by more than 5%
##### For this we use the chi-square test(Generally Used for Categorical Attributes but turns numeric attributes into discrete automatically and peforms feature selection)

In [8]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import chi2_contingency

class ChiSquare:
    def __init__(self, dataframe):
        self.df = dataframe
        self.p = None #P-Value
        self.chi2 = None #Chi Test Statistic
        self.dof = None
        
        self.dfObserved = None
        self.dfExpected = None
        
    def _print_chisquare_result(self, colX, alpha):
        result = ""
        if self.p<alpha:
            result="{0} is IMPORTANT for Prediction".format(colX)
        else:
            result="{0} is NOT an important predictor. (Discard {0} from model)".format(colX)

        print(result)
        
    def TestIndependence(self,colX,colY, alpha=0.05):
        X = self.df[colX].astype(str)
        Y = self.df[colY].astype(str)
        
        self.dfObserved = pd.crosstab(Y,X) 
        chi2, p, dof, expected = stats.chi2_contingency(self.dfObserved.values)
        self.p = p
        self.chi2 = chi2
        self.dof = dof 
        
        self.dfExpected = pd.DataFrame(expected, columns=self.dfObserved.columns, index = self.dfObserved.index)
        
        self._print_chisquare_result(colX,alpha)


#Initialize ChiSquare Class
cT = ChiSquare(df)
# train_df.columns
# Feature Selection
col=df.columns[:-1]
testColumns = col
for var in testColumns:
    cT.TestIndependence(colX=var,colY=57 ) 

0 is IMPORTANT for Prediction
1 is IMPORTANT for Prediction
2 is IMPORTANT for Prediction
3 is IMPORTANT for Prediction
4 is IMPORTANT for Prediction
5 is IMPORTANT for Prediction
6 is IMPORTANT for Prediction
7 is IMPORTANT for Prediction
8 is IMPORTANT for Prediction
9 is IMPORTANT for Prediction
10 is IMPORTANT for Prediction
11 is IMPORTANT for Prediction
12 is IMPORTANT for Prediction
13 is IMPORTANT for Prediction
14 is IMPORTANT for Prediction
15 is IMPORTANT for Prediction
16 is IMPORTANT for Prediction
17 is IMPORTANT for Prediction
18 is IMPORTANT for Prediction
19 is IMPORTANT for Prediction
20 is IMPORTANT for Prediction
21 is IMPORTANT for Prediction
22 is IMPORTANT for Prediction
23 is IMPORTANT for Prediction
24 is IMPORTANT for Prediction
25 is IMPORTANT for Prediction
26 is IMPORTANT for Prediction
27 is IMPORTANT for Prediction
28 is IMPORTANT for Prediction
29 is IMPORTANT for Prediction
30 is IMPORTANT for Prediction
31 is IMPORTANT for Prediction
32 is NOT an impor

Here there are 8 features which can be successfully dropped (as they won't be affecting the accuracy of our model by significant amount)

In [9]:
train_df=df.drop(32,axis=1)
train_df=train_df.drop(37,axis=1)
train_df=train_df.drop(40,axis=1)
train_df=train_df.drop(41,axis=1)
train_df=train_df.drop(43,axis=1)
train_df=train_df.drop(47,axis=1)
train_df=train_df.drop(50,axis=1)
train_df=train_df.drop(46,axis=1)

In [10]:
train_df.columns

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34,
            35, 36, 38, 39, 42, 44, 45, 48, 49, 51, 52, 53, 54, 55, 56, 57],
           dtype='int64')

##### Now I am checking whether there is correlation between any of the remaining features present, if there is we can safely drop this correlated feature

In [11]:
correlated_features = set()  
correlation_matrix = train_df.corr()  
for i in range(len(correlation_matrix .columns)):  
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.85:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)
(correlated_features)

{33}

In [12]:
train_df=train_df.drop(33,axis=1)

Changing the name of the columns for better inference

In [13]:
train_df.rename(columns=lambda x: "col"+str(x), inplace=True)

In [14]:
train_df.head()

df_train = train_df.drop("col57",axis=1)

y =  train_df["col57"]


### Spliting the data into training and testing data set

In [15]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(df_train,y,test_size=0.25,random_state=0, stratify=y)

#### Now we have pre-processed our data and splitted it into training and testing sets

### Building Clasifiers

### KNN Classifier : 1-Neighbour

In [16]:
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0) 

print("Accuracy  F-pos   F-neg   Error rate")
errors = []

for train,test in kfold.split(df_train,y):
    neigh = KNeighborsClassifier(n_neighbors=1)
    nfit = neigh.fit(X_train, Y_train)
    result1 = nfit.predict(X_test)
    accuracy_score(Y_test, result1)
    neigh = neigh.fit(df_train.iloc[train],y.iloc[train])
    y_pred=neigh.predict(df_train.iloc[test])
    score = accuracy_score(y.iloc[test],y_pred)
    cm = confusion_matrix(y.iloc[test], y_pred)
    fraction_cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    errors.append(1-score)
    print("{0:.4f}   {1:.4f}   {2:.4f}   {3:.4f}".format(score, fraction_cm[0][1], fraction_cm[1][0], (1-score)))

errors = np.array(errors)
print("Overall Error rate : {0:.4f}".format(errors.mean()))


Accuracy  F-pos   F-neg   Error rate
0.8308   0.1362   0.2198   0.1692
0.8395   0.1183   0.2253   0.1605
0.8113   0.1398   0.2637   0.1887
0.8196   0.1470   0.2320   0.1804
0.8065   0.1792   0.2155   0.1935
0.8348   0.1111   0.2486   0.1652
0.8087   0.1613   0.2376   0.1913
0.8370   0.1326   0.2099   0.1630
0.8627   0.1115   0.1768   0.1373
0.8344   0.1619   0.1713   0.1656
Overall Error rate : 0.1715


### KNN Classifier : 10-Neighbour

In [17]:
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import confusion_matrix
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0) 

print("Accuracy  F-pos   F-neg   Error rate")
errors = []

for train,test in kfold.split(df_train,y):
    neigh = KNeighborsClassifier(n_neighbors=10)
    nfit = neigh.fit(X_train, Y_train)
    result1 = nfit.predict(X_test)
    accuracy_score(Y_test, result1)
    neigh = neigh.fit(df_train.iloc[train],y.iloc[train])
    y_pred=neigh.predict(df_train.iloc[test])
    score = accuracy_score(y.iloc[test],y_pred)
    cm = confusion_matrix(y.iloc[test], y_pred)
    fraction_cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    errors.append(1-score)
    print("{0:.4f}   {1:.4f}   {2:.4f}   {3:.4f}".format(score, fraction_cm[0][1], fraction_cm[1][0], (1-score)))

errors = np.array(errors)
print("Overall Error rate : {0:.4f}".format(errors.mean()))


Accuracy  F-pos   F-neg   Error rate
0.7831   0.1219   0.3626   0.2169
0.7983   0.1075   0.3462   0.2017
0.7939   0.1362   0.3132   0.2061
0.7848   0.1254   0.3536   0.2152
0.7826   0.1470   0.3260   0.2174
0.7761   0.1183   0.3867   0.2239
0.7696   0.1434   0.3646   0.2304
0.7739   0.1290   0.3757   0.2261
0.7996   0.1223   0.3204   0.2004
0.8235   0.1151   0.2707   0.1765
Overall Error rate : 0.2115


### Random Forest classifier

#### Setting the parameters 

In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import model_selection

cv_split = model_selection.ShuffleSplit(n_splits = 100, test_size = .2, train_size = .6, random_state = 0)
dtree = tree.DecisionTreeClassifier(random_state = 0)
base_results = model_selection.cross_validate(dtree, X_train, Y_train, cv  = 10)
dtree.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

### Confusion Matrix (False -pos, False -neg, Error Rate) for all Folds : Random Forest

In [19]:
from sklearn.model_selection import StratifiedKFold, KFold
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0) #Kfold gives overall error of 0.0724

print("Accuracy  F-pos   F-neg   Error rate")
errors = []

for train,test in kfold.split(df_train,y):
    dtree = tree.DecisionTreeClassifier(random_state = 0)
    dtree = dtree.fit(df_train.iloc[train],y.iloc[train])
    y_pred=dtree.predict(df_train.iloc[test])
    score = accuracy_score(y.iloc[test],y_pred)
    cm = confusion_matrix(y.iloc[test], y_pred)
    fraction_cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    errors.append(1-score)
    print("{0:.4f}   {1:.4f}   {2:.4f}   {3:.4f}".format(score, fraction_cm[0][1], fraction_cm[1][0], (1-score)))

errors = np.array(errors)
print("Overall Error rate : {0:.4f}".format(errors.mean()))

Accuracy  F-pos   F-neg   Error rate
0.9197   0.0609   0.1099   0.0803
0.9219   0.0789   0.0769   0.0781
0.9046   0.0753   0.1264   0.0954
0.9283   0.0645   0.0829   0.0717
0.9065   0.0753   0.1215   0.0935
0.9109   0.0717   0.1160   0.0891
0.9152   0.0466   0.1436   0.0848
0.9152   0.0753   0.0994   0.0848
0.9325   0.0540   0.0884   0.0675
0.8998   0.1079   0.0884   0.1002
Overall Error rate : 0.0845


Here we got 92% accuracy with the decision tress classifier and now let us check crossvalidated scores for each fold to see if we are over-fitting

In [20]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score

scores = cross_val_score(dtree, X_train, Y_train, cv=10)
print ("Crossvalidated scores:", scores)

Crossvalidated scores: [0.8699422  0.88695652 0.91884058 0.91304348 0.92463768 0.89855072
 0.92463768 0.93623188 0.91014493 0.88953488]


### Logistic Regression classifier

In [21]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

clf = clf.fit(X_train,Y_train)
y_pred=clf.predict(X_test)

In [22]:
print('Score :',clf.score(X_test,Y_test),'\n\n')
print('Accuracy score : ', accuracy_score(Y_test,y_pred),'\n\n')
print('Classification report:\n',classification_report(Y_test,y_pred))

Score : 0.9322328410078193 


Accuracy score :  0.9322328410078193 


Classification report:
               precision    recall  f1-score   support

           0       0.94      0.95      0.94       697
           1       0.92      0.91      0.91       454

   micro avg       0.93      0.93      0.93      1151
   macro avg       0.93      0.93      0.93      1151
weighted avg       0.93      0.93      0.93      1151



Here I got an accuracy of 93% for my logistic regression classifier

##### Logistic Regression has better accuracy than above models

### Confusion Matrix (False -pos, False -neg, Error Rate) for all Folds : Logistic Regression Classifier

In [23]:
from sklearn.model_selection import StratifiedKFold, KFold
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0) #Kfold gives overall error of 0.0724

print("Accuracy  F-pos   F-neg   Error rate")
errors = []

for train,test in kfold.split(df_train,y):
    clf = LogisticRegression()
    clf = clf.fit(df_train.iloc[train],y.iloc[train])
    y_pred_log=clf.predict(df_train.iloc[test])
    score = accuracy_score(y.iloc[test],y_pred_log)
    cm = confusion_matrix(y.iloc[test], y_pred_log)
    fraction_cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    errors.append(1-score)
    print("{0:.4f}   {1:.4f}   {2:.4f}   {3:.4f}".format(score, fraction_cm[0][1], fraction_cm[1][0], (1-score)))

errors = np.array(errors)
print("Overall Error rate : {0:.4f}".format(errors.mean()))

Accuracy  F-pos   F-neg   Error rate
0.9197   0.0394   0.1429   0.0803
0.9523   0.0287   0.0769   0.0477
0.9046   0.0609   0.1484   0.0954
0.9065   0.0824   0.1105   0.0935
0.9217   0.0502   0.1215   0.0783
0.9174   0.0645   0.1105   0.0826
0.9196   0.0287   0.1602   0.0804
0.9239   0.0394   0.1326   0.0761
0.9281   0.0432   0.1160   0.0719
0.9237   0.0647   0.0939   0.0763
Overall Error rate : 0.0782


### AdaBoost Classifier

In [24]:
from sklearn.datasets import make_gaussian_quantiles
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

bdt_real = AdaBoostClassifier(DecisionTreeClassifier(random_state = 0),n_estimators=100,learning_rate=0.5)
bdt_real.fit(X_train, Y_train)
real_test_predict = bdt_real.predict(X_test)
accuracy_score(real_test_predict, Y_test)

0.9331016507384883

In [25]:
from sklearn import metrics
scores = cross_val_score(bdt_real, X_train, Y_train, cv=10)
print ("Crossvalidated scores: for ADABOOST", scores)

Crossvalidated scores: for ADABOOST [0.86416185 0.95072464 0.95942029 0.93333333 0.96231884 0.91884058
 0.95072464 0.93913043 0.90434783 0.89825581]


### Confusion Matrix (False -pos, False -neg, Error Rate) for all Folds : AdaBoost classifier

In [26]:
from sklearn.model_selection import StratifiedKFold, KFold
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0) #Kfold gives overall error of 0.0724

print("Accuracy  F-pos   F-neg   Error rate")
errors = []


for train,test in kfold.split(df_train,y):
    bdt_real = AdaBoostClassifier(DecisionTreeClassifier(random_state = 0),n_estimators=100,learning_rate=0.5)
    bdt_real = bdt_real.fit(df_train.iloc[train],y.iloc[train])
    y_pred=bdt_real.predict(df_train.iloc[test])
    score = accuracy_score(y.iloc[test],y_pred)
    cm = confusion_matrix(y.iloc[test], y_pred)
    fraction_cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    errors.append(1-score)
    print("{0:.4f}   {1:.4f}   {2:.4f}   {3:.4f}".format(score, fraction_cm[0][1], fraction_cm[1][0], (1-score)))

errors = np.array(errors)
print("Overall Error rate : {0:.4f}".format(errors.mean()))

Accuracy  F-pos   F-neg   Error rate
0.9436   0.0287   0.0989   0.0564
0.9479   0.0538   0.0495   0.0521
0.9176   0.0466   0.1374   0.0824
0.9283   0.0609   0.0884   0.0717
0.9348   0.0430   0.0994   0.0652
0.9435   0.0394   0.0829   0.0565
0.9478   0.0143   0.1105   0.0522
0.9457   0.0323   0.0884   0.0543
0.9477   0.0288   0.0884   0.0523
0.9216   0.0971   0.0497   0.0784
Overall Error rate : 0.0622



### AdaBoost gave 94% accuracy

### Multi-Layer Perceptron (MLP)

In [27]:
from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler()  
# Don't cheat - fit only on training data
scaler.fit(X_train)  
St_train = scaler.transform(X_train)  
# apply same transformation to test data
St_test = scaler.transform(X_test)

from sklearn.neural_network import MLPClassifier
mlpnn = MLPClassifier(solver='lbfgs', alpha=1e-5,random_state=1)
mlpnn.fit(St_train, Y_train)
mlp_pred = mlpnn.predict(St_test)
accuracy_score(mlp_pred, Y_test)

0.9374456993918332

### Confusion Matrix (False -pos, False -neg, Error Rate) for all Folds : MLP classifier

In [28]:
from sklearn.model_selection import StratifiedKFold, KFold
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0) #Kfold gives overall error of 0.0724

print("Accuracy  F-pos   F-neg   Error rate")
errors = []


for train,test in kfold.split(df_train,y):
    mlpnn = MLPClassifier(solver='lbfgs', alpha=1e-5,random_state=1)
    mlpnn = mlpnn.fit(df_train.iloc[train],y.iloc[train])
    y_pred=mlpnn.predict(df_train.iloc[test])
    score = accuracy_score(y.iloc[test],y_pred)
    cm = confusion_matrix(y.iloc[test], y_pred)
    fraction_cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    errors.append(1-score)
    print("{0:.4f}   {1:.4f}   {2:.4f}   {3:.4f}".format(score, fraction_cm[0][1], fraction_cm[1][0], (1-score)))

errors = np.array(errors)
print("Overall Error rate : {0:.4f}".format(errors.mean()))

Accuracy  F-pos   F-neg   Error rate
0.8633   0.0968   0.1978   0.1367
0.8937   0.0860   0.1374   0.1063
0.8286   0.2186   0.0989   0.1714
0.8283   0.1864   0.1492   0.1717
0.6913   0.2115   0.4586   0.3087
0.8587   0.1254   0.1657   0.1413
0.7152   0.0932   0.5801   0.2848
0.8478   0.1183   0.2044   0.1522
0.8540   0.1367   0.1602   0.1460
0.8780   0.1403   0.0939   0.1220
Overall Error rate : 0.1741


## Hence, from above observation we can see that Adaboost, Random Forest and Logistic give us decent accuracies per fold and error rates, while KNN doesn't perform well and MLP Classifier has bad cross-validation scores. I will choose Adaboost as my classifier