In [1]:
import pandas as pd
from sklearn import cross_validation, ensemble, metrics

## Read spambase data file 

In [2]:
# The file has no header row, so columns get integer labels.
spambase = pd.read_csv("spambase.data", header=None)

In [3]:
# Report the dataset dimensions (rows, columns).
# Each message is formatted into a single string so it prints as plain text
# rather than a tuple repr, and reads the same under Python 2 and Python 3.
n_instances, n_attributes = spambase.shape
print("Number of Instances: {}".format(n_instances))
print("Number of Attributes: {}".format(n_attributes))

('Number of Instances: ', 4601)
('Number of Attributes: ', 58)


In [4]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
spambase.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
count,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,...,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0
mean,0.104553,0.213015,0.280656,0.065425,0.312223,0.095901,0.114208,0.105295,0.090067,0.239413,...,0.038575,0.13903,0.016976,0.269071,0.075811,0.044238,5.191515,52.172789,283.289285,0.394045
std,0.305358,1.290575,0.504143,1.395151,0.672513,0.273824,0.391441,0.401071,0.278616,0.644755,...,0.243471,0.270355,0.109394,0.815672,0.245882,0.429342,31.729449,194.89131,606.347851,0.488698
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.588,6.0,35.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.065,0.0,0.0,0.0,0.0,2.276,15.0,95.0,0.0
75%,0.0,0.0,0.42,0.0,0.38,0.0,0.0,0.0,0.0,0.16,...,0.0,0.188,0.0,0.315,0.052,0.0,3.706,43.0,266.0,1.0
max,4.54,14.28,5.1,42.81,10.0,5.88,7.27,11.11,5.26,18.18,...,4.385,9.752,4.081,32.478,6.003,19.829,1102.5,9989.0,15841.0,1.0


## Split class data (spam or ham category) from other variables  

In [5]:
# The last column holds the class label; every column before it is a feature.
X, y = spambase.iloc[:, :-1], spambase.iloc[:, -1]

In [6]:
# Tally rows per class label once and reuse the result for both lines.
# Label 0 is reported as ham and label 1 as spam.
# Formatting into one string avoids the tuple repr that
# `print ("label", value)` produces under Python 2.
class_counts = y.value_counts()
print("Ham: {}".format(class_counts[0]))
print("Spam: {}".format(class_counts[1]))

('Ham: ', 2788)
('Spam: ', 1813)


## Split data set into training and test groups 

In [7]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
# NOTE(review): `sklearn.cross_validation` was deprecated in scikit-learn 0.18
# and removed in 0.20 in favour of `sklearn.model_selection`; migrating needs a
# matching change to the import cell.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=8,
)

## Train using random forest classifier 

In [8]:
# Random forest using entropy (information gain) as the split criterion;
# the seed is fixed so repeated runs build the same forest.
rfc = ensemble.RandomForestClassifier(
    criterion="entropy",
    random_state=88,
)
# Later cells predict through `rfc_fit`, the value returned by fit().
rfc_fit = rfc.fit(X_train, y_train)

## Predict classes for training data set and check performance 

In [11]:
# Predict on the rows the model was fit on; this measures how well the forest
# reproduces its own training labels, not how well it generalises.
rfc_train = rfc_fit.predict(X_train)
# print() call form: valid on Python 3 and prints the same single string on
# Python 2. `labels` and `target_names` are paired by position (1 -> Spam, 0 -> Ham).
print(metrics.classification_report(y_train, rfc_train, labels=[1, 0], target_names=["Spam", "Ham"]))

             precision    recall  f1-score   support

       Spam       1.00      0.99      1.00      1450
        Ham       1.00      1.00      1.00      2230

avg / total       1.00      1.00      1.00      3680



## Predict classes for test data set and check performance

In [12]:
# Predict on the held-out rows to estimate performance on unseen data.
rfc_test = rfc_fit.predict(X_test)
# print() call form: valid on Python 3 and prints the same single string on
# Python 2. `labels` and `target_names` are paired by position (1 -> Spam, 0 -> Ham).
print(metrics.classification_report(y_test, rfc_test, labels=[1, 0], target_names=["Spam", "Ham"]))

             precision    recall  f1-score   support

       Spam       0.95      0.94      0.95       363
        Ham       0.96      0.97      0.97       558

avg / total       0.96      0.96      0.96       921



## We chose the random forest classifier from the sklearn.ensemble package because it averages the predictions of many decision tree classifiers, which improves predictive accuracy and helps control over-fitting.

## Precision was higher on the training data than on the test data, which is expected because the model was fit on the training data; even so, an average precision of 96% on the held-out test data is pretty good.