In [79]:
import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare top-level expression in a cell, not just the
# last one; later cells rely on this to show several tables from one cell.
InteractiveShell.ast_node_interactivity='all'

In [102]:
# The CSV files have no header row. Positional columns 0-8 become the feature
# columns x0..x8 and positional column 9 becomes the class label y.
name = {i: ('x' + str(i) if i < 9 else 'y') for i in range(10)}
train_df = pd.read_csv('spambasetrain.csv', header=None).rename(columns=name)
test_df = pd.read_csv('spambasetest.csv', header=None).rename(columns=name)
train_df.head(10)

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,y
0,0.0,0.316,0.0,0.0,0.0,0.0,1.125,2,9,0
1,0.0,0.735,0.0,0.735,0.0,0.0,2.571,10,18,0
2,0.0,0.105,0.0,0.0,0.158,0.0,1.494,10,139,1
3,0.0,0.299,0.0,0.0,0.0,0.199,2.465,28,106,1
4,0.0,0.0,0.0,0.817,0.0,0.0,1.857,15,39,0
5,0.0,0.0,0.0,2.272,0.0,0.0,1.75,7,14,0
6,0.055,0.334,0.0,0.055,0.0,0.055,1.685,6,59,0
7,0.0,0.162,0.0,0.0,0.0,0.0,2.643,34,193,0
8,0.0,0.075,0.0,0.613,0.532,0.137,7.3,763,2453,1
9,0.0,0.052,0.0,0.078,0.0,0.235,3.153,121,618,0


In [81]:
def calc_Prob(df):
    """Estimate the two class priors from the label column ``y``.

    The mean of ``df['y']`` is used as the prior of class 1 and its
    complement as the prior of class 0 (this assumes ``y`` is coded 0/1).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a column named ``y``.

    Returns
    -------
    tuple
        ``(prior of class 0, prior of class 1)``.
    """
    spam_prior = df['y'].mean()
    non_spam_prior = 1 - spam_prior
    return non_spam_prior, spam_prior

In [86]:
# Class priors estimated from the training labels: p0 is the non-spam prior,
# p1 the spam prior.
p0, p1 = calc_Prob(train_df)
print('The estimated value of P(non_spam) is', p0, 'and of P(spam) is', p1)

The estimated value of P(non_spam) is 0.5981993997999333 and of P(spam) is 0.4018006002000667


In [97]:
def calc_gaussian_params(df, y):
    """Fit a univariate Gaussian to every feature for one class.

    Every column of ``df`` other than ``y`` is treated as a feature, so the
    function works for any number of features instead of exactly nine.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and a label column named ``y``.
    y : scalar
        Class value; only rows where ``df['y'] == y`` are used.

    Returns
    -------
    pandas.DataFrame
        One row per feature, in the column order of ``df``, with a default
        integer index and the columns ``'miu'`` (mean) and ``'sigma^2'``
        (sample variance, i.e. pandas' default ``ddof=1``).
    """
    feature_cols = [col for col in df.columns if col != 'y']
    # Filter the class rows once instead of re-querying twice per feature.
    class_rows = df.loc[df['y'] == y, feature_cols]
    return pd.DataFrame({
        'miu': class_rows.mean().to_numpy(dtype=float),
        'sigma^2': class_rows.var().to_numpy(dtype=float),
    })

In [105]:
# Fit the per-feature Gaussian parameters separately for class 0 (non-spam)
# and class 1 (spam) on the training data.
params_c0, params_c1 = (calc_gaussian_params(train_df, cls) for cls in (0, 1))
print('The estimateds (miu, sigma^2) for x_i and non_spam class are:')
params_c0

print('The estimateds (miu, sigma^2) for x_i and spam class are:')
params_c1

The estimateds (miu, sigma^2) for x_i and non_spam class are:


Unnamed: 0,miu,sigma^2
0,0.048426,0.088306
1,0.157487,0.07018
2,0.01696,0.004845
3,0.110713,0.417588
4,0.011503,0.00408
5,0.025892,0.089269
6,2.37659,37.447271
7,18.182274,1915.266813
8,164.143255,144134.168535


The estimateds (miu, sigma^2) for x_i and spam class are:


Unnamed: 0,miu,sigma^2
0,0.020247,0.007542
1,0.103439,0.032143
2,0.007847,0.002416
3,0.491662,0.547792
4,0.173151,0.129976
5,0.084901,0.527283
6,9.496347,2965.984486
7,97.209129,36369.991113
8,453.512863,614182.120474


In [42]:
def gaussian(x, miu, sigma):
    """Evaluate the normal probability density at ``x``.

    Note that, despite its name, ``sigma`` enters the formula as the
    variance (sigma squared), not as the standard deviation.

    Parameters
    ----------
    x : float or numpy.ndarray
        Point(s) at which the density is evaluated.
    miu : float
        Mean of the distribution.
    sigma : float
        Variance of the distribution.

    Returns
    -------
    float or numpy.ndarray
        ``exp(-(x - miu)**2 / (2 * sigma)) / sqrt(2 * pi * sigma)``.
    """
    exponent = -(x-miu)**2/(2*sigma)
    normaliser = np.sqrt(2*np.pi*sigma)
    return np.exp(exponent)/normaliser

In [106]:
def calc_label(df, params_c0, params_c1, p0, p1):
    """Predict a class for every row with Gaussian naive Bayes.

    For each class the score is ``log(prior)`` plus the sum over features of
    the Gaussian log-density. The log-density is evaluated directly rather
    than as ``log(pdf)``, so points far from both class means no longer
    underflow to ``-inf`` for both classes and fall through to class 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to classify. Must contain the feature columns ``x0`` ..
        ``x{k-1}``, where ``k`` is the number of rows in ``params_c0``.
        Any index is accepted; rows are processed by position.
    params_c0, params_c1 : pandas.DataFrame
        Per-feature parameters for class 0 and class 1, with the columns
        ``'miu'`` (mean) and ``'sigma^2'`` (variance); row ``i`` describes
        feature ``x{i}``.
    p0, p1 : float
        Prior probabilities of class 0 and class 1.

    Returns
    -------
    pandas.DataFrame
        A single integer column ``'label'`` with a default integer index,
        one row per row of ``df``. Class 0 is chosen only when its score is
        strictly greater; ties go to class 1.
    """
    n_features = len(params_c0)
    feature_cols = ['x' + str(i) for i in range(n_features)]
    features = df[feature_cols].to_numpy(dtype=float)

    def _log_posterior(params, prior):
        # Unnormalised log posterior of one class for every row at once.
        miu = params['miu'].to_numpy(dtype=float)[:n_features]
        var = params['sigma^2'].to_numpy(dtype=float)[:n_features]
        log_density = -0.5 * np.log(2 * np.pi * var) - (features - miu) ** 2 / (2 * var)
        return np.log(prior) + log_density.sum(axis=1)

    score_c0 = _log_posterior(params_c0, p0)
    score_c1 = _log_posterior(params_c1, p1)
    labels = np.where(score_c0 > score_c1, 0, 1).astype(int)
    return pd.DataFrame({'label': labels})

In [107]:
# Classify every test row with the class parameters and priors fitted on the
# training data; `labels` is a one-column frame ('label').
labels = calc_label(test_df, params_c0, params_c1, p0, p1)
print('The predicted classes for all the test examples are:')
labels

The predicted classes for all the test examples are:


Unnamed: 0,label
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [113]:
# Attach the predictions to the test frame and flag the rows that match.
test_df['y_'] = labels['label']
test_df['res'] = test_df['y'].eq(test_df['y_'])

# Count once and reuse; the reported "percentage error" is a fraction in [0, 1].
n_total = test_df.shape[0]
n_correct = sum(test_df['res'])
n_wrong = n_total - n_correct
print('The number of test examples classified correctly is ', n_correct)
print('The number of test examples classified incorrectly is ', n_wrong)
print('The percentage error on the test examples is ', n_wrong/n_total)

The number of test examples classified correctly is  160
The number of test examples classified incorrectly is  40
The percentage error on the test examples is  0.2


In [121]:
# Zero-R baseline: always predict the class that is most frequent in the
# TRAINING labels, then score that constant prediction on the test labels.
# Picking the majority class from the test labels themselves would leak test
# information and could overstate the baseline.
majority_class = int(train_df['y'].mean() > 0.5)
print('The accuracy that if we use Zero-R is', (test_df['y'] == majority_class).mean())

The accuracy that if we use Zero-R is 0.59
