# Machine Learning - Naive bayes classifier - Limpography data set

### Testing different parameters on the laplace smoothing

In [459]:
import pandas as pd;
import numpy as np;
import matplotlib.pyplot as plt;
import seaborn as sns 
import sklearn 
%matplotlib inline

import scipy as sc 
import statsmodels.api as sm
from collections import Counter


## 1) Load and preprocess data


In [488]:
x = pd.read_csv('datasets/lymphography.data', header=None)
y = x[0]
y = y.tolist()
print("full data head(2): \n {}".format(x.head(2)))
x.drop(x.columns[0], 1, inplace=True)

print("x head(2): \n {}".format(x.head(2)))
print("y[0:2]: \n {}".format(y[1]))

full data head(2): 
    0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16  17  18
0   3   4   2   1   1   1   1   1   2   1   2   2   2   4   8   1   1   2   2
1   2   3   2   1   1   2   2   1   2   1   3   3   2   3   4   2   2   2   2
x head(2): 
    1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16  17  18
0   4   2   1   1   1   1   1   2   1   2   2   2   4   8   1   1   2   2
1   3   2   1   1   2   2   1   2   1   3   3   2   3   4   2   2   2   2
y[0:2]: 
 2


## 2) Train Model

In [510]:
# i.a) Split data:

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

print(x_train.head(2))
print(y_train[0:2])

print(x_test.head(2))
print(y_test[0:2])

print(set(x_train[1]))

     1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16  17  18
11    4   2   1   1   1   2   1   1   1   3   3   4   3   8   3   2   2   2
127   2   2   1   1   1   2   1   2   1   2   2   3   3   4   2   1   2   1
[2, 2]
     1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16  17  18
51    3   2   1   1   1   2   1   2   1   2   2   2   4   8   3   1   2   3
124   4   2   1   1   1   2   1   1   1   2   2   3   3   5   2   1   2   1
[3, 2]
{1, 2, 3, 4}


In [511]:
# b) training discretized 

#i) compute priors:
label_counter = Counter(y)
print(label_counter)

metastases, malign_lymph, fibrosis, normal_find = label_counter.most_common()

n_meta = metastases[1]
n_malign = malign_lymph[1]
n_fib = fibrosis[1]
n_normal = normal_find[1]

print(n_metastases)
print(n_malign_lymph)
print(n_fibrosis)
print(n_normal_find)


p_meta = metastases[1]/(metastases[1]+malign_lymph[1]+fibrosis[1]+normal_find[1])
p_malign = malign_lymph[1]/(metastases[1]+malign_lymph[1]+fibrosis[1]+normal_find[1])
p_fib = fibrosis[1]/(metastases[1]+malign_lymph[1]+fibrosis[1]+normal_find[1])
p_normal = normal_find[1]/(metastases[1]+malign_lymph[1]+fibrosis[1]+normal_find[1])


Counter({2: 81, 3: 61, 4: 4, 1: 2})
81
61
4
2


In [512]:
#ii) compute conditional probailities with laplace smoothing 

# a) make data structure
bayes_p = []
bayes_p_e = []

def make_dataStructure(bayes_p,s):
    for att_n in x_train:
        d = {}
        d2 = {}
        d3 = {}
        d4 = {}
        for att in set(x_train[att_n]):
            d.update({att:s})
            d2.update({att:s})
            d3.update({att:s})
            d4.update({att:s})
        bayes_p.append({"1":d, "2":d2, "3":d3, "4":d4})
        
make_dataStructure(bayes_p,0)
make_dataStructure(bayes_p_e,0)

In [505]:
print(bayes_p[0])
print(bayes_p_e[0])

{'1': {1: 0, 2: 0, 3: 0, 4: 0}, '2': {1: 0, 2: 0, 3: 0, 4: 0}, '3': {1: 0, 2: 0, 3: 0, 4: 0}, '4': {1: 0, 2: 0, 3: 0, 4: 0}}
{'1': {1: 0, 2: 0, 3: 0, 4: 0}, '2': {1: 0, 2: 0, 3: 0, 4: 0}, '3': {1: 0, 2: 0, 3: 0, 4: 0}, '4': {1: 0, 2: 0, 3: 0, 4: 0}}


In [513]:
# b) count to get probabilities

def count(bayes_p):
    for column_n in range(len(x_train.columns)): #every column
        n = 0
        for att in x_train[column_n+1]:            #every row
            bayes_p[column_n][str(y_train[n])][att]+=1
            n+=1
count(bayes_p)
count(bayes_p_e)
        


In [507]:
print(bayes_p[0])
print(bayes_p_e[0])

label_counter = Counter(x_train[1])
print(label_counter)

{'1': {1: 2, 2: 0, 3: 0, 4: 0}, '2': {1: 0, 2: 35, 3: 24, 4: 13}, '3': {1: 0, 2: 27, 3: 13, 4: 15}, '4': {1: 0, 2: 0, 3: 4, 4: 0}}
{'1': {1: 2, 2: 0, 3: 0, 4: 0}, '2': {1: 0, 2: 35, 3: 24, 4: 13}, '3': {1: 0, 2: 27, 3: 13, 4: 15}, '4': {1: 0, 2: 0, 3: 4, 4: 0}}
Counter({2: 62, 3: 41, 4: 28, 1: 2})


In [None]:
# print(x_train.head(2))
# print(range(len(x_train.columns)))
# for i in range(len(x_train.columns)):
#     print(x_train[i+1].head(2))
#     print(set(x_train[i+1]))
#     print(bayes_p[i])

## 3) Predict

In [514]:
predictions_addk = []
predictions_e = []

print(predictions_addk[0:10])
print(predictions_e[0:10])

def predict(test_dataStructure, pred_list,e, smth, k):
    for instance in x_test.iterrows():
        column_n = 0
        instance_p_meta = np.log(p_meta)
        instance_p_malign = np.log(p_malign)
        instance_p_fib = np.log(p_fib)
        instance_p_normal = np.log(p_normal)
        li = []

        for att in instance[1]:
            if smth=='addk':
                mt = test_dataStructure[column_n]['2'][att] 
                ml = test_dataStructure[column_n]['3'][att]
                fb = test_dataStructure[column_n]['4'][att]
                n = test_dataStructure[column_n]['1'][att]
                pmt = (mt+k)/(n_meta+k*len(set(test_dataStructure[column_n]['2'])))
                pml = (ml+k)/(n_malign+k*len(set(test_dataStructure[column_n]['3'])))
                pfb = (fb+k)/(n_fib+(k*len(set(test_dataStructure[column_n]['4']))))
                pn = (n+k)/(n_normal+k*len(set(test_dataStructure[column_n]['1'])))
            else:
                try:
                    mt = test_dataStructure[column_n]['2'][att] 
                    ml = test_dataStructure[column_n]['3'][att]
                    fb = test_dataStructure[column_n]['4'][att]
                    n = test_dataStructure[column_n]['1'][att]
                    pmt = mt/n_meta
                    pml = ml/n_malign
                    pfb = fb/n_fib
                    pn = n/n_normal
                except:
                    pmt = e
                    pml = e
                    pfb = e
                    pn = e           

            instance_p_meta += np.log(pmt)
            instance_p_malign += np.log(pml)
            instance_p_fib += np.log(pfb)
            instance_p_normal += np.log(pn)

            column_n+=1

        li.append(instance_p_meta)
        li.append(instance_p_malign)
        li.append(instance_p_fib)
        li.append(instance_p_normal)

        if instance_p_meta==max(li):
            pred_list.append('2')
        elif instance_p_malign==max(li):
            pred_list.append('3')
        elif instance_p_fib==max(li):
            pred_list.append('4')
        elif instance_p_normal==max(li):
            pred_list.append('1')
            
predict(bayes_p, predictions_addk, 2,'addk',2)
predict(bayes_p_e, predictions_e, 0.00001,'e',0)
#note: the values of the 3,4 and 5th params depend on the version of smoothing and may be irrelevant

print(predictions_addk[0:10])
print(predictions_e[0:10])

[]
[]
['3', '2', '2', '2', '2', '3', '2', '3', '3', '2']
['3', '2', '2', '2', '2', '3', '3', '3', '3', '2']




## 4) Evaluate:

In [515]:
# calculate accuracy and compare with 0R's accuracy

# make 0R and calculate accuracy
y_test = list(map(str, y_test))
zero_r = []
for i in y_test:
    zero_r.append('2')

correct = 0
for i in range(len(zero_r)):
    if zero_r[i]==y_test[i]: correct+=1
print(correct)
    
accuracy_zero_r = correct/(len(y_test))
print("0R's accuracy: {}" .format(accuracy_zero_r))

#calculate predictions accuracy

def accuracy(p):
    correct = 0
    for i in range(len(p)):
        if p[i]==y_test[i]: correct+=1 
    print(correct)
    accuracy = correct/(len(y_test))
    print("model's accuracy: {}" .format(accuracy))
    
accuracy(predictions_addk)
accuracy(predictions_e)


7
0R's accuracy: 0.4666666666666667
12
model's accuracy: 0.8
11
model's accuracy: 0.7333333333333333


In [501]:
# for instance in x_test.iterrows():
#     column_n = 0
#     for att in instance[1]:
#         print("att: {}".format(att))
#         print("bayes_p[{}]: {}".format(column_n,bayes_p[column_n]))
#         print("bayes_p[{}]['2']: {}".format(column_n,bayes_p[column_n]['2']))
#         print("bayes_p[{}]['2'][{}]: {}\n".format(column_n,att,bayes_p[column_n]['2'][att]))
#         column_n+=1

In [497]:
print(bayes_p[0]['2'][2])
print(bayes_p_e[0]['2'][2])

34
34
