<a href="https://colab.research.google.com/github/Ghazaleh99/Naive-Bayes/blob/main/Naive_Bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implementing Naive Bayes from Scratch in Python
with binary features and 5-fold cross-validation
## Machine Learning course


---

**Ghazaleh**

## Step 1 - Import the libraries

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math

## Step 2 - Setting up the Data

In [None]:
number_of_features = 16
# Column names: f0 .. f15 for the features, followed by 't' for the target.
h = [f'f{i}' for i in range(number_of_features)] + ['t']
df = pd.read_csv("vote.txt", sep=',', names=h)

### Step 2.1 - Data Frame table

In [None]:
# Preview the first five rows to check the f0..f15 / t column naming.
df.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,t
0,1,0,1,0,1,1,0,0,1,1,0,0,1,1,0,1,1
1,0,0,1,1,1,1,1,1,0,1,0,0,0,1,0,1,0
2,1,1,0,1,1,1,0,0,0,1,0,1,1,1,0,1,0
3,0,1,1,0,0,0,0,1,1,0,1,0,0,1,1,1,1
4,1,0,1,0,0,0,1,1,1,0,1,0,0,0,1,1,1


### Step 2.2 - Data Frame info

In [None]:
# Summarise the row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325 entries, 0 to 324
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   f0      325 non-null    int64
 1   f1      325 non-null    int64
 2   f2      325 non-null    int64
 3   f3      325 non-null    int64
 4   f4      325 non-null    int64
 5   f5      325 non-null    int64
 6   f6      325 non-null    int64
 7   f7      325 non-null    int64
 8   f8      325 non-null    int64
 9   f9      325 non-null    int64
 10  f10     325 non-null    int64
 11  f11     325 non-null    int64
 12  f12     325 non-null    int64
 13  f13     325 non-null    int64
 14  f14     325 non-null    int64
 15  f15     325 non-null    int64
 16  t       325 non-null    int64
dtypes: int64(17)
memory usage: 43.3 KB


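### Step 2.3 - Splitting the dataset into 5 folds

The split itself is performed inside the cross-validation functions below (`naive` and `naive_weighted`): fold *i* is the contiguous block of `len(df) // 5` rows starting at row `i * (len(df) // 5)`, and all remaining rows form the training set for that fold.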

## Step 3 - Train 

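#### Calculate the probability for each feature

This function returns:

*   the smoothed probability that feature `feature` takes the value `y` among the rows whose target column `t` equals `t`, i.e. (matching rows + 1) / (rows of that class + 2)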




In [None]:
def feature_probabilty(df, feature, t, y):
  """Estimate P(feature == y | target == t) with add-one smoothing.

  Args:
    df: DataFrame holding the feature columns and a target column named 't'.
    feature: Name of the feature column to evaluate.
    t: Target (class) value to condition on.
    y: Feature value whose conditional probability is wanted.

  Returns:
    (matches + 1) / (class_rows + 2) as a float, where class_rows is the
    number of rows with df['t'] == t and matches is how many of those rows
    also have df[feature] == y. With no rows of class t the result is 0.5.
  """
  # Boolean masks select rows by position, so the result does not depend on
  # the frame carrying a default 0..n-1 index (a label lookup such as
  # df['t'][i] raises KeyError or reads the wrong row otherwise).
  in_class = (df['t'] == t).to_numpy()
  has_value = (df[feature] == y).to_numpy()
  class_rows = int(in_class.sum())
  matches = int((in_class & has_value).sum())
  # The +1 / +2 terms keep the estimate strictly between 0 and 1, so a value
  # that never occurs in a class cannot zero out a whole likelihood product.
  return (matches + 1) / (class_rows + 2)

#### Function that returns a dictionary of all feature probabilities

In [None]:
def probabilty_calc(df1):
  """Build the table of smoothed conditional feature probabilities.

  Every column that appears before the target column 't' is treated as a
  binary feature. For a feature named F the result holds four entries keyed
  'p{F}x{v}_{c}', each the probability that F takes value v given class c:

    pFx0_0, pFx1_0 -- P(F=0 | t=0), P(F=1 | t=0)
    pFx0_1, pFx1_1 -- P(F=0 | t=1), P(F=1 | t=1)

  Args:
    df1: Training DataFrame with the feature columns followed by 't'.

  Returns:
    dict mapping the keys above to floats, in the order listed per feature.
  """
  probabilities = {}
  for name in df1:
    if name == 't':
      break  # columns from the target onwards are not features
    for label in (0, 1):
      # feature_probabilty takes the class first and the feature value second.
      # The class-1 entry must therefore be (label=1, value=0); passing (0, 1)
      # yields P(F=1 | t=0) and never looks at the class-1 rows at all.
      p_zero = feature_probabilty(df1, name, label, 0)
      probabilities[f'p{name}x0_{label}'] = p_zero
      probabilities[f'p{name}x1_{label}'] = 1 - p_zero  # binary feature
  return probabilities

In [None]:
def predict(probabilty_dict, df_test):
  """Classify every row of df_test from the conditional-probability table.

  For each row, the conditional probabilities of the observed feature values
  are multiplied together once per class. Class 0 is predicted only when its
  product is strictly larger, so ties go to class 1. No class prior is
  applied: both products start at 1.

  Args:
    probabilty_dict: Mapping with keys 'p{feature}x{value}_{class}' for every
      feature column, value in {0, 1} and class in {0, 1}.
    df_test: DataFrame whose feature columns precede the target column 't'.
      Feature values are read by truthiness (any non-zero value counts as 1).

  Returns:
    (predictions, error_rate): the predicted classes in row order, and the
    fraction of rows whose prediction differs from df_test['t'].

  Raises:
    ZeroDivisionError: if df_test has no rows.
  """
  # Features are the columns that come before the target column.
  feature_names = []
  for name in df_test:
    if name == 't':
      break
    feature_names.append(name)

  # Extract the data positionally once. Indexing df_test[col][i] per cell is a
  # label lookup, which fails (or reads another row) unless the index happens
  # to be 0..n-1, and is far slower.
  feature_rows = df_test[feature_names].to_numpy()
  targets = df_test['t'].to_numpy()

  predict_ans = []
  error = 0
  for row, target in zip(feature_rows, targets):
    product0, product1 = 1, 1
    for name, value in zip(feature_names, row):
      observed = 1 if value else 0
      product0 = product0 * probabilty_dict[f'p{name}x{observed}_0']
      product1 = product1 * probabilty_dict[f'p{name}x{observed}_1']

    guess = 0 if product0 > product1 else 1
    predict_ans.append(guess)
    if target != guess:
      error += 1

  return predict_ans, error/len(df_test)

In [None]:
def naive(df, k_fold=5):
  """Cross-validate the naive Bayes classifier and print the error rates.

  The rows are cut, in their existing order, into k_fold contiguous test
  blocks of len(df) // k_fold rows each. For every block the model is trained
  on all other rows and evaluated on the block. Rows left over after the last
  full block are never tested but are part of every training set.

  Args:
    df: DataFrame with the feature columns followed by the target column 't'.
    k_fold: Number of folds; defaults to 5.

  Prints:
    One 'fold N error:' line per fold, then the mean error over the folds.
  """
  m = len(df) // k_fold
  error_avg = 0
  for i in range(k_fold):
    start, stop = m*i, m*(i+1)

    df_test = df.iloc[start:stop].reset_index(drop=True)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to join the rows before and after the held-out block.
    df_train = pd.concat([df.iloc[:start], df.iloc[stop:]], ignore_index=True)

    probabilty_dict = probabilty_calc(df_train)
    _predictions, error = predict(probabilty_dict, df_test)
    error_avg += error
    print(f'fold {i+1} error:', error)
  print('Average error rate is', error_avg/k_fold)

In [None]:
# Run the 5-fold cross-validation; prints each fold's error rate and their average.
naive(df)

fold 1 error: 0.18461538461538463
fold 2 error: 0.06153846153846154
fold 3 error: 0.12307692307692308
fold 4 error: 0.15384615384615385
fold 5 error: 0.16923076923076924
Average error rate is 0.13846153846153847


##  Step 4 - Weighted Naive Bayes classifier

In [None]:
def predict_weighted(probabilty_dict, df_test, verbose=True):
  """Classify df_test rows and report a log-odds importance per feature.

  For a feature with alpha = P(x=1 | t=1) and beta = P(x=1 | t=0), the
  importance is log(alpha / (1 - alpha)) - log(beta / (1 - beta)). It is
  reported only; the prediction itself compares the per-class products of
  alpha*x + (1-alpha)*(1-x) and beta*x + (1-beta)*(1-x), with ties going to
  class 1.

  Args:
    probabilty_dict: Mapping that contains 'p{feature}x1_1' and
      'p{feature}x1_0' for every feature column. Each value must lie strictly
      between 0 and 1, otherwise the log-odds cannot be computed.
    df_test: DataFrame whose feature columns (holding 0/1 values) precede the
      target column 't'.
    verbose: When True (the default) print 'test i :' and the importance
      dict for every row.

  Returns:
    (predictions, error_rate): the predicted classes in row order, and the
    fraction of rows whose prediction differs from df_test['t'].

  Raises:
    ZeroDivisionError: if df_test has no rows.
  """
  # Features are the columns that come before the target column.
  feature_names = []
  for name in df_test:
    if name == 't':
      break
    feature_names.append(name)

  # The importance depends only on the probability table, not on the row being
  # classified, so it is computed once rather than once per test row.
  f_importance = {}
  for name in feature_names:
    alpha = probabilty_dict[f'p{name}x1_1']
    beta = probabilty_dict[f'p{name}x1_0']
    f_importance[name] = math.log(alpha/(1-alpha)) - math.log(beta/(1-beta))

  # Positional extraction: df_test[col][i] is a label lookup and breaks on any
  # frame whose index is not 0..n-1.
  feature_rows = df_test[feature_names].to_numpy()
  targets = df_test['t'].to_numpy()

  predict_ans = []
  error = 0
  for i, (row, target) in enumerate(zip(feature_rows, targets)):
    if verbose:
      print(f'test {i} :')

    product0, product1 = 1, 1
    for name, x in zip(feature_names, row):
      alpha = probabilty_dict[f'p{name}x1_1']
      beta = probabilty_dict[f'p{name}x1_0']
      product0 = product0 * (beta*x + (1-beta)*(1-x))
      product1 = product1 * (alpha*x + (1-alpha)*(1-x))

    if verbose:
      print(f_importance) # feature importance

    guess = 0 if product0 > product1 else 1
    predict_ans.append(guess)
    if target != guess:
      error += 1

  return predict_ans, error/len(df_test)

In [None]:
def naive_weighted(df, k_fold=5):
  """Cross-validate predict_weighted and print the error rates.

  The rows are cut, in their existing order, into k_fold contiguous test
  blocks of len(df) // k_fold rows each. For every block the probability
  table is built from all other rows and predict_weighted is evaluated on the
  block. Rows left over after the last full block are never tested but are
  part of every training set.

  Args:
    df: DataFrame with the feature columns followed by the target column 't'.
    k_fold: Number of folds; defaults to 5.

  Prints:
    Whatever predict_weighted prints, one 'fold N error weighted:' line and a
    separator per fold, then the mean error over the folds.
  """
  m = len(df) // k_fold
  error_avg_weighted = 0

  for i in range(k_fold):
    start, stop = m*i, m*(i+1)

    df_test = df.iloc[start:stop].reset_index(drop=True)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to join the rows before and after the held-out block.
    df_train = pd.concat([df.iloc[:start], df.iloc[stop:]], ignore_index=True)

    probabilty_dict = probabilty_calc(df_train)
    _predictions, error_weighted = predict_weighted(probabilty_dict, df_test)
    error_avg_weighted += error_weighted
    print(f'fold {i+1} error weighted:', error_weighted)
    print('_____________________________________________________________________')
  print('Average error rate is', error_avg_weighted/k_fold)

In [None]:
# Same 5-fold evaluation via predict_weighted; also prints the feature-importance dict for every test row.
naive_weighted(df)

test 0 :
{'f0': 1.8485178030466636, 'f1': -0.5488736914035204, 'f2': 2.716246968306388, 'f3': -7.5224002313871265, 'f4': -4.605170185988091, 'f5': -3.3300155271778227, 'f6': 1.0216512475319814, 'f7': 2.716246968306388, 'f8': 3.0081547935525483, 'f9': -0.5488736914035204, 'f10': 2.716246968306388, 'f11': -3.0081547935525483, 'f12': -3.3300155271778227, 'f13': -6.688077935644415, 'f14': 3.691653380996662, 'f15': -1.7380756940472195}
test 1 :
{'f0': 1.8485178030466636, 'f1': -0.5488736914035204, 'f2': 2.716246968306388, 'f3': -7.5224002313871265, 'f4': -4.605170185988091, 'f5': -3.3300155271778227, 'f6': 1.0216512475319814, 'f7': 2.716246968306388, 'f8': 3.0081547935525483, 'f9': -0.5488736914035204, 'f10': 2.716246968306388, 'f11': -3.0081547935525483, 'f12': -3.3300155271778227, 'f13': -6.688077935644415, 'f14': 3.691653380996662, 'f15': -1.7380756940472195}
test 2 :
{'f0': 1.8485178030466636, 'f1': -0.5488736914035204, 'f2': 2.716246968306388, 'f3': -7.5224002313871265, 'f4': -4.605170