# In [2]:  (notebook cell prompt left over from the export)
# -*- coding: utf-8 -*-
"""NaiveBayes.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1wZuiMBQrwFC7ugwK_M6UgJ-7PcT9HgCH

# Importing Libraries
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold
import string
import sys
from collections import defaultdict
from random import randrange
from tqdm import tqdm

"""# Importing Data"""

# Read the raw file with '\n' as the delimiter and no header row, so the text
# of each line is expected to land in a single column (column 0).
# NOTE(review): recent pandas releases reportedly reject '\n' as a
# delimiter -- confirm against the installed pandas version.
data = pd.read_csv('a1_data/a1_d3.txt',delimiter='\n',header = None)

# Split every line on tab characters, one resulting field per column.
data = data[0].str.split('\t',expand=True)

# Notebook preview of the first rows (no effect when run as a script).
data.head()

# Everything except the last column is the feature matrix (kept 2-D);
# the last column is the label vector.
X = data.iloc[:,:-1].values
y = data.iloc[:,-1].values

# 80/20 split, stratified on the labels. No random_state is passed, so the
# split differs from run to run.
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.8,stratify=y)

"""# Naive Bayes Model"""

class NaiveBayes:
  """Multinomial Naive Bayes text classifier with additive smoothing.

  Every sample is a sequence whose first element is the raw text (for
  example one row of an ``(n_samples, 1)`` array). Text is split on
  whitespace, stripped of ASCII punctuation and lower-cased; tokens that
  become empty after stripping are ignored.

  Attributes populated by ``fit``:
    vocab: set of distinct training tokens.
    classes: set of distinct training labels.
    prior: label -> log prior probability of the label.
    class_all_data: label -> list of every token seen under that label.
    count: label -> {token: raw occurrence count}.
    logliklihood: label -> {token: log P(token | label)}.
  """

  # Translation table that deletes ASCII punctuation; built once and shared.
  _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

  def __init__(self):
    self.vocab = None
    self.prior = defaultdict(int)
    self.classes = None
    self.class_all_data = defaultdict(list)
    self.count = None
    self.logliklihood = None

  @classmethod
  def _tokenize(cls, line):
    """Return the normalised, non-empty tokens of ``line[0]``."""
    cleaned = (w.translate(cls._PUNCT_TABLE).lower() for w in line[0].split())
    # A token made only of punctuation collapses to '' and is not a word.
    return [w for w in cleaned if w]

  def compute_vocab(self,X):
    """Rebuild ``self.vocab`` from the texts in ``X``."""
    self.vocab = set()
    for line in X:
      self.vocab.update(self._tokenize(line))

  def all_data(self,X,y):
    """Append the tokens of each sample to the pool of its label."""
    for line,label in zip(X,y):
      self.class_all_data[label].extend(self._tokenize(line))

  def fit(self,X,y,smooth=1):
    """Estimate log priors and smoothed log likelihoods from training data.

    Args:
      X: samples; ``X[i][0]`` is the text of sample ``i``.
      y: labels, one per sample (any 1-D sequence).
      smooth: additive smoothing constant; must not be negative. With 0,
        tokens unseen in a class get a log likelihood of -inf.

    Raises:
      ValueError: if ``X`` is empty, ``X`` and ``y`` differ in length, or
        ``smooth`` is negative.
    """
    y = np.asarray(y)
    N = len(X)
    if N == 0:
      raise ValueError('cannot fit on an empty training set')
    if len(y) != N:
      raise ValueError('X and y must contain the same number of samples')
    if smooth < 0:
      raise ValueError('smooth must be non-negative')

    # Start from a clean slate so that calling fit twice does not
    # accumulate tokens or priors from the previous run.
    self.prior = defaultdict(int)
    self.class_all_data = defaultdict(list)

    self.compute_vocab(X)
    self.classes = set(y)

    #make pool with respect to classes
    self.all_data(X,y)

    vocab_size = len(self.vocab)
    self.count = {}
    self.logliklihood = {}
    for c in self.classes:
      pool = self.class_all_data[c]

      # Single pass over the pool instead of list.count() per vocab word.
      tally = dict.fromkeys(self.vocab,0)
      for word in pool:
        tally[word] += 1
      self.count[c] = tally

      self.prior[c] = np.log(np.sum(y == c) / N)

      # Denominator is the total number of tokens in the class plus the
      # smoothing mass, so the likelihoods of a class sum to one.
      denominator = len(pool) + smooth * vocab_size
      self.logliklihood[c] = {
          word: np.log((n_word + smooth) / denominator)
          for word,n_word in tally.items()
      }

  def predict(self,X):
    """Return the most probable label for every sample in ``X``.

    Tokens that are not in the training vocabulary are ignored.

    Raises:
      RuntimeError: if the model has not been fitted yet.
    """
    if self.vocab is None:
      raise RuntimeError('Train Model First')
    y_pred = []
    for line in X:
      sums = {c: self.prior[c] for c in self.classes}
      for word in self._tokenize(line):
        if word in self.vocab:
          for c in self.classes:
            sums[c] += self.logliklihood[c][word]
      y_pred.append(max(sums,key = sums.get))
    return np.asarray(y_pred)

  def score(self,X_test,y_test):
    """Return the fraction of samples in ``X_test`` classified correctly.

    ``y_test`` must use the same label values as the training labels.

    Raises:
      ValueError: if ``y_test`` is empty.
    """
    y_actual = np.asarray(y_test)
    if y_actual.size == 0:
      raise ValueError('cannot score an empty test set')
    y_predicted = self.predict(X_test)
    return np.mean(y_actual == y_predicted)

# Create a classifier instance.
model = NaiveBayes()

# Notebook inspection of the training matrix shape (no effect in a script).
X_train.shape

# Train on the training split.
model.fit(X_train,y_train)

# Predictions on the data the model was trained on.
y_train_pred = model.predict(X_train)

# Predictions on the held-out split.
y_test_pred = model.predict(X_test)

"""# Performance Measure"""

def accuracy(y_actual,y_predicted):
  """Return the fraction of predictions that equal the true labels.

  Both inputs are converted to integer arrays, so numeric strings such as
  '0' / '1' are accepted. Labels are compared for equality, which keeps the
  result correct for any integer label set, not only 0/1.

  Args:
    y_actual: true labels (array-like).
    y_predicted: predicted labels, same shape as ``y_actual``.

  Returns:
    Accuracy in the range [0, 1].

  Raises:
    ValueError: if the inputs are empty or their shapes differ.
  """
  y_actual = np.asarray(y_actual).astype(int)
  y_predicted = np.asarray(y_predicted).astype(int)
  if y_actual.shape != y_predicted.shape:
    raise ValueError('y_actual and y_predicted must have the same shape')
  if y_actual.size == 0:
    raise ValueError('cannot compute accuracy of an empty label set')
  return np.mean(y_actual == y_predicted)

# Accuracy on the training split (the value is only displayed in a notebook).
accuracy(y_train,y_train_pred)

# Accuracy on the held-out split.
accuracy(y_test,y_test_pred)

def confusion(y_actual,y_predicted):
  """Build a 2x2 confusion matrix for labels 0 and 1.

  Layout of the returned float array:
    [0,0] actual 1, predicted 1
    [1,1] actual 0, predicted 0
    [1,0] actual 0, predicted 1
    [0,1] every remaining pair
  """
  both_one, both_zero, zero_as_one, remaining = (0, 0), (1, 1), (1, 0), (0, 1)
  matrix = np.zeros((2,2))
  for idx, actual in enumerate(y_actual):
    if actual==1 and y_predicted[idx]==1:
      cell = both_one
    elif actual==0 and y_predicted[idx]==0:
      cell = both_zero
    elif actual==0 and y_predicted[idx]==1:
      cell = zero_as_one
    else:
      cell = remaining
    matrix[cell] += 1
  return matrix

def f_score(y_actual,y_predicted):
  """Return the F1 score of the positive class (label 1).

  Inputs are converted to integer arrays and tallied with ``confusion``,
  whose cells are read as: [0,0] true positives, [0,1] false negatives,
  [1,0] false positives.

  Args:
    y_actual: true labels (array-like of 0/1 or numeric strings).
    y_predicted: predicted labels, aligned with ``y_actual``.

  Returns:
    The F1 score, or 0.0 when there are no true positives, false positives
    or false negatives at all (the score would otherwise be 0/0).
  """
  y_actual = np.asarray(y_actual).astype(int)
  y_predicted = np.asarray(y_predicted).astype(int)
  cm = confusion(y_actual,y_predicted)
  true_pos = cm[0,0]
  false_neg = cm[0,1]
  false_pos = cm[1,0]
  # F1 = 2PR/(P+R) simplifies to 2TP/(2TP+FP+FN); this form needs only one
  # zero check instead of separate guards on precision and recall.
  denominator = 2 * true_pos + false_pos + false_neg
  if denominator == 0:
    return 0.0
  return (2 * true_pos) / denominator

def cross_validation_split(dataset, fold):
  """Randomly partition ``dataset`` into ``fold`` equally sized folds.

  Each fold receives ``len(dataset) // fold`` items drawn without
  replacement; when the length is not divisible by ``fold`` the leftover
  items are not assigned to any fold. ``dataset`` itself is not modified.

  Args:
    dataset: iterable of rows.
    fold: number of folds, a positive integer.

  Returns:
    A list of ``fold`` lists of rows.

  Raises:
    ValueError: if ``fold`` is smaller than 1.
  """
  if fold < 1:
    raise ValueError('fold must be a positive integer')
  remaining = list(dataset)
  fold_size = len(remaining) // fold
  dataset_split = []
  for _ in range(fold):
    current = []
    while len(current) < fold_size:
      # Draw without replacement from the rows not yet assigned.
      current.append(remaining.pop(randrange(len(remaining))))
    dataset_split.append(current)
  return dataset_split

def k_folds_eval(data,folds=5):
  """Run k-fold cross-validation of ``NaiveBayes`` and print the results.

  The rows of ``data`` are randomly split into ``folds`` folds. Each fold
  is used once as the test set while the model is trained on all the other
  folds. Mean and standard deviation of accuracy and F-score over the
  folds are printed.

  Args:
    data: DataFrame whose last column is the label and whose remaining
      columns are the features (the text is read from the first one).
    folds: number of folds, at least 2.

  Raises:
    ValueError: if ``folds`` is smaller than 2.
  """
  if folds < 2:
    raise ValueError('folds must be at least 2')
  rows = list(data.iloc[:,:].values)
  splits = cross_validation_split(rows,fold=folds)
  accu = []
  f_sco = []
  for i in tqdm(range(folds)):
    # Train on every fold except the held-out one, so the model never
    # sees the rows it is evaluated on.
    train = np.asarray(
        [row for x,part in enumerate(splits) if x != i for row in part])
    test = np.asarray(splits[i])
    X_train,y_train = train[:,:-1],train[:,-1]
    X_test,y_test = test[:,:-1],test[:,-1]

    model = NaiveBayes()
    model.fit(X_train,y_train)
    y_pred_test = model.predict(X_test)
    accu.append(accuracy(y_test,y_pred_test))
    f_sco.append(f_score(y_test,y_pred_test))
  accu = np.asarray(accu)
  f_sco = np.asarray(f_sco)
  print('\nAccuracy')
  print(np.mean(accu),' ',u'\u00b1',' ',np.std(accu))
  print('\nF-score')
  print(np.mean(f_sco),' ',u'\u00b1',' ',np.std(f_sco))

# Cross-validate on the full data set with the default number of folds.
k_folds_eval(data)

# Scratch check of the splitter: split all rows into 5 folds ...
datat = list(data.iloc[:,:].values)
qwer = cross_validation_split(datat,5)

# ... and inspect the size of the second fold (displayed only in a notebook).
len(qwer[1])
# naivebayes.py
# Displaying naivebayes.py.
#
# Captured output from a previous run:
# FileNotFoundError: [Errno 2] File b'a1_d3.txt' does not exist: b'a1_d3.txt'