<a href="https://colab.research.google.com/github/GaiaSaveri/intro-to-ml/blob/main/solved-notebooks/SOLVED-Lab-7.KNNGaussianNaiveBayesTrees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classification with KNN, Trees and Gaussian Naive Bayes

In [91]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

Load and split the data from the Unsupervised Learning lab (Lab 5, Dry Bean Dataset):

In [None]:
FFILE = './Dry_Bean_Dataset.xlsx'
if os.path.isfile(FFILE): 
    print("File already exists")
    if os.access(FFILE, os.R_OK):
        print ("File is readable")
    else:
        print ("File is not readable, removing it and downloading again")
        !rm FFILE
        !wget "https://raw.github.com/alexdepremia/ML_IADA_UTs/main/Lab5/Dry_Bean_Dataset.xlsx"
else:
    print("Either the file is missing or not readable, download it")
    !wget "https://raw.github.com/alexdepremia/ML_IADA_UTs/main/Lab5/Dry_Bean_Dataset.xlsx"

In [None]:
# Load the data: read the downloaded spreadsheet into a DataFrame
data = pd.read_excel('./Dry_Bean_Dataset.xlsx')
# Last expression of the cell: shows the first rows as the cell output
data.head()

Separate the features from the label. Split the data into train and test sets and **after that** normalize them:

In [None]:
# Shuffle all rows reproducibly (fixed random_state) and rebuild a 0..n-1 index,
# so that the positional train/test slicing done afterwards is a random split.
data = data.sample(frac=1,random_state=0).reset_index(drop=True) # random shuffle
data.head()   

In [95]:
# Positional split of the shuffled frame: the first 10000 rows are the training
# set, every remaining row is the test set.
train_data = data.iloc[:10000,:]
test_data = data.iloc[10000:,:]

In [None]:
# Sanity check: (rows, columns) of each split
print(train_data.shape)
print(test_data.shape)

In [None]:
# Normalize train and test dataset.
# The scaler is fitted on the training features only and then reused for the
# test features: both sets end up in the same standardized space and no
# statistic of the test set leaks into the preprocessing.
from sklearn import preprocessing

label_train = train_data['Class']
train_data = train_data.drop('Class', axis=1)
columns_name = train_data.columns
train_scaler = preprocessing.StandardScaler().fit(train_data)
train_data = pd.DataFrame(train_scaler.transform(train_data), columns=columns_name)
# Labels are attached positionally (.values). The rebuilt frame has a fresh
# 0..n-1 index, so assigning the Series itself would align on the old index.
train_data['Class'] = label_train.values

label_test = test_data['Class']
test_data = test_data.drop('Class', axis=1)
# Kept under its former name for any later cell that refers to it; it is the
# scaler fitted on the training data, not a second one fitted on the test set.
test_scaler = train_scaler
test_data = pd.DataFrame(test_scaler.transform(test_data), columns=columns_name)
# The test rows keep the index of the slice they came from (starting at 10000
# here), while the rebuilt frame starts at 0: index-aligned assignment would
# fill the whole column with NaN, hence .values.
test_data['Class'] = label_test.values

In [None]:
# Preview the training frame after scaling (features plus the 'Class' column)
train_data.head()

**Before feeding the data into the following algorithms, try to perform PCA, varying the number of PCs, and check what changes**

## K-Nearest Neighbors Classification 

Implement the KNN algorithm for classification.

In [150]:
from scipy.spatial.distance import euclidean

def distance(point_one, point_two):
    """Return the Euclidean distance between two points.

    Thin wrapper around ``scipy.spatial.distance.euclidean``; both arguments
    are forwarded unchanged.
    """
    return euclidean(point_one, point_two)

def get_neighbors(train_set, test_point, label_col, n_neighbors):
    """Return the ``n_neighbors`` training points closest to ``test_point``.

    Args:
        train_set: 2-D array of training points, one point per row.
        test_point: the query point, compared to each row with ``distance``.
        label_col: labels of the training points, in the same row order.
        n_neighbors: how many of the closest points to keep.

    Returns:
        A pair ``(points, labels)`` holding the selected rows of ``train_set``
        and the matching entries of ``label_col``, closest first.
    """
    distances = np.array([distance(row, test_point) for row in train_set])
    # Rank every training row by distance, then keep the leading indices only.
    closest = distances.argsort()[:n_neighbors]
    return train_set[closest, :], label_col[closest]

def predict(train_set, test_point, labels, n_neighbors):
    """Classify ``test_point`` by majority vote among its nearest neighbors.

    Args:
        train_set: 2-D array of training points, one point per row.
        test_point: the point to classify.
        labels: labels of the training points, in the same row order.
        n_neighbors: number of neighbors taking part in the vote.

    Returns:
        The most frequent label among the neighbors. ``np.unique`` returns the
        candidate labels sorted, and ``argmax`` keeps the first maximum, so a
        tie goes to the label that sorts first.
    """
    _, neighbor_labels = get_neighbors(train_set, test_point, labels, n_neighbors)
    candidates, votes = np.unique(neighbor_labels, return_counts=True)
    return candidates[votes.argmax()]

def evaluate(train_set, test_set, label, n_neighbors=2):
    """Return the accuracy of the KNN classifier on ``test_set``.

    Args:
        train_set: DataFrame with the training features and the ``label`` column.
        test_set: DataFrame with the test features and the ``label`` column.
        label: name of the label column in both frames.
        n_neighbors: number of neighbors used for every prediction.

    Returns:
        Fraction of test rows whose predicted label equals the true label.
    """
    train_labels = train_set[label].values
    train_features = train_set.drop(label, axis=1).values
    test_labels = test_set[label].values
    test_features = test_set.drop(label, axis=1)

    n_correct = 0
    n_wrong = 0
    for position, expected in enumerate(test_labels):
        query = test_features.iloc[position].values
        guess = predict(train_features, query, train_labels, n_neighbors)
        if guess == expected:
            n_correct += 1
        else:
            n_wrong += 1
    return n_correct / (n_correct + n_wrong)

In [None]:
# Accuracy of KNN on the test split; n_neighbors is left at evaluate's default
knn_accuracy = evaluate(train_data, test_data, 'Class')

## Decision Trees with Numerical Features 

Modify the implementation of decision trees to account for numerical input features.

In [100]:
# compute H(S)
def entropy(train_data, label, class_list):
    """Return the Shannon entropy (in bits) of the ``label`` column.

    Args:
        train_data: DataFrame containing the ``label`` column.
        label: name of the label column.
        class_list: iterable of class values to account for.

    Returns:
        Sum of ``-p * log2(p)`` over the classes of ``class_list`` that occur
        in ``train_data``; classes with no rows contribute nothing.
    """
    n_rows = train_data.shape[0]
    labels = train_data[label]
    total = 0
    for cls in class_list:
        n_cls = int((labels == cls).sum())
        if n_cls <= 0:
            continue  # absent class: skipped so that log2(0) is never evaluated
        share = n_cls / n_rows
        total += -share * np.log2(share)
    return total

In [101]:
# compute H(S_j)
def feature_entropy(left_data, right_data, label, class_list):
    """Return the size-weighted entropy of the two sides of a split.

    Args:
        left_data: DataFrame with the rows on the left side of the split.
        right_data: DataFrame with the rows on the right side of the split.
        label: name of the label column.
        class_list: iterable of class values passed on to ``entropy``.

    Returns:
        ``p_left * H(left) + p_right * H(right)``, where each weight is the
        share of rows falling on that side.
    """
    n_left, n_right = left_data.shape[0], right_data.shape[0]
    n_total = n_left + n_right
    weight_left = n_left / n_total
    weight_right = n_right / n_total
    left_term = weight_left * entropy(left_data, label, class_list)
    right_term = weight_right * entropy(right_data, label, class_list)
    return left_term + right_term

In [102]:
def split(feature_column, threshold):
    """Return the positions falling on each side of ``threshold``.

    Args:
        feature_column: 1-D array with the values of one feature.
        threshold: value the entries are compared against.

    Returns:
        A pair ``(left_rows, right_rows)`` of index arrays: positions whose
        value is ``<= threshold`` and positions whose value is ``> threshold``.
    """
    goes_left = feature_column <= threshold
    goes_right = feature_column > threshold
    return np.argwhere(goes_left).flatten(), np.argwhere(goes_right).flatten()

In [103]:
def information_gain(data, feature_name, label, class_list, threshold):
    """Score a threshold split by the label entropy that remains after it.

    Despite its name, the returned value is the size-weighted entropy of the
    two sides of the split (see ``feature_entropy``), so lower is better.

    Args:
        data: DataFrame with the feature columns and the ``label`` column.
        feature_name: column on which the split is made.
        label: name of the label column.
        class_list: iterable of class values to account for.
        threshold: rows with ``feature <= threshold`` go left, the others right.

    Returns:
        The weighted entropy after the split. When one side is empty the
        entropy of the unsplit data is returned.
    """
    left_rows, right_rows = split(data[feature_name].values, threshold)
    if len(left_rows) == 0 or len(right_rows) == 0:
        # A split that separates nothing leaves the entropy unchanged. Scoring
        # it 0 would make it look like a perfect split to a caller that keeps
        # the smallest score.
        return entropy(data, label, class_list)
    return feature_entropy(data.iloc[left_rows], data.iloc[right_rows], label, class_list)

In [104]:
def get_split_thresholds(feature_column, n_thresholds):
    """Return candidate split thresholds for one numerical feature.

    The values are sorted and cut into ``n_thresholds + 1`` chunks of nearly
    equal size; each threshold is the midpoint between the last value of a
    chunk and the first value of the next one.

    Args:
        feature_column: values of the feature (pandas Series or array-like).
        n_thresholds: number of thresholds requested.

    Returns:
        A list of thresholds. It holds fewer than ``n_thresholds`` entries when
        there are not enough values, the single value itself for a one-element
        column, and is empty for an empty column.
    """
    # Midpoints are only meaningful between neighbours in sorted order.
    values = np.sort(np.asarray(feature_column))
    n_values = len(values)
    if n_values == 0:
        return []
    if n_values == 1:
        return [values[0]]
    # Never ask for more chunks than values: empty chunks have no boundary.
    n_parts = min(n_thresholds + 1, n_values)
    chunks = np.array_split(values, n_parts)
    return [(lower[-1] + upper[0]) / 2 for lower, upper in zip(chunks, chunks[1:])]

In [105]:
def most_informative_feature(train_data, label, class_list, n_thresholds):
    """Return the (feature, threshold) pair with the lowest split score.

    Every column except ``label`` is tried with each threshold proposed by
    ``get_split_thresholds``; candidates are scored with ``information_gain``
    and the smallest score wins. On equal scores the first candidate met is
    kept.

    Args:
        train_data: DataFrame with the feature columns and the ``label`` column.
        label: name of the label column.
        class_list: iterable of class values, forwarded to the scoring function.
        n_thresholds: number of candidate thresholds per feature.

    Returns:
        ``(feature_name, threshold)``, or ``(None, None)`` when no candidate
        scored below the initial bound.
    """
    best_score, best_feature, best_threshold = 99999, None, None
    for feature in train_data.columns.drop(label):
        candidates = get_split_thresholds(train_data[feature], n_thresholds)
        for candidate in candidates:
            score = information_gain(train_data, feature, label, class_list, candidate)
            if score < best_score:
                best_score, best_feature, best_threshold = score, feature, candidate
    return best_feature, best_threshold

In [106]:
def is_leaf(train_data, label):
    """Tell whether every row of ``train_data`` carries the same label.

    Args:
        train_data: DataFrame containing the ``label`` column.
        label: name of the label column.

    Returns:
        ``True`` when exactly one distinct label value is present.
    """
    distinct_labels = np.unique(train_data[label])
    return len(distinct_labels) == 1

In [107]:
def leaf_class(train_data, label):
    """Return the most frequent label in ``train_data``.

    Args:
        train_data: DataFrame containing the ``label`` column.
        label: name of the label column.

    Returns:
        The majority label. ``np.unique`` returns the labels sorted and
        ``argmax`` keeps the first maximum, so a tie goes to the label that
        sorts first.
    """
    labels, frequencies = np.unique(train_data[label], return_counts=True)
    return labels[frequencies.argmax()]

In [108]:
def make_tree(train_data, label, class_list, n_thresholds, cur_depth, min_samples, max_depth):
    """Recursively grow a binary decision tree on numerical features.

    Args:
        train_data: DataFrame with the rows reaching this node (features plus
            the ``label`` column).
        label: name of the label column.
        class_list: iterable of class values, forwarded to the split search.
        n_thresholds: number of candidate thresholds tried per feature.
        cur_depth: depth of this node (the root is built with 0).
        min_samples: a node with at most this many rows becomes a leaf.
        max_depth: a node at this depth or deeper becomes a leaf.

    Returns:
        Either a class label (leaf) or a one-key dict
        ``{"<feature> <= <threshold>": [left_subtree, right_subtree]}``.
    """
    # Purity is checked on the rows of this node, not on a module-level frame.
    if is_leaf(train_data, label) or cur_depth >= max_depth or len(train_data) <= min_samples:
        return leaf_class(train_data, label)

    split_feature, split_threshold = most_informative_feature(train_data, label, class_list, n_thresholds)
    if split_feature is None:
        # No candidate split was produced: nothing to branch on.
        return leaf_class(train_data, label)

    left_rows, right_rows = split(train_data[split_feature].values, split_threshold)
    if len(left_rows) == 0 or len(right_rows) == 0:
        # The best split separates nothing, so recursing would never end.
        return leaf_class(train_data, label)

    next_depth = cur_depth + 1
    left_branch = make_tree(train_data.iloc[left_rows], label, class_list, n_thresholds, next_depth, min_samples, max_depth)
    right_branch = make_tree(train_data.iloc[right_rows], label, class_list, n_thresholds, next_depth, min_samples, max_depth)
    if left_branch == right_branch:
        # Both sides give the same answer: the question is useless, collapse it.
        return left_branch

    split_condition = "{} <= {}".format(split_feature, split_threshold)
    return {split_condition: [left_branch, right_branch]}

In [109]:
# id3 call
def id3(train_data_m, label, n_thresholds=1, min_samples=4, max_depth=6):
    """Build a decision tree from ``train_data_m`` and return it.

    Args:
        train_data_m: DataFrame with the features and the ``label`` column;
            the tree is grown on a copy of it.
        label: name of the label column.
        n_thresholds: number of candidate thresholds tried per feature.
        min_samples: a node with at most this many rows becomes a leaf.
        max_depth: maximum depth of the tree.

    Returns:
        The tree produced by ``make_tree``, grown from depth 0.
    """
    working_copy = train_data_m.copy()
    classes = working_copy[label].unique()
    return make_tree(working_copy, label, classes, n_thresholds, 0, min_samples, max_depth)

In [None]:
# Grow a tree on the training set with id3's default hyper-parameters and
# print the resulting nested structure.
t = id3(train_data, 'Class')
print(t)

In [122]:
def predict(test_point, tree):
    """Return the class that ``tree`` assigns to ``test_point``.

    Args:
        test_point: mapping from feature name to value (e.g. a DataFrame row).
        tree: a class label, or a one-key dict
            ``{"<feature> <= <threshold>": [left_subtree, right_subtree]}``.

    Returns:
        The label stored in the leaf reached by answering each question.
    """
    node = tree
    # Walk down until something that is not a question dict (a leaf) is hit.
    while isinstance(node, dict):
        question = list(node.keys())[0]
        attribute, value = question.split(" <= ")
        branch = 0 if test_point[attribute] <= float(value) else 1
        node = node[question][branch]
    return node

def evaluate(tree, test_data, label):
    """Return the accuracy of ``tree`` on ``test_data``.

    Args:
        tree: decision tree in the format accepted by ``predict``.
        test_data: DataFrame with the features and the ``label`` column.
        label: name of the label column.

    Returns:
        Fraction of rows whose predicted class equals the value in ``label``.
    """
    expected = test_data[label]
    hits = 0
    misses = 0
    for position in range(len(test_data.index)):
        guess = predict(test_data.iloc[position], tree)
        if guess == expected.iloc[position]:
            hits += 1
        else:
            misses += 1
    return hits / (hits + misses)

## Gaussian Naive Bayes 
Modify the implementation of naive Bayes to account for numerical input features. The likelihood of the data given each class ($p(data|class)$) is assumed to be a Gaussian $\frac{1}{\sqrt{2 \pi \sigma^2}} \exp \left(-\frac{(x-\mu)^2}{2\sigma^2}\right)$, where $\mu, \sigma^2$ are the mean and the variance for each class;

In [165]:
def prior(train_data, label):
    """Return the log prior probability of each class.

    Args:
        train_data: DataFrame containing the ``label`` column.
        label: name of the label column.

    Returns:
        1-D array with ``log(class_count / n_rows)``, one entry per class, in
        the sorted class order produced by ``groupby``.
    """
    # size() counts the rows of each group directly, without running a Python
    # callback on every group frame.
    class_counts = train_data.groupby(by=label).size()
    priors = class_counts / len(train_data)
    return np.log(priors).values

def mean_variance(train_data, label):
    """Return the per-class mean and variance of every feature.

    Args:
        train_data: DataFrame with the feature columns and the ``label`` column.
        label: name of the label column.

    Returns:
        A pair ``(mean, variance)`` of 2-D arrays with one row per class (in
        the sorted class order produced by ``groupby``) and one column per
        feature, in the column order of ``train_data`` without ``label``.
        The variance is the sample variance (pandas default, ``ddof=1``).
    """
    # The label column is taken out before aggregating, so the statistics are
    # computed on features only and never on the (possibly string) labels.
    features = train_data.drop(columns=[label])
    grouped = features.groupby(train_data[label].values)
    mean = grouped.mean()
    variance = grouped.var()
    return (mean.values, variance.values)

def gaussian_density(mean, variance, point):
    """Return the Gaussian probability density evaluated at ``point``.

    Args:
        mean: mean of the distribution.
        variance: variance of the distribution.
        point: value at which the density is evaluated.

    Returns:
        ``1 / sqrt(2*pi*variance) * exp(-(point - mean)**2 / (2*variance))``.
    """
    normalizer = 1 / np.sqrt(2*np.pi*variance)
    exponent = (-(point - mean)**2) / (2*variance)
    return normalizer * np.exp(exponent)

def train_gaussian_naive_bayes(train_data, label):
    """Fit a Gaussian naive Bayes model and return its parameters.

    Args:
        train_data: DataFrame with the feature columns and the ``label`` column.
        label: name of the label column.

    Returns:
        A dict with the keys ``'n_labels'`` and ``'n_classes'`` (number of
        classes), ``'unique_labels'`` (class names), ``'mean'`` and
        ``'variance'`` (per-class statistics from ``mean_variance``) and
        ``'prior'`` (log priors from ``prior``). Entry ``i`` of every
        per-class item refers to ``unique_labels[i]``.
    """
    mean, variance = mean_variance(train_data, label)
    priors = prior(train_data, label)
    # The statistics above come from groupby, which orders classes by sorted
    # key. The class names must follow that same order: unique() alone yields
    # order of first appearance, and indices would then name the wrong class.
    unique_labels = np.array(sorted(train_data[label].dropna().unique()))
    n_labels = len(unique_labels)
    return {'n_labels': n_labels, 'unique_labels': unique_labels, 'n_classes': n_labels, 'mean': mean,
            'variance': variance, 'prior': priors}

In [None]:
# Fit the Gaussian naive Bayes parameters on the training set
gaus_bayes = train_gaussian_naive_bayes(train_data, 'Class')

In [167]:
def posterior(point, mean, variance, class_list, n_classes, n_feat):
    """Return the Gaussian log-likelihood of ``point`` under every class.

    The prior is not included: the caller adds the log prior to these values.

    Args:
        point: feature values of one sample (array-like or pandas Series);
            only the first ``n_feat`` entries are used, by position.
        mean: per-class feature means, indexed ``[class][feature]``.
        variance: per-class feature variances, indexed ``[class][feature]``.
        class_list: class names (not used in the computation).
        n_classes: number of classes.
        n_feat: number of features.

    Returns:
        A list with one value per class: the sum over the features of the log
        of the Gaussian density.
    """
    # Positional numeric view of the sample, whatever its index labels are.
    x = np.asarray(point, dtype=float)[:n_feat]
    posteriors = []
    for i in range(n_classes):
        mu = np.asarray(mean[i], dtype=float)[:n_feat]
        var = np.asarray(variance[i], dtype=float)[:n_feat]
        # Log-density written out analytically: taking log() of the density
        # itself yields -inf as soon as the density underflows to 0 for a
        # point far from the class mean.
        log_density = -0.5 * np.log(2 * np.pi * var) - (x - mu) ** 2 / (2 * var)
        posteriors.append(log_density.sum())
    return posteriors

def predict(test_data, label, gaus_bayes):
    """Predict the class of every row of ``test_data`` with Gaussian naive Bayes.

    Args:
        test_data: DataFrame with the feature columns; the ``label`` column may
            be present at any position, or absent.
        label: name of the label column, which is excluded from the features.
        gaus_bayes: model dict as returned by ``train_gaussian_naive_bayes``.

    Returns:
        A list with the predicted class name of each row, in row order.
    """
    # Features are selected by name rather than by assuming that the label is
    # the last column.
    # NOTE(review): the remaining columns are assumed to be in the same order
    # as the columns behind the model's mean/variance arrays - confirm.
    features = test_data.drop(columns=[label], errors="ignore")
    n_feat = features.shape[1]
    log_prior = gaus_bayes['prior']
    class_names = gaus_bayes['unique_labels']

    predictions = []
    for row in features.values:
        log_likelihood = posterior(row, gaus_bayes['mean'], gaus_bayes['variance'],
                                   class_names, gaus_bayes['n_classes'], n_feat)
        # Log prior + log likelihood: the largest sum is the most probable class.
        best = np.argmax(log_prior + log_likelihood)
        predictions.append(class_names[best])
    return predictions

def evaluate(test_data, label, gaus_bayes):
    """Return the accuracy of the Gaussian naive Bayes model on ``test_data``.

    Args:
        test_data: DataFrame with the features and the ``label`` column.
        label: name of the label column.
        gaus_bayes: model dict passed on to ``predict``.

    Returns:
        Fraction of rows whose predicted class equals the value in ``label``.
    """
    guesses = predict(test_data, label, gaus_bayes)
    expected = test_data[label]
    hits = 0
    misses = 0
    for position in range(len(test_data.index)):
        if guesses[position] == expected.iloc[position]:
            hits += 1
        else:
            misses += 1
    return hits / (hits + misses)