# Naive Bayes algorithm

In [47]:
# import libraries
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import time
import jax.numpy as jnp
import jax
from random import seed
from random import randrange
from math import pi
from math import exp

Five main steps:

1. separate all the rows by class (divide the dataset in more datasets, one for each class);
2. summarize the dataset, namely compute mean, standard deviation and cardinality of each column of the dataset;
3. summarize the dataset by class, namely do the same as 2. but for each dataset created at 1.;
4. define the probability density function used to model the distribution of each column for each class (strong assumption);
5. compute the class probabilities by applying the Bayes theorem (simplified).




The main goal is to apply the formula:

$p(y_i|x_1,x_2,...,x_n) \propto p(x_1|y_i)p(x_2|y_i)...p(x_n|y_i)p(y_i)$

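which is the Bayes theorem simplified with two assumptions:
1. the variables ($x_j$) are conditionally independent given the class $y_i$, so that $p(x_1,x_2,...,x_n|y_i) = p(x_1|y_i)p(x_2|y_i)...p(x_n|y_i)$;
2. the denominator $p(x_1,x_2,...,x_n)$ does not depend on the class $y_i$, so it is a constant and we can remove it, since we care only about which class gives the maximum value, and not about the probability itself.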

## Separate by class

We can create a dictionary where each key is a class value and each value is a list of all the records with that class value; the structure is:

separated = {class_value1: [entry1, entry2,...], class_value2: [entry1, entry2,...],}

where class_valuei represents the i-th class value, and the list contains all entries of the dataset with that class value.

In [48]:
def separate_by_class(dataset):
  """
  Group the samples of a dataset by their class value.

  @param: dataset It is a list of lists, where each inner list is a sample;
          the class value of a sample is read from its last position.
  @return: a dictionary whose keys are the class values found in dataset and
           whose values are the lists of samples carrying that class value,
           in the order in which they appear in dataset.
  """

  grouped = {}

  for sample in dataset:
    # setdefault creates the empty list the first time a class value is met
    # and returns the existing list every other time
    grouped.setdefault(sample[-1], []).append(sample)

  return grouped

## Summarize dataset

In order to compute all the probabilities we need to make a prediction, we need to know mean, standard deviation and cardinality of each column:

In [49]:
# mean
def mean(numbers):
  '''It returns the arithmetic mean of a list of numbers, as a float division
  of their sum by their count.

  @param: numbers It is a list of numbers; an empty list makes the division
          raise ZeroDivisionError.'''

  total = sum(numbers)
  count = float(len(numbers))
  return total / count

In [50]:
# standard deviation
def stdev(numbers):
  '''It returns the sample standard deviation of a list of numbers.

  The sum of the squared deviations from the mean is divided by
  len(numbers)-1, so a list with a single element raises ZeroDivisionError.

  @param: numbers It is a list of numbers.'''

  centre = mean(numbers)
  squared_deviations = [(value - centre) ** 2 for value in numbers]
  degrees_of_freedom = float(len(numbers) - 1)
  return sqrt(sum(squared_deviations) / degrees_of_freedom)

In [51]:
# Calculate the mean, stdev and count for each column in a dataset
def summarize_dataset(dataset):
  """
  It returns a list of triples, one for each feature column of dataset; each
  triple contains (mean, std, len) of that column.

  The last column holds the class value: it is dropped before any statistic is
  computed, so no work is spent on it and class labels do not need to be numeric.

  @param: dataset It is a list of lists, where each inner list is a sample of the dataset.
  @return: a list of (mean, std, len) triples in column order; the list is
           empty when dataset is empty.
  """

  # through the asterisk and the zip function the rows are turned into columns
  columns = list(zip(*dataset))

  # leave the class column out up front, instead of summarizing it and
  # deleting the result afterwards
  feature_columns = columns[:-1]

  return [(mean(column), stdev(column), len(column)) for column in feature_columns]

## Summarize data by class

We require statistics from our training dataset organized by class:

In [52]:
def summarize_by_class(dataset):
  """
  It computes the statistics of the dataset separately for each class value.

  The dataset is first split with separate_by_class, then the rows of every
  class are passed to summarize_dataset.

  @param: dataset It is a list of lists, where each inner list is a sample.
  @return: a dictionary whose keys are the class values and whose values are
           the lists returned by summarize_dataset for the rows of that class
           (one (mean,std,len) triple per feature column).
  """

  return {
    class_value: summarize_dataset(rows)
    for class_value, rows in separate_by_class(dataset).items()
  }

## Probability Density Function

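Since calculating the probability of observing a certain real value given the label is difficult, we assume that the distribution is known: each feature is modelled, for each class, as a Gaussian whose mean and standard deviation are estimated from the training data.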

In [53]:
def calculate_probability(x, mean, stdev):
  """
  It evaluates the Gaussian probability density function at x, for the
  distribution with the given mean and standard deviation.

  @param: x It is the value at which the density is evaluated.
  @param: mean It is the mean of the distribution.
  @param: stdev It is the standard deviation of the distribution; a value of 0
          makes the divisions below raise ZeroDivisionError.
  """

  deviation = x - mean
  variance = stdev**2

  # exp(-(x-mean)^2 / (2*stdev^2))
  gaussian_kernel = exp(-(deviation**2 / (2 * variance)))

  # 1 / (sqrt(2*pi) * stdev)
  normalizer = sqrt(2 * pi) * stdev

  return (1 / normalizer) * gaussian_kernel

## Class Probabilities

Given a certain sample for which we want to compute the most likely label, we compute the value of:

$p(y_i|x_1,x_2,...,x_n) \propto p(x_1|y_i)p(x_2|y_i)...p(x_n|y_i)p(y_i)$

for each of the possible class values.

In [54]:
def calculate_class_probabilities(summaries, sample):
  """
  It computes, for each class value yi, the score p(yi)*p(x1|yi)*...*p(xn|yi),
  which is the value of p(yi|x1,x2,...xn) given by the Bayes formula under the
  independency assumption of the features, without the constant denominator.

  @param: summaries It is a dictionary whose keys are the class values and whose values
          are lists of triples, where each triple holds (mean, std, len) of a column.
  @param: sample It is one sample, one row of the original dataset on which we want to
          make a prediction; its i-th value is matched with the i-th triple.
  @return: a dictionary class value yi -> score of yi for the sample.
  """

  # every triple of a class stores the number of rows of that class, so the
  # count in the first triple gives the cardinality of the class
  rows_per_class = {label: triples[0][2] for label, triples in summaries.items()}
  total_rows = sum(rows_per_class.values())

  probabilities = {}

  for label, triples in summaries.items():

    # start from the prior p(yi) = n_samples_of_the_class / total_n_of_samples
    score = rows_per_class[label] / float(total_rows)

    # multiply by the likelihood p(xj|yi) of each feature column
    # (the count stored in the triple is not needed here)
    for column, (mu, sigma, _) in enumerate(triples):
      score *= calculate_probability(sample[column], mu, sigma)

    probabilities[label] = score

  return probabilities

## Make predictions

Select the class with highest probability to make a prediction:

In [55]:
# Predict the class for a given sample
def predict(summaries, sample):
  """
  It predicts the class of a sample, namely the class value yi with the
  highest score p(yi|x1,x2,...xn).

  @param: summaries It is the dictionary class value -> list of (mean, std, len)
          triples that is passed on to calculate_class_probabilities.
  @param: sample It is the row on which we want to make a prediction.
  @return: the class value with the highest score; when several classes share
           the highest score the first one met wins. None is returned when
           summaries holds no class at all.
  """

  # compute the score yi -> p(yi|x1,x2,...xn) for each class yi
  probabilities = calculate_class_probabilities(summaries, sample)

  # max replaces its current choice only on a strictly greater score, so ties
  # keep the first class; unlike a "best_label is None" sentinel it also works
  # when None itself is one of the class values
  return max(probabilities, key=probabilities.get, default=None)

## Algorithm

Compute a prediction for each sample in the test set; the training set is used only at the beginning to learn the distributions of all the $p(y_i)$ and $p(x_j|y_i)$.

In [56]:
# Naive Bayes Algorithm
def naive_bayes(training_set, test_set):
  """
  It runs the Naive Bayes algorithm: the statistics are learnt once on the
  training set and then used to predict the class of each row of the test set.

  @param: training_set It is the list of samples passed to summarize_by_class.
  @param: test_set It is the list of samples on which a prediction is made.
  @return: a list with one prediction per row of test_set, in the same order.
  """

  # all the statistics needed are computed on the training_set only
  class_statistics = summarize_by_class(training_set)

  # one prediction for each row of the test_set
  return [predict(class_statistics, row) for row in test_set]

# Preprocess data - Random under sampling

The two main approaches to randomly resampling an imbalanced dataset are to delete examples from the majority class, called undersampling, and to duplicate examples from the minority class, called oversampling.


*   Random oversampling duplicates examples from the minority class in the training dataset and can result in overfitting for some models.
*   Random undersampling deletes examples from the majority class and can result in losing information invaluable to a model.

Random oversampling involves randomly selecting examples from the minority class, with replacement, and adding them to the training dataset. Random undersampling involves randomly selecting examples from the majority class and deleting them from the training dataset.



In [57]:
def RUS(ratio, set, class_index=30, random_state=0):
  '''Random under sampling. It returns a list obtained by merging two lists: the
  whole fraud set followed by a random selection of the non-fraud set.
  The ratio parameter defines the ratio fraud/non-fraud.
  e.g.: ratio=1 and len(fraud)= 100 implies that len(new_not_fraud)=100.

  The selection is drawn with a private random generator, so numpy's global
  random state is left untouched; with the default random_state the selection
  is the one obtained by calling np.random.seed(0) before np.random.shuffle.

  @param: ratio It is the ratio between fraud and non-fraud data; it is defined as:
          ratio:=fraud_set/non-fraud_set. It must be greater than 0.
  @param: set It is the set to split (the name shadows the builtin, it is kept
          for compatibility with existing callers).
  @param: class_index It is the position of the class value inside each sample;
          a sample is a fraud when that value is equal to 1.
  @param: random_state It is the seed of the generator that shuffles the non-fraud set.
  @return: a new list with all the fraud samples followed by at most
           int(len(fraud_set) / ratio) non-fraud samples.
  @raise: ValueError if ratio is not greater than 0.
  '''

  # a null ratio cannot be divided by, a negative one would give a negative length
  if ratio <= 0:
    raise ValueError('ratio must be greater than 0')

  fraud_set = list()
  non_fraud_set = list()

  # for each sample in the set, append it to the list of its class
  for sample in set:
    if sample[class_index] == 1:
      fraud_set.append(sample)
    else:
      non_fraud_set.append(sample)

  # compute new length of non_fraud_set
  new_length_non_fraud_set = int(len(fraud_set) / ratio)

  # shuffle non_fraud_set with a local generator: same permutation as seeding
  # the global one, without resetting the random state of the caller
  rng = np.random.RandomState(random_state)
  rng.shuffle(non_fraud_set)

  # all the frauds, then the first new_length_non_fraud_set shuffled non-frauds
  return fraud_set + non_fraud_set[:new_length_non_fraud_set]

# Load the dataset and split it into training and test set

First step is the definition of the hyperparameters.

In [58]:
# share of the whole dataset that is held out as test set
fraction_validation = 0.3 # 30% test set, 70% training set

# fraud / non-fraud ratios requested when under-sampling a set
ratio1 = 1 # 50:50
ratio2 = 34 / 66 # 34:66
ratio3 = 1 / 3 # 25:75

# hyperparameter k
# NOTE(review): k is not referenced by the code visible in this notebook
# (the cross-validation cell sets its own n_folds) - confirm it is still needed
k = 10

Load the dataset and divide it:

In [59]:
# load the dataset and parse it to numpy
# normalize it
# TODO (the values are used as they are read, not normalized)
dataset = pd.read_csv('creditcard.csv').to_numpy()

# shuffle the rows in place, with a fixed seed for reproducibility
np.random.seed(10)
np.random.shuffle(dataset)

# the first num_train rows form the training_set, the remaining ones the test_set
num_train = int(dataset.shape[0] * (1 - fraction_validation))
training_set, test_set = dataset[:num_train], dataset[num_train:]

# apply random under sampling to the training set, once for each ratio
training_set1, training_set2, training_set3 = [
  RUS(ratio, training_set) for ratio in (ratio1, ratio2, ratio3)
]

# apply random under sampling to the test set, once for each ratio
test_set1, test_set2, test_set3 = [
  RUS(ratio, test_set) for ratio in (ratio1, ratio2, ratio3)
]

# Apply the algorithm

In [60]:
# proportion 1 (50:50): learn on training_set1 and predict the rows of test_set1
predictions1 = naive_bayes(training_set=training_set1, test_set=test_set1)

In [61]:
# proportion 2 (34:66): learn on training_set2 and predict the rows of test_set2
predictions2 = naive_bayes(training_set=training_set2, test_set=test_set2)

In [62]:
# proportion 3 (25:75): learn on training_set3 and predict the rows of test_set3
predictions3 = naive_bayes(training_set=training_set3, test_set=test_set3)

# Evaluate algorithm through cross-validation

**Cross validation:**

Cross-validation, sometimes called rotation estimation or out-of-sample testing, is any of various similar model validation techniques for assessing how the results of a statistical analysis will generalize to an independent data set. Cross-validation is a resampling method that uses different portions of the data to test and train a model on different iterations. It is mainly used in settings where the goal is prediction, and one wants to estimate how accurately a predictive model will perform in practice.

**k-fold cross-validation:**

In k-fold cross-validation, the original sample is randomly partitioned into k equal sized subsamples. Of the k subsamples, a single subsample is retained as the validation data for testing the model, and the remaining k − 1 subsamples are used as training data. The cross-validation process is then repeated k times, with each of the k subsamples used exactly once as the validation data. The k results can then be averaged to produce a single estimation. k = 10 is usually used.

Create functions for evaluating the algorithm:

In [63]:
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
  '''It returns n_folds folds of equal size, each filled by drawing samples at
  random from the dataset without replacement.

  Each fold holds int(len(dataset) / n_folds) samples, so when the size of the
  dataset is not a multiple of n_folds the samples in excess are left out.

  @param: dataset It is the dataset to split; it can be a numpy array or a
          list of samples. It is not modified.
  @param: n_folds The number of folds to create; it must be at least 1.
  @return: a list of n_folds folds, where each fold is a list of samples and
           each sample is a list.
  @raise: ValueError if n_folds is lower than 1.
  '''

  if n_folds < 1:
    raise ValueError('n_folds must be at least 1')

  # work on a copy made of plain lists: arrays are converted with tolist(),
  # any other sequence of samples is copied row by row
  if hasattr(dataset, 'tolist'):
    dataset_copy = dataset.tolist()
  else:
    dataset_copy = [list(row) for row in dataset]

  # compute the number of elements in each fold
  fold_size = int(len(dataset) / n_folds)

  # the returned value will be a list of lists (folds)
  dataset_split = list()

  # build each fold by popping random samples from the copy of the dataset
  for _ in range(n_folds):
    fold = list()
    for _ in range(fold_size):
      index = randrange(len(dataset_copy))
      fold.append(dataset_copy.pop(index))
    dataset_split.append(fold)

  return dataset_split

In [64]:
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
  """
  It computes the accuracy, as a percentage, between the two provided lists.
  Accuracy := n_correct / n_actual * 100

  @param: actual It is the list with all the actual values; an empty list
          makes the division raise ZeroDivisionError.
  @param: predicted It is the list with all the predicted values; the i-th
          prediction is compared with the i-th actual value.
  """

  # count the positions at which the prediction matches the actual value
  hits = sum(
    1 for position, truth in enumerate(actual) if truth == predicted[position]
  )

  return hits / float(len(actual)) * 100.0

In [65]:
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
  '''It returns a list of accuracies, each of them computed on a certain fold
  used as test set while all the other folds are used as training set.

  @param: dataset It is the dataset on which evaluating the algorithm; it is
          split with cross_validation_split.
  @param: algorithm It is the algorithm to evaluate; it is called as
          algorithm(training_set, test_set, *args) and must return one
          prediction for each row of test_set.
  @param: n_folds It represents the number of folds on which divide the dataset.
  @param: args It contains the additional parameters forwarded to the algorithm.
  @return: a list with n_folds accuracies (percentages), one for each fold.
  '''

  # split the dataset into n_folds
  folds = cross_validation_split(dataset, n_folds)

  # this list contains the accuracy of the algorithm computed for each fold at a time
  scores = list()

  for held_out, fold in enumerate(folds):
    # training set: the rows of all the folds but the current one; the fold is
    # excluded by position and the rows are collected in a single pass
    training_set = [
      row
      for position, other_fold in enumerate(folds)
      if position != held_out
      for row in other_fold
    ]

    # test set: copies of the rows of the current fold without the class value,
    # so the algorithm cannot read the answer
    test_set = list()
    for row in fold:
      row_copy = list(row)
      row_copy[-1] = None
      test_set.append(row_copy)

    predicted = algorithm(training_set, test_set, *args)
    actual = [row[-1] for row in fold] # actual values are in the last column of the fold

    # compute the accuracy and append it to scores
    scores.append(accuracy_metric(actual, predicted))

  return scores

Evaluate the algorithm:

In [66]:
# number of folds of the cross validation
n_folds = 3

# under sample the whole dataset (fraud / non-fraud = 0.5) and convert the
# resulting list of rows to an array before evaluating on it
rus_dataset = np.asarray(RUS(0.5, dataset))

# NOTE(review): no seed(...) call for the random module is visible before this
# cell, so the folds (and the scores) may change from run to run - confirm
scores = evaluate_algorithm(rus_dataset, naive_bayes, n_folds)

# accuracy of each fold, then their average
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores) / float(len(scores))))

Scores: [93.08943089430895, 92.6829268292683, 93.4959349593496]
Mean Accuracy: 93.089%
