Implement the Naive Bayes and K-Nearest Neighbours algorithms from scratch in Python using Numpy and Pandas and Matplotlib for visualization.

The algorithm must be implemented as a function with arguments, x_train (the features) and y_train (the output).  Usage of any library that has an implementation is forbidden.



In [3]:
import numpy as np
import pandas as pd
from collections import Counter
from scipy.stats import norm

def naive_bayes(x_train, y_train):
    # Calculate the prior probabilities for each class
    priors = {}
    for c in np.unique(y_train):
        priors[c] = np.mean(y_train == c)
    
    # Calculate the mean and standard deviation for each feature for each class
    means = x_train.groupby(y_train).apply(np.mean)
    stds = x_train.groupby(y_train).apply(np.std)
    
    # Define a function to calculate the likelihood of a data point belonging to a class
    def likelihood(x, c):
        prob = 1
        for col in x_train.columns:
            prob *= norm.pdf(x[col], means[col][c], stds[col][c])
        return prob
    
    # Define a function to make predictions
    def predict(x):
        probs = {}
        for c in np.unique(y_train):
            probs[c] = priors[c] * likelihood(x, c)
        return max(probs, key=probs.get)
    
    return predict

def k_nearest_neighbors(x_train, y_train, k=3):
    # Define a function to calculate the distance between two data points
    def distance(x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2))
    
    # Define a function to make predictions
    def predict(x):
        distances = []
        for i in range(len(x_train)):
            d = distance(x, x_train.iloc[i])
            distances.append((d, y_train.iloc[i]))
        distances.sort(key=lambda x: x[0])
        neighbors = [d[1] for d in distances[:k]]
        return Counter(neighbors).most_common(1)[0][0]
    
    return predict


You can use these functions to make predictions on new data. For example:

In [4]:
# Load the data
data = pd.read_csv('test.csv')
x_train = data.drop('output', axis=1)
y_train = data['output']

# Train the models
nb_model = naive_bayes(x_train, y_train)
knn_model = k_nearest_neighbors(x_train, y_train)

# Make predictions on new data
x_new = pd.DataFrame({'feature1': [1], 'feature2': [2]})
nb_prediction = nb_model(x_new.iloc[0])
knn_prediction = knn_model(x_new.iloc[0])


KeyError: "['output'] not found in axis"