In [1]:
import numpy as np
import math

In [2]:
class Multiclass_Gaussian_Naive_Bayes():
    """Gaussian naive Bayes classifier for training data grouped by class.

    Every feature is modelled as an independent Gaussian per class. ``fit``
    expects the rows of ``X`` to be ordered so that the first ``counts[0]``
    rows belong to class 0, the next ``counts[1]`` rows to class 1, and so on.
    Predicted labels are class indices in ``[0, n_classes)``.
    """

    def __init__(self, n_features, n_classes, var_smoothing=1e-9):
        """Allocate per-class statistics.

        Args:
            n_features: Number of columns in the training matrix.
            n_classes: Number of classes (length of ``counts`` passed to ``fit``).
            var_smoothing: Constant added to every per-feature variance so a
                constant feature does not produce a division by zero.
        """
        self.n_classes = n_classes
        self.n_features = n_features
        self.var_smoothing = var_smoothing
        self.mean = np.zeros((n_classes, n_features))
        self.variance = np.zeros((n_classes, n_features))
        # Uniform (all-zero log) prior until fit() estimates it from counts.
        self.log_prior = np.zeros(n_classes)

    def fit(self, X, counts):
        """Estimate per-class means, variances and priors.

        Args:
            X: Array of shape (n_samples, n_features), rows grouped by class.
            counts: Number of rows belonging to each class, in class order.

        Raises:
            ValueError: If ``counts`` does not have one positive entry per
                class or claims more rows than ``X`` contains.
        """
        X = np.asarray(X, dtype=float)
        counts = np.asarray(counts, dtype=int).ravel()
        if counts.size != self.n_classes:
            raise ValueError(
                f'expected {self.n_classes} class counts, got {counts.size}')
        if np.any(counts <= 0) or counts.sum() > X.shape[0]:
            raise ValueError('counts must be positive and sum to at most X.shape[0]')

        # offsets[i]:offsets[i + 1] is the row range of class i.
        offsets = np.concatenate(([0], np.cumsum(counts)))
        for i in range(self.n_classes):
            curr_X = X[offsets[i]:offsets[i + 1]]
            self.mean[i] = curr_X.mean(axis=0)
            # Population variance (ddof=0) plus a floor for constant features.
            self.variance[i] = curr_X.var(axis=0) + self.var_smoothing
        self.log_prior = np.log(counts / counts.sum())

    def gaussian(self, X, mu, var):
        """Return the element-wise Gaussian density of ``X`` for ``mu``/``var``.

        Kept for callers that want raw densities; ``predict`` works in log
        space instead because this product form underflows to 0 for points far
        from the mean.
        """
        return 1 / ((2 * np.pi) ** (1 / 2) * var ** 0.5) * np.exp(-0.5 * (X - mu) ** 2 / var)

    def _joint_log_likelihood(self, x_test):
        """Return an (n_samples, n_classes) array of log prior + log likelihood."""
        x_test = np.asarray(x_test, dtype=float)
        scores = np.empty((x_test.shape[0], self.n_classes))
        # Loop over classes (few) rather than samples (many); broadcasting over
        # all three axes at once would need n_samples * n_classes * n_features floats.
        for i in range(self.n_classes):
            var = self.variance[i]
            log_pdf = -0.5 * (np.log(2.0 * np.pi * var) + (x_test - self.mean[i]) ** 2 / var)
            scores[:, i] = self.log_prior[i] + log_pdf.sum(axis=1)
        return scores

    def predict(self, x_test):
        """Return the most probable class index for every row of ``x_test``.

        Args:
            x_test: Array of shape (n_samples, n_features).

        Returns:
            Float array of shape (n_samples,) holding class indices; ties go
            to the lowest class index.
        """
        return np.argmax(self._joint_log_likelihood(x_test), axis=1).astype(float)

    def accuracy(self, pred, actual):
        """Return the fraction of entries of ``pred`` that equal ``actual``.

        Both inputs are flattened first, so label arrays of shape (n,) and
        (n, 1) are accepted.

        Raises:
            ValueError: If the flattened inputs differ in length.
        """
        pred = np.ravel(pred)
        actual = np.ravel(actual)
        if pred.shape != actual.shape:
            raise ValueError('pred and actual must contain the same number of labels')
        return np.count_nonzero(pred == actual) / actual.shape[0]

In [3]:
from scipy.special import softmax

class Multiclass_Logistic_Regression():
    """Softmax (multinomial logistic) regression trained by batch gradient descent.

    A bias column of ones is prepended to the inputs inside ``fit`` and
    ``predict``, so ``param`` has shape (n_classes, n_features + 1).
    """

    def __init__(self, learning_rate, n_classes):
        """Store hyper-parameters; weights are created by ``fit``.

        Args:
            learning_rate: Step size applied to the summed gradient.
            n_classes: Number of distinct labels; labels must be 0..n_classes-1.
        """
        self.param = None
        self.lr = learning_rate
        # Cross-entropy per iteration of the most recent fit() call.
        self.training_errors = []
        # Added inside log() so a saturated probability of 0 does not give -inf.
        self.eps = 1e-7
        self.n_classes = n_classes

    def initialize_parameters(self, X):
        """Set ``param`` to an all-ones matrix sized for the columns of ``X``."""
        n_features = np.shape(X)[1]
        self.param = np.ones((self.n_classes, n_features))

    def one_hot(self, y):
        """Return the (n_samples, n_classes) one-hot encoding of labels ``y``.

        ``y`` is flattened and cast to int, so (n,), (n, 1) and float-typed
        label arrays are all accepted.
        """
        labels = np.asarray(y).reshape(-1).astype(int)
        return np.eye(self.n_classes)[labels]

    def fit(self, X, y, n_iterations=1000):
        """Train the weights with ``n_iterations`` full-batch gradient steps.

        Args:
            X: Array of shape (n_samples, n_features).
            y: Integer class labels, one per row of ``X``.
            n_iterations: Number of gradient-descent updates.

        Side effects:
            Re-initialises ``param`` and replaces ``training_errors`` with the
            mean per-sample cross-entropy measured before each update.
        """
        X = np.insert(np.asarray(X, dtype=float), 0, 1, axis=1)
        y = self.one_hot(y)
        self.initialize_parameters(X)
        self.training_errors = []
        for _ in range(n_iterations):
            y_pred = softmax(np.dot(X, self.param.T), axis=1)
            # Cross-entropy: sum over classes, then average over samples.
            loss = -np.mean(np.sum(y * np.log(y_pred + self.eps), axis=1))
            self.training_errors.append(loss)
            # NOTE(review): the gradient is summed over samples, not averaged,
            # so the effective step grows with the training-set size. Kept
            # as-is so existing learning rates keep their meaning.
            grad = np.dot((y_pred - y).T, X)
            self.param = self.param - self.lr * grad

    def predict(self, X):
        """Return the predicted class index for every row of ``X``.

        Softmax is monotonic, so the argmax is taken on the raw scores.
        """
        X = np.insert(np.asarray(X, dtype=float), 0, 1, axis=1)
        return np.argmax(np.dot(X, self.param.T), axis=1)

    def accuracy(self, pred, actual):
        """Return the fraction of entries of ``pred`` that equal ``actual``.

        Both inputs are flattened first, so label arrays of shape (n,) and
        (n, 1) are accepted.

        Raises:
            ValueError: If the flattened inputs differ in length.
        """
        pred = np.ravel(pred)
        actual = np.ravel(actual)
        if pred.shape != actual.shape:
            raise ValueError('pred and actual must contain the same number of labels')
        return np.count_nonzero(pred == actual) / actual.shape[0]

In [4]:
class KNearestNeighbor():
    """k-nearest-neighbour classifier using Euclidean distance and majority vote."""

    def __init__(self, x_train, y_train, K):
        """Store the training set.

        Args:
            x_train: Array of shape (n_train, n_features).
            y_train: Labels for the rows of ``x_train``; (n_train,) or (n_train, 1).
            K: Number of neighbours that vote on each prediction.
        """
        self.k = K
        # Stored as float64: subtracting and squaring unsigned-integer arrays
        # (e.g. uint8 pixels) wraps around and yields wrong distances.
        self.X = np.asarray(x_train, dtype=np.float64)
        self.y = np.asarray(y_train)

    def get_Euclidean_distance(self, x):
        """Return the Euclidean distance from ``x`` to every training row."""
        # asarray is a no-op for float64 input; it only guards against
        # self.X having been replaced with an integer array after __init__.
        train = np.asarray(self.X, dtype=np.float64)
        diff = train - np.asarray(x, dtype=np.float64)
        return np.sqrt(np.sum(diff ** 2, axis=1))

    def predict(self, x_test):
        """Predict a label for every row of ``x_test``.

        Returns:
            Float array of shape (n_test,). When several labels are equally
            common among the neighbours, the smallest label wins (``np.unique``
            returns labels sorted, ``argmax`` takes the first maximum).
        """
        x_test = np.asarray(x_test)
        labels = np.ravel(self.y)
        pred = np.zeros(x_test.shape[0])
        for i in range(x_test.shape[0]):
            dist = self.get_Euclidean_distance(x_test[i])
            # Stable sort: equidistant neighbours keep training-set order.
            nearest_neighbors = np.argsort(dist, kind='stable')[0:self.k]
            unique, counts = np.unique(labels[nearest_neighbors], return_counts=True)
            pred[i] = unique[np.argmax(counts)]
        return pred

    def accuracy(self, pred, actual):
        """Return the fraction of entries of ``pred`` that equal ``actual``.

        Both inputs are flattened first, so label arrays of shape (n,) and
        (n, 1) are accepted.

        Raises:
            ValueError: If the flattened inputs differ in length.
        """
        pred = np.ravel(pred)
        actual = np.ravel(actual)
        if pred.shape != actual.shape:
            raise ValueError('pred and actual must contain the same number of labels')
        return np.count_nonzero(pred == actual) / actual.shape[0]

In [5]:
class Multi_LDA():
    """Fisher linear discriminant analysis with a nearest-centroid classifier.

    ``fit`` builds the within-class (``S_w``) and between-class (``S_b``)
    scatter matrices from training rows grouped by class. ``predict``
    projects onto the leading eigenvectors of ``pinv(S_w) @ S_b`` and assigns
    each sample to the class whose projected mean is closest.
    """

    def __init__(self, n_features, n_classes):
        """Allocate scatter matrices and mean vectors for the given sizes."""
        self.n_features = n_features
        self.n_classes = n_classes
        self.S_w = np.zeros((n_features, n_features))
        self.S_b = np.zeros((n_features, n_features))
        self.mu = np.zeros(n_features)
        self.mu_class = np.zeros((self.n_classes, self.n_features))

    def fit(self, X, counts):
        """Compute class means and scatter matrices.

        Args:
            X: Array of shape (n_samples, n_features) whose rows are grouped
                by class: the first ``counts[0]`` rows are class 0, and so on.
            counts: Number of rows per class, in class order.
        """
        X = np.asarray(X, dtype=float)
        counts = np.asarray(counts, dtype=int).ravel()

        # Start from zero so that calling fit() again does not accumulate
        # scatter on top of the previous fit.
        self.S_w = np.zeros((self.n_features, self.n_features))
        self.S_b = np.zeros((self.n_features, self.n_features))

        self.mu = np.mean(X, axis=0)
        # offsets[i]:offsets[i + 1] is the row range of class i.
        offsets = np.concatenate(([0], np.cumsum(counts)))
        for i in range(self.n_classes):
            curr_X = X[offsets[i]:offsets[i + 1]]
            self.mu_class[i] = np.mean(curr_X, axis=0)
            centered = curr_X - self.mu_class[i]
            self.S_w += np.dot(centered.T, centered)
            # Outer product: np.dot on two 1-D vectors would collapse to a
            # scalar and be broadcast into every cell of S_b.
            shift = self.mu_class[i] - self.mu
            self.S_b += counts[i] * np.outer(shift, shift)

    def _projection_matrix(self, n_components):
        """Return the (n_features, n_components) matrix of leading discriminants."""
        # pinv instead of inv so a singular S_w does not raise.
        V = np.dot(np.linalg.pinv(self.S_w), self.S_b)
        eigenvalues, eigenvectors = np.linalg.eig(V)
        # eig returns eigenvectors as columns and may return complex dtype for
        # a non-symmetric matrix; rank by real part and keep the real part.
        order = np.argsort(-eigenvalues.real, kind='stable')[:n_components]
        return eigenvectors[:, order].real

    def predict(self, x_test, n_components):
        """Classify rows of ``x_test`` by nearest projected class mean.

        Args:
            x_test: Array of shape (n_samples, n_features).
            n_components: Number of leading discriminant directions to keep.

        Returns:
            Float array of shape (n_samples,) with class indices; ties go to
            the lowest class index.
        """
        W = self._projection_matrix(n_components)
        projected_X = np.dot(np.asarray(x_test, dtype=float), W)
        projected_mu = np.dot(self.mu_class, W)
        # (n_samples, n_classes) distances in the projected space.
        dists = np.linalg.norm(
            projected_X[:, None, :] - projected_mu[None, :, :], axis=2)
        return np.argmin(dists, axis=1).astype(float)

    def accuracy(self, pred, actual):
        """Return the fraction of entries of ``pred`` that equal ``actual``.

        Both inputs are flattened first, so label arrays of shape (n,) and
        (n, 1) are accepted.

        Raises:
            ValueError: If the flattened inputs differ in length.
        """
        pred = np.ravel(pred)
        actual = np.ravel(actual)
        if pred.shape != actual.shape:
            raise ValueError('pred and actual must contain the same number of labels')
        return np.count_nonzero(pred == actual) / actual.shape[0]
    

In [15]:
class GaussianMLE():
    """Classifier that fits one full-covariance Gaussian per class.

    Each sample is assigned to the class whose Gaussian gives it the highest
    log-density (maximum likelihood; no class priors are applied).
    """

    def __init__(self, n_classes):
        """Store the class count and the covariance regularisation strength."""
        self.n_classes = n_classes
        # Ridge added to every covariance diagonal so the matrix stays invertible.
        # NOTE(review): may need to be larger when n_features exceeds the
        # number of samples in a class - confirm on the real data.
        self.eps = 1e-7

    def fit(self, X, counts):
        """Estimate the mean vector and covariance matrix of every class.

        Args:
            X: Array of shape (n_samples, n_features) whose rows are grouped
                by class: the first ``counts[0]`` rows are class 0, and so on.
            counts: Number of rows per class, in class order.
        """
        X = np.asarray(X, dtype=float)
        counts = np.asarray(counts, dtype=int).ravel()
        n_features = X.shape[1]
        self.mu = np.zeros((self.n_classes, n_features))
        self.cov = np.zeros((self.n_classes, n_features, n_features))

        ridge = self.eps * np.eye(n_features)
        # offsets[i]:offsets[i + 1] is the row range of class i.
        offsets = np.concatenate(([0], np.cumsum(counts)))
        for i in range(len(counts)):
            curr_X = X[offsets[i]:offsets[i + 1]]
            self.mu[i] = np.mean(curr_X, axis=0)
            centered = curr_X - self.mu[i]
            # Maximum-likelihood covariance: scatter divided by the sample
            # count. Without the division, larger classes get inflated
            # covariances and the class scores are not comparable.
            self.cov[i] = np.dot(centered.T, centered) / curr_X.shape[0] + ridge

    def _log_density(self, X, mu, cov):
        """Return the Gaussian log-density of every row of the 2-D array ``X``."""
        diff = np.asarray(X, dtype=float) - mu
        _, log_det = np.linalg.slogdet(cov)
        # Solve once for all rows instead of forming the inverse explicitly.
        solved = np.linalg.solve(cov, diff.T)
        mahalanobis = np.sum(diff.T * solved, axis=0)
        n_features = diff.shape[1]
        return -0.5 * (log_det + mahalanobis + n_features * np.log(2.0 * np.pi))

    def log_likelihood(self, X, mu, cov):
        """Return the log-density of a single sample under N(mu, cov).

        Args:
            X: 1-D feature vector.
            mu: 1-D mean vector.
            cov: Square covariance matrix.

        Returns:
            ``-0.5 * (log|cov| + (X-mu)^T cov^-1 (X-mu) + d*log(2*pi))``.
        """
        return self._log_density(np.atleast_2d(X), mu, cov)[0]

    def predict(self, x_test):
        """Return the maximum-likelihood class index for every row of ``x_test``.

        Returns:
            Float array of shape (n_samples,); ties go to the lowest class index.
        """
        x_test = np.asarray(x_test, dtype=float)
        scores = np.empty((x_test.shape[0], self.n_classes))
        # One linear solve per class, shared by all test samples.
        for j in range(self.n_classes):
            scores[:, j] = self._log_density(x_test, self.mu[j], self.cov[j])
        return np.argmax(scores, axis=1).astype(float)

    def accuracy(self, pred, actual):
        """Return the fraction of entries of ``pred`` that equal ``actual``.

        Both inputs are flattened first, so label arrays of shape (n,) and
        (n, 1) are accepted.

        Raises:
            ValueError: If the flattened inputs differ in length.
        """
        pred = np.ravel(pred)
        actual = np.ravel(actual)
        if pred.shape != actual.shape:
            raise ValueError('pred and actual must contain the same number of labels')
        return np.count_nonzero(pred == actual) / actual.shape[0]

In [16]:
class DataLoader():
    """Reads an ``.npz`` archive holding train/test/val image and label arrays."""

    def __init__(self, data_path):
        """Open the archive at ``data_path`` with ``np.load`` and keep it in ``data``."""
        self.data = np.load(data_path)

    def train_test_split(self):
        """Return the three splits with every image flattened to one row.

        Reads the keys ``<split>_images`` and ``<split>_labels`` for the
        splits ``train``, ``test`` and ``val``. Images are reshaped to
        (n_images, -1); labels are returned as stored.

        Returns:
            Tuple ``(x_train, y_train, x_test, y_test, x_val, y_val)``.
        """
        flat_images = {}
        for split in ('train', 'test', 'val'):
            images = self.data[split + '_images']
            flat_images[split] = images.reshape(images.shape[0], -1)

        return (
            flat_images['train'], self.data['train_labels'],
            flat_images['test'], self.data['test_labels'],
            flat_images['val'], self.data['val_labels'],
        )

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

if __name__ == '__main__':

    dataset = DataLoader('../Assignment 1/data/bloodmnist.npz')
    x_train,y_train,x_test,y_test,x_val,y_val = dataset.train_test_split()

    # np.unique returns the labels in ascending order, so counts[i] is the
    # number of training rows of class i once the rows are sorted by label.
    unique, counts = np.unique(y_train, return_counts=True)
    n_classes = unique.size
    # Every classifier below reports class *indices*, which only match the
    # labels when the labels are exactly 0..n_classes-1.
    if not np.array_equal(unique, np.arange(n_classes)):
        raise ValueError(f'labels must be 0..{n_classes - 1}, got {unique}')

    # Group the training rows by class. The stable sort keeps the original
    # row order inside each class; fit(X, counts) relies on this grouping.
    order = np.argsort(np.ravel(y_train), kind='stable')
    X = x_train[order].astype(float)
    y = np.ravel(y_train)[order].astype(int)

    # Reference baselines from scikit-learn (disabled):
    #
    # clf = GaussianNB()
    # clf.fit(X,y)
    # print(f'accuracy of Sklearn Gaussian Naive Bayes = {clf.score(x_test,y_test)}')
    #
    # clf = LinearDiscriminantAnalysis()
    # clf.fit(X,y)
    # print(f'accuracy of Sklearn LDA = {clf.score(x_test,y_test)}')
    #
    # clf = LogisticRegression()
    # clf.fit(X,y)
    # print(f'accuracy of Sklearn Logisitic regression = {clf.score(x_test,y_test)}')

    clf = GaussianMLE(n_classes=n_classes)
    clf.fit(X,counts)
    pred = clf.predict(x_test)
    accuracy = clf.accuracy(pred,y_test)
    print(f'accuracy of my Gaussian MLE = {accuracy}',pred)

    clf = Multiclass_Logistic_Regression(learning_rate = 0.001, n_classes=n_classes)
    clf.fit(X,y)
    pred = clf.predict(x_test)
    accuracy = clf.accuracy(pred,y_test)
    print(f'accuracy of my MultiClass Logistic Regression = {accuracy}',pred)

    clf = Multiclass_Gaussian_Naive_Bayes(x_train.shape[1],n_classes)
    clf.fit(X,counts)
    pred = clf.predict(x_test)
    accuracy = clf.accuracy(pred,y_test)
    print(f'accuracy of my MultiClass Gaussian Naive Bayes = {accuracy}',pred)

    # LDA has at most n_classes - 1 informative discriminant directions.
    n_components = n_classes - 1
    clf = Multi_LDA(n_classes=n_classes,n_features = x_train.shape[1])
    clf.fit(X,counts)
    pred = clf.predict(x_test,n_components = n_components)
    accuracy = clf.accuracy(pred,y_test)
    print(f'accuracy of my LDA with n_components = {n_components}  is {accuracy}',pred)

    clf = KNearestNeighbor(X,y,1)
    pred = clf.predict(x_test)
    accuracy = clf.accuracy(pred,y_test)
    print(f'accuracy of K nearest neighbor = {accuracy}')
    