In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from scipy.special import entr

#Making a Pandas Dataframe from csv
# NOTE(review): read_csv treats the first line of the file as the header row by
# default. Later cells address columns by the labels '1' and '14', which look
# like values of a first data row rather than real column names -- confirm
# whether training.csv has a header or should be read with header=None.
original_dataset = pd.read_csv("training.csv")

In [62]:
# Work on a copy so the frame loaded from disk is never modified
dup_dataset = original_dataset.copy(deep=True)

# 80/20 split of the copy; the fixed random_state keeps the split reproducible
training_data, testing_data = train_test_split(
    dup_dataset,
    test_size=0.2,
    random_state=42,
)

# Distinct values found in the last column of the training split
possible_classes = training_data.iloc[:, -1].unique()

# Number of word columns: every column except two
# (presumably the id and class columns that reshape_train removes -- confirm)
no_words = len(training_data.columns) - 2

# Zero-initialised (classes x words) matrices: raw counts and beta-smoothed values
likely_mtrx = np.zeros((len(possible_classes), no_words))
beta_likely_mtrx = np.zeros_like(likely_mtrx)


In [63]:
#function for reshaping the training data
def reshape_train(training_data, drop_columns=('1', '14')):
    """Return ``training_data`` without the non-feature columns.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Frame whose columns include the labels listed in ``drop_columns``.
    drop_columns : iterable of column labels, optional
        Labels to leave out. Defaults to ``('1', '14')``, the two labels this
        notebook has always removed, so existing one-argument calls behave
        exactly as before. Labels that are not present are ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining columns in their original order;
        the input frame is not modified.
    """
    excluded = set(drop_columns)
    req_columns = [column for column in training_data.columns if column not in excluded]
    return training_data[req_columns]

In [64]:
# Training split with the '1' and '14' columns removed by reshape_train;
# the last expression displays its (rows, columns) shape.
req_training_data= reshape_train(training_data)
req_training_data.shape

(400, 61188)

In [65]:
#function for finding the priors
def priors(training_data, possible_classes, class_column='14'):
    """Estimate the prior probability of each class from label frequencies.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Frame containing a label column named ``class_column``.
    possible_classes : iterable
        Class labels to compute a prior for.
    class_column : column label, optional
        Name of the label column. Defaults to ``'14'``, the label this
        notebook has always used, so existing two-argument calls are unchanged.

    Returns
    -------
    dict
        Maps each entry of ``possible_classes`` to the fraction of rows whose
        label equals it. A class with no rows gets ``0.0``.

    Raises
    ------
    ValueError
        If ``training_data`` has no rows (the fractions would be undefined).
    """
    labels = training_data[class_column]
    n_rows = len(labels)
    if n_rows == 0:
        raise ValueError("cannot estimate class priors from an empty training set")

    # Count matching labels directly instead of building a filtered frame per class
    return {
        cls: int((labels == cls).sum()) / float(n_rows)
        for cls in possible_classes
    }

In [66]:
# Candidate beta (smoothing) values, from smallest to largest
beta_values = [
    1e-05, 5e-05,
    1e-04, 5e-04,
    1e-03, 5e-03,
    1e-02, 5e-02,
    1e-01, 5e-01,
    1,
]
# Beta actually passed to the Naive Bayes functions in this run
current_beta = 1e-02

In [74]:
#function for finding the log likely matrix 
def likelihoods(possible_classes, training_data, no_words, current_beta):
    """Compute the log2 of the beta-smoothed word probabilities for each class.

    Parameters
    ----------
    possible_classes : sequence
        Only its length is used; it fixes the number of rows of the result.
    training_data : pandas.DataFrame
        Training rows. The last column is read as the class label and the
        feature columns are whatever ``reshape_train`` keeps.
    no_words : int
        Number of feature (word) columns.
    current_beta : float
        Smoothing value added to every count. It should be positive: with 0 a
        word never seen in a class yields ``log2(0) = -inf``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(possible_classes), no_words)``. Row ``k`` belongs
        to class label ``k + 1`` and holds
        ``log2((count + beta) / (class_total + no_words * beta))``.

    Raises
    ------
    ValueError
        If the number of feature columns differs from ``no_words``.

    Notes
    -----
    Counts are accumulated in a fresh local array, so calling the function
    repeatedly (or re-running the cell) always gives the same result. Rows
    whose label is outside ``1..len(possible_classes)`` are left out of the
    counts and reported once through ``warnings.warn``.
    """
    import warnings

    word_counts = reshape_train(training_data).to_numpy()
    if word_counts.shape[1] != no_words:
        raise ValueError(
            "expected %d word columns but the training data has %d"
            % (no_words, word_counts.shape[1])
        )

    # Class label of every training row, taken positionally from the last column
    labels = training_data.iloc[:, -1].to_numpy()
    n_classes = len(possible_classes)

    # Total occurrences of every word within each class (row k <-> label k + 1)
    class_word_counts = np.zeros((n_classes, no_words))
    for class_idx in range(n_classes):
        in_class = labels == class_idx + 1
        class_word_counts[class_idx] = word_counts[in_class].sum(axis=0)

    n_skipped = int(np.sum((labels < 1) | (labels > n_classes)))
    if n_skipped:
        warnings.warn(
            "%d training rows have a class label outside 1..%d and were ignored"
            % (n_skipped, n_classes)
        )

    # Smoothed estimate: (count + beta) / (words in class + vocabulary size * beta)
    class_totals = class_word_counts.sum(axis=1, keepdims=True)
    smoothed = (class_word_counts + current_beta) / (class_totals + no_words * current_beta)
    return np.log2(smoothed)


In [72]:
def training_acc(testing_data, possible_classes, no_words, current_beta):
    """Fit Naive Bayes on the training split and return accuracy on ``testing_data``.

    Despite its name, the value returned is the accuracy on the frame passed
    in (the held-out split in this notebook), not on the training rows.

    Parameters
    ----------
    testing_data : pandas.DataFrame
        Labeled rows to score. Column 0 is skipped, the last column is read as
        the true class, and the columns in between are the word counts.
    possible_classes : sequence
        Class labels; classes are assumed to be numbered ``1..len(possible_classes)``.
    no_words : int
        Number of word columns.
    current_beta : float
        Smoothing value forwarded to ``likelihoods``.

    Returns
    -------
    float
        Fraction of rows whose predicted class equals the true class.

    Raises
    ------
    ValueError
        If ``testing_data`` has no rows.

    Notes
    -----
    The model is fitted on the module-level ``training_data`` frame, which must
    exist before this function is called.
    """
    n_rows = len(testing_data)
    if n_rows == 0:
        raise ValueError("testing_data is empty; accuracy is undefined")

    prior_probs = priors(training_data, possible_classes)
    log_likely_mtrx = likelihoods(possible_classes, training_data, no_words, current_beta)
    # Entry k is the log prior of class label k + 1, matching the rows of log_likely_mtrx
    log_priors = np.log2([prior_probs[idx + 1] for idx in range(len(possible_classes))])

    correct = 0
    for row in range(n_rows):
        record = testing_data.iloc[row]
        # .iloc keeps these lookups positional; plain [-1] on a label-indexed
        # Series is deprecated in recent pandas
        value_actual = record.iloc[-1]
        word_counts = record.iloc[1:-1].to_numpy(dtype=float)

        # One matrix-vector product scores every class at once
        c_results = log_priors + log_likely_mtrx @ word_counts
        predicted = int(np.argmax(c_results)) + 1
        if value_actual == predicted:
            correct += 1

    return float(correct) / n_rows


In [76]:
# Score the held-out split (testing_data). Despite the variable name, this is
# not accuracy measured on the training rows.
training_accuracy = training_acc(testing_data, possible_classes, no_words, current_beta)


In [71]:
# Fraction of held-out rows that were classified correctly
print (training_accuracy)

0.57


In [None]:
#read the original testing data
# NOTE(review): with the default header handling the first line of testing.csv
# becomes the column names and is not classified -- confirm the file really
# has a header row, otherwise pass header=None.
original_testing_data = pd.read_csv("testing.csv")

def testing_acc(original_testing_data, possible_classes, training_data, no_words, current_beta):
    """Predict a class for every row of the unlabeled test frame.

    Parameters
    ----------
    original_testing_data : pandas.DataFrame
        Rows to classify. Column 0 is taken as the row id and every remaining
        column as a word count.
    possible_classes : sequence
        Class labels; classes are assumed to be numbered ``1..len(possible_classes)``.
    training_data : pandas.DataFrame
        Labeled rows used to estimate the priors and likelihoods.
    no_words : int
        Number of word columns.
    current_beta : float
        Smoothing value forwarded to ``likelihoods``.

    Returns
    -------
    list of list
        ``['id', 'class']`` as the first row, followed by one
        ``[id, predicted_class]`` row per test row, in input order.
    """
    prior_probs = priors(training_data, possible_classes)
    log_likely_mtrx = likelihoods(possible_classes, training_data, no_words, current_beta)
    # Entry k is the log prior of class label k + 1, matching the rows of log_likely_mtrx
    log_priors = np.log2([prior_probs[idx + 1] for idx in range(len(possible_classes))])

    result = [['id', 'class']]
    # Use the ids stored in the file rather than rebuilding them from the row
    # position plus a hard-coded offset
    row_ids = original_testing_data.iloc[:, 0].tolist()
    for row, num_r in enumerate(row_ids):
        word_counts = original_testing_data.iloc[row, 1:].to_numpy(dtype=float)
        c_results = log_priors + log_likely_mtrx @ word_counts
        classified = int(np.argmax(c_results)) + 1
        # Appended inside the loop so every test row gets a prediction
        result.append([num_r, classified])
    return result
#%%   

In [None]:
# testing_acc takes five arguments: the test frame, the class list, the
# training split, the vocabulary size and the beta to smooth with
final_result = testing_acc(original_testing_data, possible_classes, training_data, no_words, current_beta)
# Saving the predicted values to a CSV file in order to upload it to kaggle
output = pd.DataFrame(final_result)
# No pandas header/index in the file: testing_acc is expected to supply the
# 'id','class' title row as the first row of final_result
output.to_csv("predicted_result_naive_bayes_mani.csv", header = False, index= False)

In [None]:
def rank(training_data, no_words, possible classes):
    length_words = 61188
    beta_value = 1/length_words

    alpha = 1+beta_value
    req_training_data = reshape_train(training_data)
    req_training_np = req_training_data.to_numpy()
    # Returns a sparse matrix with positions/index of non-zero elements
    training_pos = np.nonzero(req_training_np)
    likely_mtrx = np.zeros((len(possible_classes), no_words))

    for pos in range(len(training_pos)):
        try:
            pos_r = training_pos[0][pos]
            pos_c = training_pos[1][pos]
            value_at_pos = req_training_np[pos_r][pos_c]
            class_r = training_data.iloc[pos_r][-1]
            likely_mtrx[class_r -1 ][pos_c] += value_at_pos
        except:
            print (training_pos[0][pos], training_pos[1][pos], training_data.iloc[pos_r][-1])
        
    class_wc = np.sum(likely_mtrx, axis=1)
    
    probability_words = np.transpose(((np.transpose(likely_mtrx))+alpha-1)/(class_wc + ((alpha -1)* length_words)))

    # from the function entr scipy
    # It calculates entropy values from the probs of each words
    entr_values = entr(probability_words)

    sum_entr_values = np.sum(entr_values, axis =0)

    # top 100 ranked words indices
    rank_indices = sum_entr_values.argsort()[:100]


    #List of words from the text file
    vocab_list = np.loadtxt("vocabulary.txt", dtype="str")
    
    return vocab_list

In [None]:
vocabulary = rank(training_data, no_words, possible_classes) 

print(list(vocabulary[rank_indices]))