In [None]:
import csv
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

In [None]:
def load_file(fileName):
    """Read a comma-separated text file into a pandas DataFrame.

    The first line of the file is used as the column header and the bytes are
    decoded with the ``unicode_escape`` codec.

    Parameters
    ----------
    fileName : path or file-like object handed to ``pd.read_table``.

    Returns
    -------
    pandas.DataFrame with one column per comma-separated field.
    """
    return pd.read_table(
        fileName,
        sep=",",
        header=0,
        encoding="unicode_escape",
    )

In [None]:
def preprocess(data):
    """Create the term-frequency matrix for the review data set.

    A new ``CountVectorizer`` is fitted on ``data`` and the transformed
    result of that same call is returned.

    Parameters
    ----------
    data : iterable of text documents.

    Returns
    -------
    The matrix produced by ``CountVectorizer().fit_transform(data)``.
    """
    return CountVectorizer().fit_transform(data)

In [None]:
class NaiveBayesClassifier():
    """Naive Bayes classifier over 0/1 feature values.

    For every label and every feature column the model stores the
    Laplace-smoothed probability of seeing the value 0 and of seeing the
    value 1 among the training rows of that label. Feature values greater
    than 1 are ignored both when learning and when predicting.

    Attributes
    ----------
    labels : pandas Index of the distinct labels, in ``value_counts`` order.
    array_prob : ndarray of shape (n_labels, n_features, 2);
        ``array_prob[l, f, v]`` is the smoothed P(feature f == v | label l).
    probability_Y : dict mapping each label to its smoothed prior.
    predicted_label : list holding the result of the latest ``predict`` call.
    """

    def __init__(self):
        self.labels = None
        self.array_prob = None
        self.probability_Y = dict()
        self.predicted_label = []

    @staticmethod
    def _to_dense(matrix):
        """Return ``matrix`` as a dense ndarray (sparse input is expanded via ``toarray``)."""
        if hasattr(matrix, "toarray"):
            return np.asarray(matrix.toarray())
        return np.asarray(matrix)

    def calculate_probability(self, count, possible_values, total, laplacian=True):
        """Return ``count / total``, add-one smoothed unless ``laplacian`` is False.

        Parameters
        ----------
        count : number of matching observations (scalar or ndarray).
        possible_values : number of distinct values the variable can take;
            only used by the smoothed form.
        total : number of observations considered.
        laplacian : when True (default) return (1 + count) / (possible_values + total).
        """
        if laplacian:
            return (1 + count) / (possible_values + total)
        return count / total

    def learn(self, data, target):
        """Estimate label priors and per-feature conditional probabilities.

        Parameters
        ----------
        data : 2-D feature matrix (rows are samples); either an object with a
            ``toarray`` method or anything ``np.asarray`` turns into a 2-D array.
        target : pandas Series with one label per row of ``data``.

        Returns
        -------
        ndarray of shape (n_labels, n_features, 2), also stored in ``array_prob``.
        """
        # Priors P(Y): count of each label over the total number of labelled rows.
        label_counts = target.value_counts()
        self.labels = label_counts.index
        each_label_count = label_counts.array
        total_label_count = sum(each_label_count)  # denominator of P(Yi)
        n_labels = len(self.labels)

        # Start from an empty dict so a second call to learn() leaves no stale labels.
        self.probability_Y = dict()
        for label, count in zip(self.labels, each_label_count):
            self.probability_Y[label] = self.calculate_probability(
                count, n_labels, total_label_count
            )

        data_matrix = self._to_dense(data)
        target_values = np.asarray(target)
        n_features = data_matrix.shape[1]
        self.array_prob = np.zeros((n_labels, n_features, 2))

        for label_index, label in enumerate(self.labels):
            # Training rows whose label matches; their count is the denominator of P(Xi | Y).
            matched_rows = data_matrix[target_values == label]
            matched_count = matched_rows.shape[0]
            for value in (0, 1):  # values greater than 1 are ignored
                occurrences = (matched_rows == value).sum(axis=0)
                # A feature has two possible values here (0 or 1), so the
                # smoothing term is 2 for seen and unseen values alike.
                self.array_prob[label_index, :, value] = self.calculate_probability(
                    occurrences, 2, matched_count
                )
        return self.array_prob

    def predict(self, testdata):
        """Return the most probable label for every row of ``testdata``.

        Scores are accumulated as sums of log-probabilities: multiplying one
        probability per feature underflows to 0.0 for wide matrices, which
        makes every label tie and the first label always win.

        Parameters
        ----------
        testdata : 2-D feature matrix with the same columns used in ``learn``.

        Returns
        -------
        list with one predicted label per row (also stored in ``predicted_label``).
        """
        testdata_matrix = self._to_dense(testdata)
        log_prob = np.log(self.array_prob)
        log_prior = np.log([self.probability_Y[label] for label in self.labels])

        # Only cells equal to 0 or 1 contribute; larger counts are skipped.
        is_zero = (testdata_matrix == 0).astype(float)
        is_one = (testdata_matrix == 1).astype(float)
        scores = (
            np.dot(is_zero, log_prob[:, :, 0].T)
            + np.dot(is_one, log_prob[:, :, 1].T)
            + log_prior
        )  # shape: (n_rows, n_labels)

        # argmax returns the first maximum, matching the label order on ties.
        best = np.argmax(scores, axis=1)
        self.predicted_label = [self.labels[index] for index in best]
        return self.predicted_label

In [None]:
def learn_model(data, target):
    """Train a fresh NaiveBayesClassifier and return it.

    Parameters
    ----------
    data : training feature matrix, forwarded to ``NaiveBayesClassifier.learn``.
    target : training labels, forwarded to ``NaiveBayesClassifier.learn``.

    Returns
    -------
    The fitted NaiveBayesClassifier instance.
    """
    model = NaiveBayesClassifier()
    model.learn(data, target)
    return model

In [None]:
def classify(classifier, testdata):
    """Return the predictions of ``classifier`` for ``testdata``.

    Parameters
    ----------
    classifier : object exposing a ``predict(testdata)`` method.
    testdata : input forwarded unchanged to ``classifier.predict``.

    Returns
    -------
    Whatever ``classifier.predict(testdata)`` returns.
    """
    return classifier.predict(testdata)

In [None]:
def confusionMatrix(actual_class, predicted):
    """Build a square confusion matrix of actual versus predicted labels.

    Rows index the actual label and columns the predicted label, both in the
    sorted order of the distinct values found in ``actual_class``. A predicted
    label that never occurs in ``actual_class`` has no column, so that sample
    is not counted.

    Parameters
    ----------
    actual_class : object with an ``.array`` attribute holding the true labels.
    predicted : sequence of predicted labels, read by position.

    Returns
    -------
    ndarray of floats with shape (n_labels, n_labels).
    """
    truth = actual_class.array
    classes = np.unique(truth)
    size = len(classes)
    matrix = np.zeros((size, size))
    for position in range(len(truth)):
        # Each lookup yields an index array (empty when the label is unknown).
        row = np.where(classes == truth[position])[0]
        col = np.where(classes == predicted[position])[0]
        matrix[row, col] += 1
    return matrix
    
def precision(label, cm):
    """Precision of one class: true positives / (true positives + false positives).

    Parameters
    ----------
    label : integer index of the class in ``cm``.
    cm : square confusion matrix with actual classes on rows and predicted
        classes on columns.

    Returns
    -------
    The precision of ``label``, or 0.0 when the class was never predicted
    (its column sums to zero), so the result is never nan.
    """
    predicted_total = cm[:, label].sum()  # true positives + false positives
    if predicted_total == 0:
        # 0/0 would give nan and poison any average built on top of it.
        return 0.0
    return cm[label, label] / predicted_total

def precision_all(cm):
    """Return the unweighted mean of ``precision`` over every class in ``cm``.

    Parameters
    ----------
    cm : square confusion matrix; ``len(cm)`` is taken as the class count.
    """
    class_count = len(cm)
    per_class = [precision(index, cm) for index in range(class_count)]
    return sum(per_class) / class_count

def recall(label, cm):
    """Recall of one class: true positives / (true positives + false negatives).

    Parameters
    ----------
    label : integer index of the class in ``cm``.
    cm : square confusion matrix with actual classes on rows and predicted
        classes on columns.

    Returns
    -------
    The recall of ``label``, or 0.0 when the class has no actual samples
    (its row sums to zero), so the result is never nan.
    """
    actual_total = cm[label, :].sum()  # true positives + false negatives
    if actual_total == 0:
        # 0/0 would give nan and poison any average built on top of it.
        return 0.0
    return cm[label, label] / actual_total

def recall_all(cm):
    """Return the unweighted mean of ``recall`` over every class in ``cm``.

    Parameters
    ----------
    cm : square confusion matrix; ``len(cm)`` is taken as the class count.
    """
    class_count = len(cm)
    per_class = [recall(index, cm) for index in range(class_count)]
    return sum(per_class) / class_count

def evaluate(actual_class, predicted_class):
    """Print the confusion matrix, macro precision, macro recall and f-measure.

    Parameters
    ----------
    actual_class : true labels, passed to ``confusionMatrix`` (which reads
        its ``.array`` attribute).
    predicted_class : predicted labels, in the same order as ``actual_class``.

    Returns
    -------
    None; the metrics are only printed.
    """
    # Use the arguments, not the notebook globals, so the function scores
    # whatever pair of label sequences the caller passes in.
    cm = confusionMatrix(actual_class, predicted_class)
    p = precision_all(cm)
    r = recall_all(cm)
    # Harmonic mean of precision and recall; defined as 0 when both are 0
    # to avoid a 0/0 division.
    f_measure = (2*p*r)/(p+r) if (p+r) != 0 else 0.0
    print("The confusion matrix is: \n",cm)
    print("The precision score is :",p)
    print("The recall score is :",r)
    print("The f-measure score is :",f_measure)

In [None]:
# Column names of interest: text column, label column, finer-grained label column.
features = ["SUMMARY", "categories", "sub_categories"]

print("Loading data.....")
dataset = load_file("TextClassification_half.csv")
# Missing texts become a single space so every row can be vectorised.
data = dataset[features[0]].fillna(" ")
target = dataset[features[1]]

print("preprocessing data.....")
word_vectors = preprocess(data)

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
trainingX, testX, trainingY, testY = train_test_split(
    word_vectors,
    target,
    test_size=0.25,
    random_state=43,
)

print("Learning model.....")
model = learn_model(trainingX, trainingY)

In [None]:
print("Classifying test data......")

# Predict a label for every held-out row with the trained model.
predictedY = classify(classifier=model, testdata=testX)

In [None]:
print("Evaluating results.....")
# Compare the held-out labels with the predictions and print the metrics.
evaluate(actual_class=testY, predicted_class=predictedY)