In [1]:
import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
import numpy as np


In [2]:
def load_file(fileName):
    """Load a comma-separated text file into a pandas DataFrame.

    Args:
        fileName: Path (or file-like object) of the delimited file to read.
            The first row is used as the column header.

    Returns:
        pandas.DataFrame holding the parsed contents. The file is decoded
        with the ``unicode_escape`` codec.
    """
    # read_csv with an explicit comma separator is the same parser that
    # read_table(sep=",") dispatches to.
    return pd.read_csv(
        fileName,
        sep=",",
        header=0,
        encoding="unicode_escape",
    )

In [3]:
# preprocess creates the term frequency matrix for the review data set
def preprocess(data):
    """Build the raw term-count (bag-of-words) matrix for a set of documents.

    Args:
        data: Iterable of text documents, one string per document.

    Returns:
        The document-term matrix produced by ``CountVectorizer.fit_transform``:
        one row per document, one column per vocabulary term, each cell
        holding the number of occurrences of that term in that document.
    """
    # A fresh vectorizer is fitted on every call, so the vocabulary is
    # learned from exactly the documents passed in.
    vectorizer = CountVectorizer()
    term_counts = vectorizer.fit_transform(data)
    return term_counts

In [4]:
def learn_model(data,target):
    """Train a multinomial Naive Bayes model with Laplace (add-one) smoothing.

    Args:
        data: Document-term count matrix, one row per document and one column
            per vocabulary term. May be a SciPy sparse matrix (kept sparse,
            never densified) or anything ``np.asarray`` turns into a 2-D array.
        target: Class label of every row of ``data``, matched by position.
            Any 1-D array-like works (pandas Series, list, ndarray).

    Returns:
        Tuple ``(classes, Class_Prob_Dict, Prob_wi_by_cj)`` where
        ``classes`` is the sorted list of distinct labels,
        ``Class_Prob_Dict`` maps each label to its prior P(c), and
        ``Prob_wi_by_cj`` is a ``(len(classes), vocabulary_size)`` array whose
        row ``j`` holds the smoothed P(w_i | c_j) for ``classes[j]``.

    Raises:
        ValueError: If ``data`` is not a non-empty 2-D matrix or the number of
            labels differs from the number of rows.
    """
    # Keep sparse input sparse: only per-class column sums are needed, so
    # materialising the full dense matrix would just waste memory.
    if hasattr(data, "tocsr"):
        counts = data.tocsr()
    else:
        counts = np.asarray(data)
    # Positional labels; this drops any pandas index left over from a split.
    labels = np.asarray(target)

    if counts.ndim != 2 or counts.shape[0] == 0:
        raise ValueError("data must be a non-empty 2-D document-term matrix")
    # N, |V|
    Total_documents, Vocabulary_size = counts.shape
    if labels.shape != (Total_documents,):
        raise ValueError("target must contain exactly one label per row of data")

    # get all individual classes
    classes = list(np.unique(labels))
    # P(c_j)
    Prob_Classes = np.zeros(len(classes))
    # P(w_i | c_j)
    Prob_wi_by_cj = np.zeros((len(classes), Vocabulary_size))

    for class_index, label in enumerate(classes):
        # Boolean mask of the documents labelled with this class.
        in_class = labels == label
        Prob_Classes[class_index] = in_class.sum() / Total_documents

        # Count(w_i, c_j) for every w_i. Sparse matrices return a 1 x |V|
        # matrix here, hence the asarray + ravel.
        word_counts = np.asarray(counts[in_class].sum(axis=0), dtype=np.float64).ravel()
        word_counts += 1  # laplace smoothing
        # (count + 1) / (sum(count) + |V|)
        Prob_wi_by_cj[class_index] = word_counts / word_counts.sum()

    Class_Prob_Dict = dict(zip(classes, Prob_Classes))

    # c's, P(c), P(w_i|c_j)
    return (classes, Class_Prob_Dict, Prob_wi_by_cj)

In [5]:
def classify(classifier, testdata):
    """Predict a class label for every document using a learned Naive Bayes model.

    Each class is scored as log P(c) plus the sum of log P(w | c) over the
    distinct words present in the document (a word counts once no matter how
    often it occurs). Scoring happens in log space because the plain product
    of many small probabilities underflows to 0.0 for longer documents, which
    makes every class tie and silently forces the first class to win.

    Args:
        classifier: Tuple ``(classes, Class_Prob_Dict, Prob_wi_by_cj)`` as
            returned by ``learn_model``.
        testdata: Document-term matrix with the same column layout used for
            training; a SciPy sparse matrix or a dense 2-D array-like.

    Returns:
        List with one predicted label per row of ``testdata``. Ties are
        resolved in favour of the class that comes first in ``classes``.
    """
    classes, Class_Prob_Dict, Prob_wi_by_cj = classifier

    # A zero probability legitimately maps to -inf; silence the log warning.
    with np.errstate(divide="ignore"):
        log_prior = np.log(
            np.array([Class_Prob_Dict[label] for label in classes], dtype=np.float64)
        )
        log_likelihood = np.log(np.asarray(Prob_wi_by_cj, dtype=np.float64))

    if hasattr(testdata, "toarray"):
        documents = testdata.toarray()
    else:
        documents = np.asarray(testdata)

    predicted_val = []
    for document in documents:
        # Only the presence of a word matters, not its count.
        present = document.astype(bool)
        # log P(c|d) is proportional to log P(c) + sum over present w of log P(w|c)
        scores = log_prior + log_likelihood[:, present].sum(axis=1)
        # argmax returns the first maximum, i.e. the lowest class index on a tie.
        predicted_val.append(classes[int(np.argmax(scores))])

    return predicted_val

In [6]:
def evaluate(actual_class, predicted_class):
    """Print the confusion matrix, accuracy and per-class recall/precision/F-measure.

    The label set is the union of the labels seen in ``actual_class`` and
    ``predicted_class``. A prediction for a label that never occurs in the
    ground truth therefore still counts as an error instead of being dropped
    from the totals, which would inflate the accuracy.

    Args:
        actual_class: Ground-truth labels, any 1-D array-like (pandas Series,
            list, ndarray); matched to predictions by position.
        predicted_class: Predicted labels, same length as ``actual_class``.

    Returns:
        None. Results are printed. Rows of the confusion matrix are actual
        classes and columns are predicted classes, both in sorted label order.
        A metric whose denominator is zero is reported as 0.

    Raises:
        ValueError: If the inputs are empty, not 1-D, or differ in length.
    """
    actual = np.asarray(actual_class)
    predicted = np.asarray(predicted_class)
    if actual.ndim != 1 or actual.shape != predicted.shape:
        raise ValueError("actual_class and predicted_class must be 1-D and of equal length")
    if actual.size == 0:
        raise ValueError("cannot evaluate an empty set of predictions")

    # Encode every label as its position in the sorted union of labels.
    classes, codes = np.unique(np.concatenate([actual, predicted]), return_inverse=True)
    codes = codes.ravel()
    actual_codes = codes[:actual.size]
    predicted_codes = codes[actual.size:]

    # confusion matrix, built in one pass; add.at accumulates repeated index pairs
    confusion_matrix = np.zeros((len(classes), len(classes)))
    np.add.at(confusion_matrix, (actual_codes, predicted_codes), 1)

    true_positives = np.diag(confusion_matrix)
    actual_totals = confusion_matrix.sum(axis=1)      # row sums: samples per actual class
    predicted_totals = confusion_matrix.sum(axis=0)   # column sums: samples per predicted class

    # accuracy
    accuracy = float(true_positives.sum() / confusion_matrix.sum())

    # recall
    recall_lst = [
        float(tp / total) if total else 0
        for tp, total in zip(true_positives, actual_totals)
    ]

    # precision
    precision_lst = [
        float(tp / total) if total else 0
        for tp, total in zip(true_positives, predicted_totals)
    ]

    # f-measure
    f_measure_lst = [
        (2 * precision * recall) / (precision + recall) if precision + recall != 0 else 0
        for precision, recall in zip(precision_lst, recall_lst)
    ]

    print(confusion_matrix)
    print("The accuracy score is :", accuracy)
    print("The recall score is :", recall_lst)
    print("The precision score is :", precision_lst)
    print("The f-mesaure is :", f_measure_lst)
    

In [7]:
# End-to-end run: load the CSV, vectorize the text, split, train, predict, evaluate.
# Column names used from the CSV: features[0] is the text input and features[1]
# is the label; features[2] ("sub_categories") is not used in this cell.
features = ["SUMMARY", "categories", "sub_categories"]

print("Loading data.....")
dataset = load_file("TextClassification_Data.csv")
# Missing summaries are replaced by a single space so the vectorizer only sees strings.
data,target = dataset[features[0]].fillna(" "), dataset[features[1]]

print("preprocessing data.....")
# The vocabulary is fitted on the full data set, before the train/test split below.
word_vectors = preprocess(data)

# Hold out 40% of the rows for testing; the fixed random_state keeps the split reproducible.
trainingX,testX,trainingY,testY = train_test_split(word_vectors,target,test_size=0.4,random_state=43)

print("Learning model.....")
model = learn_model(trainingX,trainingY)

print("Classifying test data......")      
predictedY = classify(model, testX)

print("Evaluating results.....")
# Prints the confusion matrix and the accuracy / recall / precision / F-measure lines.
evaluate(testY,predictedY)

Loading data.....
preprocessing data.....
Learning model.....
Classifying test data......
Evaluating results.....
[[4.196e+03 4.210e+02 0.000e+00 7.200e+01 4.780e+02 4.420e+02]
 [3.200e+02 2.912e+03 0.000e+00 1.000e+02 6.310e+02 6.780e+02]
 [0.000e+00 0.000e+00 0.000e+00 0.000e+00 9.000e+00 1.000e+00]
 [1.150e+02 1.350e+02 0.000e+00 1.085e+03 2.890e+02 1.050e+02]
 [3.430e+02 4.250e+02 0.000e+00 1.020e+02 3.044e+03 9.690e+02]
 [1.580e+02 3.940e+02 0.000e+00 1.050e+02 4.710e+02 4.912e+03]]
The accuracy score is : 0.7048271648044693
The recall score is : [0.7480834373328579, 0.6274509803921569, 0.0, 0.6275303643724697, 0.623387261929142, 0.8132450331125828]
The precision score is : [0.8176149649259548, 0.6792628878003266, 0, 0.7411202185792349, 0.6184477854530679, 0.6911495708456451]
The f-mesaure is : [0.7813052788380969, 0.6523297491039426, 0, 0.6796116504854369, 0.6209077001529832, 0.7472427169696508]
