In [1]:
import csv
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
# This function loads the file into a dataframe using pandas.
def load_file(fileName):
    dataset = pd.read_table(fileName, header=0, sep=",", encoding="unicode_escape")
    return dataset

In [3]:
# preprocess creates the term frequency matrix for the review data set
def preprocess(data):
    count_vectorizer = CountVectorizer()
    data = count_vectorizer.fit_transform(data)
    #tfidf_data = TfidfTransformer(use_idf=False).fit_transform(data)
    return data

In [4]:
# This function learns the model by finding class probabilities and conditional probabilities of each word
# given a particular class. The classifier with both these attributes as well as the classes list is returned. 
def learn_model(data,target):
  
    classifier = dict() # Classifier to be returned

    # Create two arrays - one for storing classes and other for storing corresponding frequencies of each class in data
    classes, class_frequency = np.unique(target, return_counts = True)

    # Convert data to array
    corpus = data.toarray()

    # Initialize structures to store class probabilities and conditional probabilities for Naive Bayes classifier
    class_probs = dict()
    conditional_probs = np.zeros((len(classes), corpus.shape[1]), dtype = float)

    # Loop runs for each class
    for i in range(len(classes)):

      # Finding probability of each class in the given dataset using frequencies
      class_probs[classes[i]] = (class_frequency[i] / np.sum(class_frequency))
      
      # Finding all docs in the data for the particular class in this iteration of loop
      docs_in_class = np.where(target == classes[i])
      corpus_for_class = (np.take(corpus, axis = 0, indices=docs_in_class))[0]

      # Finding conditional probabilites for each word given this particular class after Laplace Smoothing
      conditional_probs[i] = (np.sum(corpus_for_class, axis = 0) + 1) # the + 1 incorporates for Laplace smoothing
      conditional_probs[i] /= np.sum(conditional_probs[i])
 
    # The classifier is set up before returning  
    classifier["Conditional Probabilites"] = conditional_probs
    classifier["Class Probabilites"] = class_probs
    classifier["Classes"] = classes
 
    return classifier


In [5]:
# This function takes in the classifier and uses that to predict the class of the testdata
def classify(classifier, testdata):
    
    # The array to return from this functions
    predicted_val=[]

    # Convert testdata to array
    corpus = testdata.toarray()

    # Dereferencing data from the classifier
    conditional_probabilities = classifier["Conditional Probabilites"]
    class_probs = classifier["Class Probabilites"]
    classes = classifier["Classes"]

    # This loop runs for each doc in testdata
    for i in range(corpus.shape[0]):

      # Finding words that exist in this particular doc
      words_in_doc = np.where(corpus[i] > 0)

      # Initializing array of predicted probabilities for each class this doc could belong to
      predicted_probs = np.zeros(6, dtype = float)

      # This loop runs for each possible class the doc could belong to
      for j in range(len(classes)):

        # We choose the conditional probabilities given this particular class for the words in this particular doc
        chosen_probs = (np.take(conditional_probabilities[j], indices=words_in_doc))[0]

        # We multiply all the conditional probabilities and the probability of this particular class
        predicted_probs[j] = np.prod(chosen_probs) * class_probs[classes[j]]
      
      # Finding the maximum likelihood of the class that this particular doc belongs to, choosing the max one to append 
      max_index = np.where(predicted_probs == np.amax(predicted_probs))
      predicted_val.append(classes[max_index][0])

    return predicted_val

In [8]:
# This function evaluates the metrics of our learned model
def evaluate(actual_class, predicted_class):
    
    # Generate the confusion matrix and get its dimension
    conf_mat = confusion_matrix(actual_class, predicted_class)
    dim = conf_mat.shape[0]

    # Initializing True-Positive, True-Negative, False-Positive, False-Negative. 
    # These metrics will be found in arrays corresponding to each class label. 
    TP = []
    TN = []
    FP = []
    FN = []

    # Calculating TP from confusion matrix -> TP of each class = its respective diagonal 
    for i in range(dim):
      for j in range(dim):
        if(i == j):
          TP.append(conf_mat[i][j])

    # Calculating the accuracy of the model as a whole
    accuracy = np.sum(TP) / len(actual_class)
    

    # Calculating TN from confusion matrix. This is different for each class label as negative would mean predicting
    # any class, other than the one in question, correctly. 
    for i in range(dim):
      total = 0
      for j in range(dim):
        for k in range(dim):
          if(j != i and k != i):
            total += conf_mat[j][k]
      TN.append(total)

    
    # Calculating FP from confusion matrix. This is different for each class label due to similar reasons as TN.
    for i in range(dim):
      total = 0
      for j in range(dim):
        if(j != i):
          total += conf_mat[i][j]

      FP.append(total)
        
    # Calculating FN from confusion matrix. This is different for each class label due to similar reasons as TN.    
    for j in range(dim):
      total = 0
      for i in range(dim):
        if(j != i):
          total += conf_mat[i][j]

      FN.append(total)


    # Initializing arrays. These metrics would be different for each class label due to difference in TN, FP, FN.

    precision = [] # TP / (TP + FP)
    recall = [] # TP / (TP + FN)
    f_measure = [] # 2TP / (2TP + FP + FN)

    
    for i in range(dim):
      if(TP[i] != 0):
        precision.append(TP[i] / (TP[i] + FP[i]))
        recall.append(TP[i] / (TP[i] + FN[i]))
        f_measure.append((2* TP[i]) / ((2 * TP[i]) + FP[i] + FN[i]))

      else:
        precision.append(0)
        recall.append(0)
        f_measure.append(0)
    
    print("The accuracy score for the overall model is :",accuracy)
    print(f"The accuracy percentage for the overall model is: {accuracy*100} %")
    print("The precision score for each label is :",precision)
    print("The recall score for each label is :",recall)
    print("The F measure score for each label is :",f_measure)


In [9]:
features = ["SUMMARY", "categories", "sub_categories"]

print("Loading data.....")
dataset = load_file("TextClassification_Data.csv")

dataset = dataset.dropna() #added

data,target = dataset[features[0]].fillna(" "), dataset[features[1]]

print("preprocessing data.....")
word_vectors = preprocess(data)

trainingX,testX,trainingY,testY = train_test_split(word_vectors,target,test_size=0.4,random_state=43)

print("Learning model.....")
model = learn_model(trainingX,trainingY)

print("Classifying test data......")      
predictedY = classify(model, testX)

print("Evaluating results.....")
evaluate(testY,predictedY)

Loading data.....
preprocessing data.....
Learning model.....
Classifying test data......
Evaluating results.....
The accuracy score for the overall model is : 0.7341707611013256
The accuracy percentage for the overall model is: 73.41707611013256 %
The precision score for each label is : [0.795196671709531, 0.6357758620689655, 0, 0.6395759717314488, 0.7037481979817396, 0.8080650744202146]
The recall score for each label is : [0.8133462282398453, 0.6758304696449027, 0, 0.7288590604026846, 0.6130179991628296, 0.8090452261306532]
The F measure score for each label is : [0.8041690571811054, 0.6551915602443087, 0, 0.6813048933500627, 0.6552572706935123, 0.8085548532340462]
