In [1]:
import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
import numpy as np


In [2]:
def load_file(fileName):
    dataset = pd.read_table(fileName, header=0, sep=",", encoding="unicode_escape")
    
    return dataset

In [4]:
# preprocess creates the term frequency matrix for the review data set
def preprocess(data):
    count_vectorizer = CountVectorizer()
    data = count_vectorizer.fit_transform(data)
    #tfidf_data = TfidfTransformer(use_idf=False).fit_transform(data)

    return data

In [12]:
def learn_model(data,target):
  
    classifier = None
    #Your custom implementation of NaiveBayes classifier will go here.
    
    # count vector matrix
    count_vector = data.toarray()
    # N, |V|
    Total_documents, Vocabulary_size = len(count_vector), len(count_vector[0])
    # print(Total_documents, Vocabulary_size)
    
    # get all individual classes
    classes = list(np.unique(target))
    # P(c_i)
    Prob_Classes = np.zeros(len(classes))

    # P(w_i/c_j) - P(each_word / given a class) 
    Prob_wi_by_cj = np.zeros((len(classes),Vocabulary_size))

    #
    for each_class in range(len(classes)):
        # count of documents that have been mapped to this category c_j
        docs_with_c_j = []
        for doc in range(len(count_vector)):
            if(target.iloc[doc] == classes[each_class]):
                docs_with_c_j.append(count_vector[doc])
        # Prob(c) of given class / P(c_j)
        Prob_Classes[each_class] = len(docs_with_c_j) / Total_documents

        # calculate the total count of each word
        #  this is docs_with_c_j below
        # doc  | word1 word2 word3 ... | class
        # doc1 |   1    0     1    ... |  c_j
        # doc2 |   1    0     0    ... |  c_j
        # doc3 |   0    1     1    ... |  c_j
        # doc4 |   1    1     0    ... |  c_j
        #  ...                     ...
        # = [sum(word1), sum(word2), sum(word3), ... vocab_size]
        count_wi_by_cj = np.sum(docs_with_c_j, axis=0) # Count(w_i, c_j) for all w_i
        count_wi_by_cj += 1 # laplace smoothing
        Sum_Prob_wi_by_cj = np.sum(count_wi_by_cj) # summation(count(w, c_j))
        Prob_wi_by_cj[each_class] = count_wi_by_cj / Sum_Prob_wi_by_cj # P(w_i | c_j)

    Class_Prob_Dict = dict()
    for each_class in range(len(classes)):
        Class_Prob_Dict[classes[each_class]] = Prob_Classes[each_class]
    
    classifier = (classes, Class_Prob_Dict, Prob_wi_by_cj) # c's, P(c), P(w_i|c_j)

    return classifier

In [13]:
def classify(classifier, testdata):
    
    predicted_val=[]
    #Your code to classify test data using the learned model will go here
  
    return predicted_val

In [14]:
def evaluate(actual_class, predicted_class):
        
    accuracy = -1    
    #Your code to evaluate the model will go here. The code will print overall model's accuracy and precision 
    #and recall for each class label.
    
    print("The accuracy score is :",accuracy)
    

In [15]:
features = ["SUMMARY", "categories", "sub_categories"]

print("Loading data.....")
dataset = load_file("TextClassification_Data.csv")
data,target = dataset[features[0]].fillna(" "), dataset[features[1]]

print("preprocessing data.....")
word_vectors = preprocess(data)

trainingX,testX,trainingY,testY = train_test_split(word_vectors,target,test_size=0.4,random_state=43)

print("Learning model.....")
model = learn_model(trainingX,trainingY)

print("Classifying test data......")      
predictedY = classify(model, testX)

print("Evaluating results.....")
evaluate(testY,predictedY)

# print(metrics.accuracy_score(testY,predictedY))
# print(metrics.recall_score(predictedY, testY,average = 'micro'))
# print(metrics.precision_score(predictedY, testY,average = 'micro'))
# print(metrics.f1_score(predictedY,testY,average = 'micro'))

Loading data.....
preprocessing data.....
Learning model.....
Classifying test data......
Evaluating results.....
The accuracy score is : -1


In [None]:
# P(ci) = [Num documents that have been classified as ci] / [Num documents]
# where ci = a calss in label column

# for P(a specific word / class j)
# P ( wi | cj ) = [ count( wi, cj ) + 1 ] / [ Σw∈V( count ( w, cj ) ) + |V| ]
# The probability of word i given class j is the count that the word occurred in documents of class j,
# divided by the sum of the counts of each word in our vocabulary in class j refers the above probability.
# So for the denominator, we iterate through each word in our vocabulary,
# look up the frequency that it has occurred in class j, and add these up.

