In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install hazm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from __future__ import unicode_literals
from hazm import *
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import math
from operator import itemgetter

In [4]:
test_filename = '/content/drive/My Drive/AI/books_test.csv'
test_data = pd.read_csv(test_filename)
train_filename = '/content/drive/My Drive/AI/books_train.csv'
train_data = pd.read_csv(train_filename)
train_data.describe()

Unnamed: 0,title,description,categories
count,2550,2550,2550
unique,2502,2535,6
top,اثر مرکب,هر انقلابی بر پایه‌ای ایستاده و رو به مقصدی دا...,جامعه‌شناسی
freq,3,3,425


In [5]:
titles=[word.lower() for word in test_data]
titles

['title', 'description', 'categories']

In [6]:
normalizer = Normalizer()
stemmer = Stemmer()
lemmatizer = Lemmatizer()
stopwords = stopwords_list()
stopwords.extend( ['.',',','«','»','(',')','،','[',']','{','}','-','','؛', '\n','\r'])

In [7]:

#remove stop words
words_class_dict = {} # { word : [number of first class, 2, 3, 4, 5,6]}
des = 'description'
cat = 'categories'
all_categories = []
cat_words_dict = {} # { category1:{word1:number, word2: 5, word3: 2, ..} , category2:..  }
for index,row in train_data.iterrows():
    normal_str = normalizer.normalize(row['title']+row[des])
    text = word_tokenize(normal_str)
    # text = (normal_str).split(" ")
    for word in text:
        word = stemmer.stem(word)
        word = lemmatizer.lemmatize(word)
        if word not in stopwords:
            temp_list = [1,1,1,1,1,1]
            # temp_list = [0,0,0,0,0,0]
            if train_data.at[index,cat] in all_categories:
                for category in range(len(all_categories)):
                    if train_data.at[index,cat] == all_categories[category]:
                        if category in cat_words_dict.keys():
                            if word not in cat_words_dict[category].keys():
                                cat_words_dict[category][word] = 1
                            else:
                                cat_words_dict[category][word] = cat_words_dict[category][word]+1
                        else:
                            cat_words_dict[category] = {word:1}
                        if word in words_class_dict.keys():
                            temp_list = words_class_dict[word]
                        temp_list[category] = temp_list[category]+1
                        words_class_dict[word] = temp_list
            elif train_data.at[index,cat] not in all_categories:
                category = train_data.at[index,cat]
                if category in cat_words_dict.keys():
                    if word not in cat_words_dict[category].keys():
                        cat_words_dict[category][word] = 1
                    else:
                        cat_words_dict[category][word] = cat_words_dict[category][word]+1
                else:
                    cat_words_dict[category] = {word:1}
                    # cat_words_dict[category].update(word)
                all_categories.append(train_data.at[index,cat])
                if word in words_class_dict.keys():
                    temp_list = words_class_dict[word]
                temp_list[len(all_categories)-1] = temp_list[len(all_categories)-1]+1
                words_class_dict[word] = temp_list


In [8]:
word_category_prob = {}
for word in words_class_dict.keys():
    temp_list = words_class_dict[word]
    count = 0
    for i in range(6):
        count += temp_list[i]
    prob = [0,0,0,0,0,0]
    for i in range(6):
        prob[i] = temp_list[i]/count
    word_category_prob[word] = prob

In [9]:
# P(Class1 | text) = P(word1| class1) x P(word2| class1)x ... xP(class1)
text_category_prob = {}
for index,row in test_data.iterrows():
    normal_str = normalizer.normalize(row['title']+row[des])
    text = (normal_str).split(" ")
    text_class_prob = [0,0,0,0,0,0]
    for class_num in range(6):
        cond_prob = 1
        for word in text:
            if word not in stopwords:
                if word[len(word)-1] in stopwords:
                    word = word[:len(word)-2]
                elif word[0] in stopwords:
                    word = word[1:]
                if word in word_category_prob.keys():
                    temp_word_prob = word_category_prob[word][class_num]
                    if temp_word_prob >0:
                        cond_prob = cond_prob + math.log(temp_word_prob)
        text_class_prob[class_num] = cond_prob + math.log(1/6)
    text_category_prob[index] = text_class_prob

In [10]:
text_category_dict = {}
num_of_detections = [0,0,0,0,0,0]
for t in text_category_prob.keys():
    temp_list = text_category_prob[t]
    final_category = ""
    max_prob = -1000
    for i in range(6):
        if max_prob < temp_list[i]:
            text_category_dict[t] = all_categories[i]
            max_prob = temp_list[i]
for t in text_category_dict.keys():
    for i in range(6):
        if text_category_dict[t] == all_categories[i]:
            num_of_detections[i] = num_of_detections[i]+1        
print(num_of_detections)
print(text_category_dict)   

[96, 55, 23, 33, 68, 175]
{0: 'رمان', 1: 'داستان کوتاه', 2: 'کلیات اسلام', 3: 'رمان', 4: 'داستان کوتاه', 5: 'جامعه\u200cشناسی', 6: 'کلیات اسلام', 7: 'رمان', 8: 'جامعه\u200cشناسی', 9: 'کلیات اسلام', 10: 'مدیریت و کسب و کار', 11: 'مدیریت و کسب و کار', 12: 'رمان', 13: 'رمان', 14: 'رمان', 15: 'رمان', 16: 'رمان', 17: 'مدیریت و کسب و کار', 18: 'رمان', 19: 'مدیریت و کسب و کار', 20: 'رمان', 21: 'مدیریت و کسب و کار', 22: 'مدیریت و کسب و کار', 23: 'جامعه\u200cشناسی', 24: 'داستان کودک و نوجوانان', 25: 'رمان', 26: 'داستان کودک و نوجوانان', 27: 'رمان', 28: 'داستان کودک و نوجوانان', 29: 'رمان', 30: 'مدیریت و کسب و کار', 31: 'رمان', 32: 'کلیات اسلام', 33: 'رمان', 34: 'جامعه\u200cشناسی', 35: 'کلیات اسلام', 36: 'مدیریت و کسب و کار', 37: 'داستان کوتاه', 38: 'جامعه\u200cشناسی', 39: 'مدیریت و کسب و کار', 40: 'رمان', 41: 'رمان', 42: 'رمان', 43: 'جامعه\u200cشناسی', 44: 'مدیریت و کسب و کار', 45: 'رمان', 46: 'رمان', 47: 'رمان', 48: 'رمان', 49: 'مدیریت و کسب و کار', 50: 'جامعه\u200cشناسی', 51: 'جامعه\u200cشناس

In [11]:
correct_detections = [0,0,0,0,0,0]
num_of_real = [0,0,0,0,0,0]
accuracy = 0
correct = 0
total = 0
wrong_category = []
# print(text_category_dict)
for index,row in test_data.iterrows():
    total += 1
    if index in text_category_dict.keys():
        if row[cat] == text_category_dict[index]:
            correct += 1
        else:
            wrong_category.append(row['title'])
        for i in range(6):
            if row[cat] == text_category_dict[index] and row[cat] == all_categories[i]:
                correct_detections[i] = correct_detections[i]+1
    for i in range(6):
        if row[cat] == all_categories[i]:
            num_of_real[i] = num_of_real[i]+1

In [12]:
accuracy = (correct/total)*100
precision = [0,0,0,0,0,0] #for each category
recall = [0,0,0,0,0,0]
F1 = [0,0,0,0,0,0]

for i in range(6):
    if num_of_detections[i] != 0:
        precision[i] = ((correct_detections[i])/num_of_detections[i])*100
    if num_of_real[i] != 0:
        recall[i] = ((correct_detections[i])/num_of_real[i])*100
    if precision[i] !=0 and recall[i]!= 0:
        F1[i] = 2*((recall[i]*precision[i])/(recall[i]+precision[i]))

macro_F1 = 0
for i in range(6):
    macro_F1 += F1[i]
macro_F1 = macro_F1/6

print(accuracy)
print(precision)
print(recall)

66.44444444444444
[68.75, 89.0909090909091, 95.65217391304348, 69.6969696969697, 95.58823529411765, 42.285714285714285]
[88.0, 65.33333333333333, 29.333333333333332, 30.666666666666664, 86.66666666666667, 98.66666666666667]
