# IR Vector Space Model (TF-IDF)

In [None]:
!pip install bs4
!pip install lxml
!pip install hazm

In [None]:
from os import listdir
from os.path import isfile, join
import math
import bs4
import pandas as pd
import hazm as hzm
import ast
normalizer = hzm.Normalizer()

In [None]:
# Directory that holds the corpus files to index.
mypath = 'dataset/Corpus/2007/'
# Keep regular files only and leave out the DTD, which is not a document file.
# Filtering here (rather than calling list.remove afterwards) means a
# directory without 'hamshahri.dtd' no longer raises ValueError.
dataset = [
    f for f in listdir(mypath)
    if isfile(join(mypath, f)) and f != 'hamshahri.dtd'
]

In [None]:
# Parse every corpus file and collect the documents that carry a TEXT element.
# dic maps the DOCID text to {'title', 'body', 'cat'} for that document.
dic = {}
for data in dataset:
    # The context manager closes each handle; previously the handle was lost
    # when the name was rebound to the file contents.
    # NOTE(review): the corpus is assumed to be UTF-8 encoded - confirm.
    with open(join(mypath, data), 'r', encoding='utf-8') as raw_file:
        bs = bs4.BeautifulSoup(raw_file.read(), 'xml')

    for el in bs.find_all('DOC'):
        if el.TEXT:
            body = el.TEXT.text
            # Strip the text of nested IMAGE elements out of the body.
            for im in el.TEXT.find_all('IMAGE'):
                body = body.replace(im.text, '')
            dic[el.DOCID.text] = {
                'title': el.TITLE.text,
                'body': body,
                'cat': el.CAT.text,
            }

# Notebook display: number of documents collected.
len(dic)

In [None]:

# Build per-document term frequencies and corpus-wide inverse document
# frequencies, then persist both tables as CSV files.
from collections import Counter

# doc_freq[term] = number of documents whose title or body contains the term.
# It is filled during the single pass over the documents, which replaces the
# former scan of every document for every vocabulary term.
doc_freq = Counter()

for key in dic:
    normalized_title = normalizer.normalize(dic[key]['title'])
    normalized_body = normalizer.normalize(dic[key]['body'])
    tokens = hzm.word_tokenize(normalized_title) + hzm.word_tokenize(normalized_body)

    counts = Counter(tokens)
    # Each distinct term counts once per document.
    doc_freq.update(counts.keys())
    # Stored as a plain dict (not a Counter) so that its repr written to the
    # CSV is a dict literal. Relative frequency = occurrences / document length;
    # an empty document yields an empty dict, so no division takes place.
    dic[key]['tf'] = {term: count / len(tokens) for term, count in counts.items()}

vocabulary = set(doc_freq)

ID = list(dic)
TF = [dic[key]['tf'] for key in ID]

DF_REF = {'ID': ID, 'TF': TF}
df_tf = pd.DataFrame(DF_REF)

# IDF = ln(N / document frequency). Every vocabulary term occurs in at least
# one document, so the divisor is never zero.
terms = list(vocabulary)
IDF = [math.log(len(dic) / doc_freq[term]) for term in terms]
DF_REF = {'TERM': terms, 'IDF': IDF}
df_idf = pd.DataFrame(DF_REF)

df_tf.to_csv('ID_TF.csv')
df_idf.to_csv('TERM_IDF.csv')

In [None]:
# Reload the term -> IDF table written to 'TERM_IDF.csv'.
# TERM is forced to str and the default NA markers are disabled so that terms
# such as "NA", "null" or "nan" stay ordinary strings instead of becoming NaN
# index labels that could never match a token.
df_idf = pd.read_csv('TERM_IDF.csv', dtype={'TERM': str}, keep_default_na=False)
# 'Unnamed: 0' is the row index that to_csv wrote as the first column.
df_idf.drop('Unnamed: 0',axis=1,inplace=True)
df_idf.set_index('TERM',inplace=True)
df_idf

In [None]:
# Reload the per-document term frequencies and turn them into TF-IDF weights.
df_tf = pd.read_csv('ID_TF.csv')
# 'Unnamed: 0' is the row index that to_csv wrote as the first column.
df_tf.drop('Unnamed: 0',axis=1,inplace=True)

# One dict lookup per term instead of a DataFrame .loc access per term.
idf_by_term = df_idf['IDF'].to_dict()

# Each TF cell is the text of a dict literal; parse it and multiply every
# frequency by the term's IDF (0 when the term is not in the IDF table).
# The whole column is assigned at once: writing through `df_tf.loc[i].TF = ...`
# targets a temporary row Series and is not guaranteed to update df_tf.
df_tf['TF'] = [
    {term: tf * idf_by_term.get(term, 0)
     for term, tf in ast.literal_eval(serialized).items()}
    for serialized in df_tf['TF']
]
df_tf.rename(columns = {'TF':'WEIGHT'},inplace=True)
df_tf

## Read the queries

In [None]:
# Load the query file and map each query's integer ID to its TITLE text.
query_path = join('dataset','query-HAM2-FA-ENG','query-HAM2-FA.txt')
# The context manager closes the handle, which was previously left open.
# NOTE(review): the query file is assumed to be UTF-8 encoded - confirm.
with open(query_path, 'r', encoding='utf-8') as query_file:
    queries_raw = query_file.read()

bs = bs4.BeautifulSoup(queries_raw,'xml')

queries = {int(q.ID.text): q.TITLE.text for q in bs.find_all('QUERY')}

# Notebook display: number of queries loaded.
len(queries)

In [None]:
def vectorize_query(q):
    """Return a {term: weight} dict for the query string *q*.

    The query is normalized and tokenized with hazm. A term's weight is its
    relative frequency in the query multiplied by the IDF found in the
    module-level ``df_idf`` table; terms absent from that table get weight 0.
    """
    tokens = hzm.word_tokenize(normalizer.normalize(q))

    # Raw occurrence count per term, in order of first appearance.
    occurrences = {}
    for tok in tokens:
        occurrences[tok] = occurrences.get(tok, 0) + 1

    token_count = len(tokens)
    weights = {}
    for term, count in occurrences.items():
        if term in df_idf.index:
            idf = df_idf.loc[term].IDF
        else:
            idf = 0
        weights[term] = (count / token_count) * idf
    return weights

In [None]:
def cosine_similarity(q_vector,doc_vector):
    """Return the cosine similarity of a query vector and a document vector.

    Both arguments map a term to its weight; a term missing from a mapping
    contributes nothing. The dot product is printed on every call.

    Returns the integer 0 when the dot product is zero (this also keeps a
    zero-length vector from reaching the division); otherwise returns
    dot / (|doc_vector| * |q_vector|).
    """
    def euclidean_norm(vector):
        # Square root of the sum of squared weights.
        squares = 0
        for weight in vector.values():
            squares += weight**2
        return math.sqrt(squares)

    # Only terms present in both vectors contribute to the dot product.
    dot_product = 0
    for term, q_weight in q_vector.items():
        if term in doc_vector:
            dot_product += (doc_vector[term]*q_weight)
    print(f'numerator = {dot_product}')

    if dot_product == 0:
        return 0

    return dot_product/(euclidean_norm(doc_vector) * euclidean_norm(q_vector))

In [None]:
# Score every document against every query. For each query text, keep the
# (doc_id, similarity) pairs whose similarity is non-zero, best match first.
queries_result = {}
doc_count = len(df_tf)
for query_text in queries.values():
    q_vector = vectorize_query(query_text)
    print(f'Query is : {query_text}')

    scored = []
    for position, (doc_id, doc_vector) in enumerate(zip(df_tf['ID'], df_tf['WEIGHT'])):
        print(f'{doc_id} is beeing processed, {position}/{doc_count}')
        similarity = cosine_similarity(q_vector, doc_vector)
        print(f'Similarity equals to {similarity}')
        scored.append((doc_id, similarity))

    # Drop documents that share no weighted term with the query, then rank.
    results = sorted(
        (pair for pair in scored if pair[1] != 0),
        key=lambda pair: pair[1],
        reverse=True,
    )
    queries_result[query_text] = results


        

In [None]:
# Write, for every query, its text followed by the id and title of its ten
# best-ranked documents, then a separator line.
# The context manager closes the file even if a write fails, and the encoding
# is explicit because the output contains Persian text.
with open('Queries_results', 'w', encoding='utf-8') as file:
    for key, ranked in queries_result.items():
        file.write(f'کوئری : {key}\n')
        file.write('۱۰ نتیجه اول : \n')
        results = ranked[0:10]
        for result in results:
            file.write(f"({result[0]},{dic[result[0]]['title']})")

        file.write('\n----------------------------------------------------------------------------------------\n')