In [1]:
import pandas as pd
import numpy as np
import re
from re import search
import codecs
import sys

In [345]:
stop_words = [
    "از",
    "این",
    "آن",
    "به",
    "با",
    "بر",
    "برای",
    "پس",
    "تا",
    "در",
    "را",
    "که",
    "و",
]

prefixes = [
    "با",
    "بی",
    "نا"
]

postfixes = [
    "تر",
    "ترین",
    "ات",
    "ها"
]

verb_roots = [
    "گفت",
    "رفت",
    "شد"
]

common_words = [
    "میدان",
    "دانش",
    "رفتار"
]

plural_singular = [
    ["منابع", "منبع"],
    ["مراجع", "مرجع"],
    ["اخبار", "خبر"]
]


def read_dataset(path, name):
    df = pd.read_excel(path + name)
    return df

def delete_punctuations(doc):
    punctuations = '،:؛؟!»«()[]"*,{.}@!?'
    edited_doc = doc.translate(str.maketrans('', '', punctuations))
    return edited_doc

def delete_stopWords(doc):
    edited_doc = doc
    for s in stop_words:
        my_regex = r"\b"+s+r"\b"
        edited_doc = re.sub(my_regex , "", edited_doc)
    return edited_doc

def delete_highFrequencyWords(inverted_indexes):
    df_temp = df.copy()
    for i in inverted_indexes:
        if len(i[1])/len(df_temp) > 0.5:
            inverted_indexes.remove(i)
    return inverted_indexes

def delete_postfixes(doc):
    edited_doc = doc
    for p in postfixes:
        my_regex = p + r"\b"
        edited_doc = re.sub(my_regex , "", edited_doc)
    return edited_doc

def delete_prefixes(doc):
    edited_doc = doc
    for p in postfixes:
        my_regex = r"\b" + p
        edited_doc = re.sub(my_regex , "", edited_doc)
    return edited_doc

def replaceWithRoot(tokens):   
    for i in range(0, len(tokens)):
        for root in verb_roots:
            if search(root, tokens[i]):
                for c in common_words:
                    if c != tokens[i]:
                        tokens[i] = tokens[i].replace(tokens[i], root)
                    else:
                        print("Common")

    return tokens

def replaceArabicWords(doc):
    doc = doc.replace('ك', 'ک')
    doc = doc.replace('ئ', 'ی')
    doc = doc.replace('ي', 'ی')
    doc = doc.replace('ؤ', 'و')
    doc = doc.replace('هٔ', 'ه')
    doc = doc.replace('ة', 'ه')
    doc = doc.replace('آ', 'ا')
    doc = doc.replace('أ', 'ا')
    doc = doc.replace('إ', 'ا')
    return doc

def pluralToSingular(tokens):
    for i in range(0, len(tokens)):
        for ps in plural_singular:
            if search(ps[0], tokens[i]):
                tokens[i] = tokens[i].replace(tokens[i], ps[1])
    return tokens

def tokenize(df):
    content = df.content

    tokens = []
    for i in range(0, content.size):
        doc = content[i]
        # 68000 tokens
        doc = delete_punctuations(doc)
        # 50000 tokens
        doc = delete_stopWords(doc)

        doc = delete_postfixes(doc)

        doc = delete_prefixes(doc)

        doc = replaceArabicWords(doc)
        
        tokenized_doc = doc.split()
        tokenized_doc = replaceWithRoot(tokenized_doc)
        tokenized_doc = pluralToSingular(tokenized_doc)

        for token in tokenized_doc:
            temp = []
            temp.append(token)
            temp.append(df.id[i])
            tokens.append(temp)
    tokens.sort()
    return tokens

def create_inverted_indexes(tokens):
    inverted_indexes = []
    doc_temp = []
    token_temp = ""
    for token in tokens:
        if token[0] == token_temp:
            doc_temp.append(token[1])
        else:
            temp = []
            temp.append(token_temp)
            doc_temp = set(doc_temp)
            temp.append(doc_temp)
            inverted_indexes.append(temp)
            doc_temp = []

        token_temp = token[0]
        # if len(doc_temp) > 0 and token[1] != doc_temp[-1]:
        doc_temp.append(token[1])
    
    inverted_indexes = delete_highFrequencyWords(inverted_indexes)
    return inverted_indexes

def search_token(inverted_indexes, token_name):
    for token in inverted_indexes:
        if token[0] == token_name:
            return token[1]
    print("this token not exist in our database")

def query_processing(df, inverted_indexes, query):
    res = []
    docs_id = pd.DataFrame()


    query = delete_punctuations(query)
    query = delete_stopWords(query)
    query = delete_postfixes(query)
    query = delete_prefixes(query)
    query = replaceArabicWords(query)

    tokenized_query = query.split()
    tokenized_query = replaceWithRoot(tokenized_query)
    tokenized_query = pluralToSingular(tokenized_query)


    for i in range (0 ,len(tokenized_query)):
        temp_docs_id = pd.DataFrame(search_token(inverted_indexes, tokenized_query[i]))
        docs_id = pd.DataFrame(docs_id.append(temp_docs_id))

    docs_id = docs_id[0].value_counts().reset_index()
    docs_id.columns = ['id', 'rank']
    print_res(df, docs_id)
    
def print_res(df, res):
    p_res = pd.DataFrame()

    p_res = pd.merge(df, res, on=['id'], how='inner')
    our_res = p_res[['id', 'rank', 'url']]
    our_res = our_res.sort_values(["rank"], ascending=False)

    print(our_res)
    


    


In [3]:
df = read_dataset("datasets/", "IR_Spring2021_ph12_7k.xlsx")


In [4]:
tokens = tokenize(df)
inverted_indexes = create_inverted_indexes(tokens)


In [6]:
print(len(search_token(inverted_indexes, "خبر")))


1711466
47433


In [5]:
print(len(tokens))
print(len(inverted_indexes))

# for i in range(5000, 5100):
#     print(inverted_indexes[i][0])
#     print(inverted_indexes[i][1])

872


In [349]:
query = "بازیکن فرانسوی به تیم کهکشانی مادرید"
query_processing(df, inverted_indexes, query)


        id  rank                                                url
118    220     5  https://www.isna.ir/news/99040302540/۴۸-سالگی-...
831   1363     3  https://www.isna.ir/news/98070302858/اتلتیکو-ب...
1076  1697     3  https://www.isna.ir/news/98121713587/توقف-اتلت...
738   1202     3  https://www.isna.ir/news/98041306951/خداحافظی-...
987   1580     2  https://www.isna.ir/news/98102620428/میشو-تصمی...
...    ...   ...                                                ...
480    796     1  https://www.isna.ir/news/99100604356/صحبت-های-...
481    797     1  https://www.isna.ir/news/99100604215/انتقاد-حس...
482    799     1  https://www.isna.ir/news/99100704952/پیکر-دکتر...
484    801     1  https://www.isna.ir/news/99100805979/استقلال-و...
1249  6990     1  https://www.isna.ir/news/98091611102/هرخانه-یک...

[1250 rows x 3 columns]
