## Reading From File

In [None]:
import json

In [None]:
def read_json_file(file_path):
    """Load and return the JSON document stored at ``file_path``.

    Failures are reported on stdout rather than raised, so callers must be
    prepared for a ``None`` result.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` when the file is missing, is not
        valid JSON, or any other error occurs while reading it.
    """
    try:
        # Decode explicitly as UTF-8 so non-ASCII text does not depend on the
        # platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in file {file_path}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    return None


In [None]:
# Load the news corpus. read_json_file prints a message and returns None when
# the file cannot be read or parsed, so news_data may be None here.
news_data = read_json_file('./IR_data_news_5k.json')

In [None]:
# Keys of news_data in iteration order, and the 'content' field of every
# record in that same order, so position i in both lists is the same document.
contents_id = list(news_data)
contents = [news_data[doc_key]['content'] for doc_key in contents_id]

## Document Preprocessing

In [None]:
from parsivar import FindStems

In [None]:
import re

class Tokenizer:
    """Whitespace word tokenizer and punctuation-based sentence splitter."""

    # Numbers are swapped for a placeholder while sentences are split, so the
    # '.' inside a decimal such as 3.14 is not treated as a sentence end.
    _NUMBER_PATTERN = r"[-+]?\d*\.\d+|\d+"
    _NUMBER_PLACEHOLDER = 'floatingpointnumber'

    # Sentence-boundary patterns, applied in this order. Each match is
    # rewritten by add_tab into " <punctuation>\t\t".
    _BOUNDARY_PATTERNS = (
        r'([!\.\?؟]+)[\n]*',  # runs of ! . ? ؟ plus any newlines after them
        r':\n',
        r';\n',
        r'؛\n',
        r'[\n]+',              # remaining newline runs
    )

    def tokenize_words(self, doc_string):
        """Split ``doc_string`` on whitespace into word tokens.

        Zero-width non-joiners (U+200C) are stripped from both ends of each
        token; tokens that become empty are dropped.

        Returns:
            list[str]: the tokens in their original order.
        """
        stripped = (piece.strip("\u200c") for piece in doc_string.strip().split())
        return [token for token in stripped if token]

    def tokenize_sentences(self, doc_string):
        """Split ``doc_string`` into sentences.

        Boundaries are runs of ``! . ? ؟``, a ``:``, ``;`` or ``؛`` directly
        followed by a newline, and any remaining newline run. Punctuation is
        kept at the end of its sentence, preceded by a single space.

        Returns:
            list[str]: the non-empty sentences in their original order.
        """
        # Hide the numbers so their '.' cannot be mistaken for a boundary.
        nums_list = re.findall(self._NUMBER_PATTERN, doc_string)
        doc_string = re.sub(self._NUMBER_PATTERN, self._NUMBER_PLACEHOLDER, doc_string)

        for pattern in self._BOUNDARY_PATTERNS:
            doc_string = re.sub(pattern, self.add_tab, doc_string)

        # Put the numbers back in order, in a single pass. A callable
        # replacement inserts each number literally; placeholders left over
        # once the numbers run out (literal occurrences of the placeholder
        # text in the input) are kept unchanged.
        remaining_numbers = iter(nums_list)
        doc_string = re.sub(
            self._NUMBER_PLACEHOLDER,
            lambda match: next(remaining_numbers, match.group()),
            doc_string,
        )

        return [sentence for sentence in doc_string.split('\t\t') if sentence]

    def add_tab(self, mystring):
        """Replacement callback for ``re.sub``: mark a sentence boundary.

        Args:
            mystring: the ``re.Match`` of a boundary pattern.

        Returns:
            str: the matched text with surrounding spaces, then surrounding
            newlines, stripped; prefixed by one space and followed by the
            ``'\\t\\t'`` sentence separator.
        """
        punctuation = mystring.group().strip(' ').strip('\n')
        return " " + punctuation + "\t\t"

In [None]:
import re
from typing import Any, Dict, Iterable, Tuple

def maketrans(a: str, b: str) -> Dict[int, Any]:
    """Build a ``str.translate`` table mapping each character of ``a`` to the
    character at the same position in ``b``.

    Characters are paired with ``zip``, so pairing stops at the end of the
    shorter string.
    """
    table: Dict[int, Any] = {}
    for source_char, target_char in zip(a, b):
        table[ord(source_char)] = target_char
    return table

def regex_replace(patterns: Iterable[Tuple[str, str]], text: str) -> str:
    """Apply a sequence of regex substitutions to ``text``, in order.

    Args:
        patterns: ``(pattern, replacement)`` pairs passed to ``re.sub``. Each
            substitution runs on the output of the previous one, so the order
            of the pairs matters.
        text: The string to transform.

    Returns:
        The text after every substitution has been applied.
    """
    for pattern, repl in patterns:
        text = re.sub(pattern, repl, text)
    return text

In [None]:

def correct_spacing(text):
    """Clean up spaces, newlines and zero-width non-joiners (U+200C) in ``text``.

    The rules are applied one after another through ``regex_replace``, each on
    the output of the previous one.
    """
    spacing_rules = (
        (r" {2,}", " "),             # collapse runs of spaces
        (r"\n{3,}", "\n\n"),         # at most two consecutive newlines
        (r"\u200c{2,}", "\u200c"),   # collapse runs of ZWNJ
        (r"\u200c{1,} ", " "),       # drop ZWNJ directly before a space
        (r" \u200c{1,}", " "),       # drop ZWNJ directly after a space
        (r"\b\u200c*\B", ""),        # drop ZWNJ trailing a word
        (r"\B\u200c*\b", ""),        # drop ZWNJ leading a word
        (r"[ـ\r]", ""),              # drop 'ـ' and carriage returns
    )
    return regex_replace(spacing_rules, text)

class Normalizer():
    """Character-level text normalizer applied before tokenization."""

    def normalize(self, content):
        """Return a normalized copy of ``content``.

        Steps, in order:
          1. map each character of ``translation_src`` to its counterpart in
             ``translation_dst``;
          2. map ASCII and Arabic-Indic digits (and ``%``) to the characters of
             ``number_translation_dst``;
          3. remove the diacritics listed in ``diacritics_patterns``;
          4. fix spacing with ``correct_spacing``;
          5. expand the ligatures listed in ``replacements``;
          6. remove the characters in ``specials_chars_patterns``;
          7. delete every character in ``char_delete`` and in ``alphabet``
             (ASCII letters, ``?``, ``!``, ``.`` and the soft hyphen).

        The order matters: later steps run on the output of earlier ones.
        """
        # Source/target tables for step 1. maketrans pairs them with zip(), so
        # the two strings must stay aligned character by character.
        translation_src = "ؠػػؽؾؿكيٮٯٷٸٹٺٻټٽٿڀځٵٶٷٸٹٺٻټٽٿڀځڂڅڇڈډڊڋڌڍڎڏڐڑڒړڔڕږڗڙښڛڜڝڞڟڠڡڢڣڤڥڦڧڨڪګڬڭڮڰڱڲڳڴڵڶڷڸڹںڻڼڽھڿہۂۃۄۅۆۇۈۉۊۋۏۍێېۑےۓەۮۯۺۻۼۿݐݑݒݓݔݕݖݗݘݙݚݛݜݝݞݟݠݡݢݣݤݥݦݧݨݩݪݫݬݭݮݯݰݱݲݳݴݵݶݷݸݹݺݻݼݽݾݿࢠࢡࢢࢣࢤࢥࢦࢧࢨࢩࢪࢫࢮࢯࢰࢱࢬࢲࢳࢴࢶࢷࢸࢹࢺࢻࢼࢽﭐﭑﭒﭓﭔﭕﭖﭗﭘﭙﭚﭛﭜﭝﭞﭟﭠﭡﭢﭣﭤﭥﭦﭧﭨﭩﭮﭯﭰﭱﭲﭳﭴﭵﭶﭷﭸﭹﭺﭻﭼﭽﭾﭿﮀﮁﮂﮃﮄﮅﮆﮇﮈﮉﮊﮋﮌﮍﮎﮏﮐﮑﮒﮓﮔﮕﮖﮗﮘﮙﮚﮛﮜﮝﮞﮟﮠﮡﮢﮣﮤﮥﮦﮧﮨﮩﮪﮫﮬﮭﮮﮯﮰﮱﺀﺁﺃﺄﺅﺆﺇﺈﺉﺊﺋﺌﺍﺎﺏﺐﺑﺒﺕﺖﺗﺘﺙﺚﺛﺜﺝﺞﺟﺠﺡﺢﺣﺤﺥﺦﺧﺨﺩﺪﺫﺬﺭﺮﺯﺰﺱﺲﺳﺴﺵﺶﺷﺸﺹﺺﺻﺼﺽﺾﺿﻀﻁﻂﻃﻄﻅﻆﻇﻈﻉﻊﻋﻌﻍﻎﻏﻐﻑﻒﻓﻔﻕﻖﻗﻘﻙﻚﻛﻜﻝﻞﻟﻠﻡﻢﻣﻤﻥﻦﻧﻨﻩﻪﻫﻬﻭﻮﻯﻰﻱﻲﻳﻴىكي“” "
        translation_dst = ('یککیییکیبقویتتبتتتبحاوویتتبتتتبحححچدددددددددررررررررسسسصصطعففففففققکککککگگگگگللللنننننهچهههوووووووووییییییهدرشضغهبببببببححددرسعععففکککممنننلررسححسرحاایییووییحسسکببجطفقلمییرودصگویزعکبپتریفقنااببببپپپپببببتتتتتتتتتتتتففففححححححححچچچچچچچچددددددددژژررککککگگگگگگگگگگگگننننننههههههههههییییءاااووااییییااببببتتتتثثثثججججححححخخخخددذذررززسسسسششششصصصصضضضضططططظظظظععععغغغغففففققققککککللللممممننننههههوویییییییکی"" ')
        # NOTE(review): `suffixes` is not referenced anywhere else in this
        # method — presumably left over from a suffix-spacing step; confirm
        # before removing.
        suffixes = {
                    "ی",
                    "ای",
                    "ها",
                    "های",
                    "هایی",
                    "تر",
                    "تری",
                    "ترین",
                    "گر",
                    "گری",
                    "ام",
                    "ات",
                    "اش",
                }
        # Step 5: (regex, expansion) pairs for single-codepoint ligatures.
        replacements = [
                    ("﷽", "بسم الله الرحمن الرحیم"),
                    ("﷼", "ریال"),
                    ("(ﷰ|ﷹ)", "صلی"),
                    ("ﷲ", "الله"),
                    ("ﷳ", "اکبر"),
                    ("ﷴ", "محمد"),
                    ("ﷵ", "صلعم"),
                    ("ﷶ", "رسول"),
                    ("ﷷ", "علیه"),
                    ("ﷸ", "وسلم"),
                    ("ﻵ|ﻶ|ﻷ|ﻸ|ﻹ|ﻺ|ﻻ|ﻼ", "لا"),
                ]
        # Step 2: ASCII digits, '%' and Arabic-Indic digits -> Persian forms.
        number_translation_src = "0123456789%٠١٢٣٤٥٦٧٨٩"
        number_translation_dst = "۰۱۲۳۴۵۶۷۸۹٪۰۱۲۳۴۵۶۷۸۹"
        specials_chars_patterns = [
            # Remove almost all Arabic Unicode superscript and subscript characters in the ranges 0600-06FF, 08A0-08FF, FB50-FDFF and FE70-FEFF
            (
                "[\u0605\u0653\u0654\u0655\u0656\u0657\u0658\u0659\u065a\u065b\u065c\u065d\u065e\u065f\u0670\u0610\u0611\u0612\u0613\u0614\u0615\u0616\u0618\u0619\u061a\u061e\u06d4\u06d6\u06d7\u06d8\u06d9\u06da\u06db\u06dc\u06dd\u06de\u06df\u06e0\u06e1\u06e2\u06e3\u06e4\u06e5\u06e6\u06e7\u06e8\u06e9\u06ea\u06eb\u06ec\u06ed\u06fd\u06fe\u08ad\u08d4\u08d5\u08d6\u08d7\u08d8\u08d9\u08da\u08db\u08dc\u08dd\u08de\u08df\u08e0\u08e1\u08e2\u08e3\u08e4\u08e5\u08e6\u08e7\u08e8\u08e9\u08ea\u08eb\u08ec\u08ed\u08ee\u08ef\u08f0\u08f1\u08f2\u08f3\u08f4\u08f5\u08f6\u08f7\u08f8\u08f9\u08fa\u08fb\u08fc\u08fd\u08fe\u08ff\ufbb2\ufbb3\ufbb4\ufbb5\ufbb6\ufbb7\ufbb8\ufbb9\ufbba\ufbbb\ufbbc\ufbbd\ufbbe\ufbbf\ufbc0\ufbc1\ufc5e\ufc5f\ufc60\ufc61\ufc62\ufc63\ufcf2\ufcf3\ufcf4\ufd3e\ufd3f\ufe70\ufe71\ufe72\ufe76\ufe77\ufe78\ufe79\ufe7a\ufe7b\ufe7c\ufe7d\ufe7e\ufe7f\ufdfa\ufdfb]",
                "",
            ),
        ]
        text = content
        # Step 1: letter-variant translation.
        translations = maketrans(translation_src, translation_dst)
        text = text.translate(translations)
        # Step 2: digit and percent-sign translation.
        translations = maketrans(
            number_translation_src,
            number_translation_dst,
        )
        text = text.translate(translations)
        # Step 3: strip diacritics.
        diacritics_patterns = [
            # remove FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SHADDA, SUKUN
            ("[\u064b\u064c\u064d\u064e\u064f\u0650\u0651\u0652]", ""),
        ]
        text = regex_replace(diacritics_patterns, text)
        # Step 4: whitespace / ZWNJ clean-up.
        text = correct_spacing(text)
        # Step 5: ligature expansion.
        for old, new in replacements:
                text = re.sub(old, new, text)
        # Step 6: remove the special characters listed above.
        text = regex_replace(specials_chars_patterns, text)
        # Step 7: delete punctuation/symbols, then ASCII letters and the soft
        # hyphen. '%' was already turned into '٪' in step 2, so the '%' entry
        # below only matters for input that bypasses that step.
        char_delete = ['،', '.', ')', '(', '}', '{', '«', '»', '؛', ':', '؟', '>', '<', '|', '+', '-', '*', '^', '%', '#', '=', '_', '/', '«', '»', '$', '[', ']', '&', "❊", '«', '»', '"', '!', "'"]
        alphabet = 'QWERTYUIOPASDFGHJKLZXCVBNMqwertyuiopasdfghjklzxcvbnm?!.\xad'
        for c in char_delete:
            text = text.replace(c, "")
        for c in alphabet:
            text = text.replace(c, "")
        return text

In [None]:
from collections import Counter

def get_top_tokens(contents, num):
    """Print and return the ``num`` most frequent tokens across all documents.

    Args:
        contents: Iterable of token lists, one per document.
        num: How many of the most frequent tokens to report.

    Returns:
        list: the tokens ordered from most to least frequent (at most ``num``).
    """
    # Count and rank once; the same ranking is used for printing and for the
    # returned list.
    token_counts = Counter(token for content in contents for token in content)
    top_tokens = token_counts.most_common(num)

    for token, count in top_tokens:
        print(f"{token}: {count} occurrences")

    return [token for token, _ in top_tokens]

def get_tokens(content, normalizer, tokenizer, stemmer, stopwords):
    """Turn one raw document into its list of stemmed, stop-word-free tokens.

    Args:
        content: Raw document text.
        normalizer: Object exposing ``normalize(text) -> str``.
        tokenizer: Object exposing ``tokenize_words(text) -> list``.
        stemmer: Object exposing ``convert_to_stem(token)``.
        stopwords: Collection of tokens to drop. Membership is checked on the
            normalized token *before* stemming.

    Returns:
        list: stems of the remaining tokens, in document order.
    """
    normalized_content = normalizer.normalize(content)
    content_tokens = tokenizer.tokenize_words(normalized_content)

    # Hash-based lookup instead of scanning a list for every token.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = set(stopwords)

    return [
        stemmer.convert_to_stem(token)
        for token in content_tokens
        if token not in stopwords
    ]

def preprocess(contents, normalizer, tokenizer, stemmer, stopwords, remove_top_n=50):
    """Tokenize every document and drop the corpus-wide most frequent tokens.

    Args:
        contents: Iterable of raw document strings.
        normalizer, tokenizer, stemmer, stopwords: Passed through to
            ``get_tokens`` for every document.
        remove_top_n: Number of most frequent tokens (counted over the whole
            corpus after stop-word removal and stemming) to remove.

    Returns:
        list[list]: one token list per document, in input order.
    """
    preprocessed_docs = [
        get_tokens(content, normalizer, tokenizer, stemmer, stopwords)
        for content in contents
    ]

    # get_top_tokens also prints the tokens it selects. A set makes the
    # per-token filter below a hash lookup instead of a list scan.
    top_tokens = set(get_top_tokens(preprocessed_docs, remove_top_n))

    return [
        [token for token in tokens if token not in top_tokens]
        for tokens in preprocessed_docs
    ]

In [None]:
# Stop words dropped by get_tokens before stemming.
# NOTE(review): entries containing '_' (e.g. 'شده_است') cannot match tokens that
# went through Normalizer.normalize, because that method deletes '_' —
# presumably these come from a tokenizer that joins multi-word verbs; confirm.
stopwords = [
    'و', 'در', 'به', 'از', 'که', 'این', 'را', 'با', 'است', 'برای',
    'آن', 'یک', 'خود', 'تا', 'کرد', 'بر', 'هم', 'نیز', 'گفت', 'می‌شود',
    'وی', 'شد', 'دارد', 'ما', 'اما', 'یا', 'شده', 'باید', 'هر', 'آنها',
    'بود', 'او', 'دیگر', 'دو', 'مورد', 'می‌کند', 'شود', 'کند', 'وجود',
    'بین', 'پیش', 'شده_است', 'پس', 'نظر', 'اگر', 'همه', 'یکی', 'حال',
    'هستند', 'من', 'کنند', 'نیست', 'باشد', 'چه', 'بی', 'می', 'بخش',
    'می‌کنند', 'همین', 'افزود', 'هایی', 'دارند', 'راه', 'همچنین', 'روی',
    'داد', 'بیشتر', 'بسیار', 'سه', 'داشت', 'چند', 'سوی', 'تنها', 'هیچ',
    'میان', 'اینکه', 'شدن', 'بعد', 'جدید', 'ولی', 'حتی', 'کردن', 'برخی',
    'کردند', 'می‌دهد', 'اول', 'نه', 'کرده_است', 'نسبت', 'بیش', 'شما',
    'چنین', 'طور', 'افراد', 'تمام', 'درباره', 'بار', 'بسیاری', 'می‌تواند',
    'کرده', 'چون', 'ندارد', 'دوم', 'بزرگ', 'طی', 'حدود', 'همان', 'بدون',
    'البته', 'آنان', 'می‌گوید', 'دیگری', 'خواهد_شد', 'کنیم', 'قابل',
    'یعنی', 'رشد', 'می‌توان', 'وارد', 'کل', 'ویژه', 'قبل', 'براساس', 'نیاز',
    'گذاری', 'هنوز', 'لازم', 'سازی', 'بوده_است', 'چرا', 'می‌شوند', 'وقتی',
    'گرفت', 'کم', 'جای', 'حالی', 'تغییر', 'پیدا', 'اکنون', 'تحت', 'باعث',
    'مدت', 'فقط', 'زیادی', 'تعداد', 'آیا', 'بیان', 'رو', 'شدند', 'عدم',
    'کرده_اند', 'بودن', 'نوع', 'بلکه', 'جاری', 'دهد', 'برابر', 'مهم', 'بوده',
    'اخیر', 'مربوط', 'امر', 'زیر', 'گیری', 'شاید', 'خصوص', 'آقای', 'اثر',
    'کننده', 'بودند', 'فکر', 'کنار', 'اولین', 'سوم', 'سایر', 'کنید', 'ضمن',
    'مانند', 'باز', 'می‌گیرد', 'ممکن', 'حل', 'دارای', 'پی', 'مثل', 'می‌رسد',
    'اجرا', 'دور', 'منظور', 'کسی', 'موجب', 'طول', 'امکان', 'آنچه', 'تعیین',
    'گفته', 'شوند', 'جمع', 'خیلی', 'علاوه', 'گونه', 'تاکنون', 'رسید', 'ساله',
    'گرفته', 'شده_اند', 'علت', 'چهار', 'داشته_باشد', 'خواهد_بود', 'طرف', 'تهیه',
    'تبدیل', 'مناسب', 'زیرا', 'مشخص', 'می‌توانند', 'نزدیک', 'جریان', 'روند',
    'بنابراین', 'می‌دهند', 'یافت', 'نخستین', 'بالا', 'پنج', 'ریزی', 'عالی',
    'چیزی', 'نخست', 'بیشتری', 'ترتیب', 'شده_بود', 'خاص', 'خوبی', 'خوب',
    'شروع', 'فرد', 'کامل', 'غیر', 'می‌رود', 'دهند', 'آخرین', 'دادن', 'جدی',
    'بهترین', 'شامل', 'گیرد', 'بخشی', 'باشند', 'تمامی', 'بهتر', 'داده_است',
    'حد', 'نبود', 'کسانی', 'می‌کرد', 'داریم', 'علیه', 'می‌باشد', 'دانست',
    'ناشی', 'داشتند', 'دهه', 'می‌شد', 'ایشان', 'آنجا', "'", '@', '±','Ø','é','ú','ءامنوا','آارء','آباداناستان','آبادانشمس','آبادسازی','آبادصنعت','آبادمغان','آباد۳۴','آباندر','آبانماه', 'آباکاروف','آببا','آبخواه','آبدی','آبدیدگی','آبرامز','آبرودار','آبرومندی']
# stopwords = []

In [None]:
# Run the full preprocessing pipeline over the corpus; remove_top_n keeps its
# default of 50, so the 50 most frequent tokens are dropped as well.
preprocessed_docs = preprocess(contents, Normalizer(), Tokenizer(), FindStems(), stopwords)

In [None]:
# Sanity check: show the token list produced for the first document.
print(preprocessed_docs[0])

## Positional Indexing

In [None]:
from collections import defaultdict

In [None]:
class Term:
    """Postings of a single term in the positional inverted index."""

    def __init__(self):
        # Occurrences of the term over all documents.
        self.total_frequency = 0
        # doc_id -> positions of the term in that document, in insertion order.
        self.positions = defaultdict(list)
        # doc_id -> number of occurrences of the term in that document.
        self.frequency = defaultdict(int)

    def update_posting(self, doc_id, term_pos):
        """Record one occurrence of the term at ``term_pos`` in ``doc_id``."""
        self.total_frequency += 1
        self.frequency[doc_id] += 1
        self.positions[doc_id].append(term_pos)

In [None]:
def positional_indexing(preprocessed_docs):
    """Build a positional inverted index over the tokenized documents.

    Args:
        preprocessed_docs: Sequence of token lists. The position of a document
            in this sequence is used as its doc_id.

    Returns:
        dict: term -> ``Term`` holding, per doc_id, the positions and the
        count of that term.
    """
    positional_inverted_indexing = {}

    for doc_id, terms in enumerate(preprocessed_docs):
        for pos, term in enumerate(terms):
            term_obj = positional_inverted_indexing.get(term)
            if term_obj is None:
                # Create the posting object only the first time a term is
                # seen, rather than building a throwaway Term for every token.
                term_obj = positional_inverted_indexing[term] = Term()
            term_obj.update_posting(doc_id, pos)

    return positional_inverted_indexing

In [None]:
# term -> Term postings; doc ids are positions in preprocessed_docs.
positional_index = positional_indexing(preprocessed_docs)

In [None]:
# (document frequency, term) pairs sorted ascending, so the rarest terms come
# first and the most widespread ones last.
dict_list = sorted(
    (len(positional_index[term].frequency), term) for term in positional_index
)

# Three terms occurring in the most documents.
last_index = len(positional_index) - 1
print(dict_list[last_index], dict_list[last_index - 1], dict_list[last_index - 2])

# Three terms occurring in the fewest documents, with their per-document counts.
for rank in range(3):
    df_and_term = dict_list[rank]
    print(df_and_term, ":", positional_index[df_and_term[1]].frequency)

## Vectorizing

In [None]:
import math
import numpy as np
from collections import Counter
from scipy.sparse import csr_matrix

In [None]:
def calculate_tf(term_frequency):
    """Return the log-scaled term frequency ``1 + ln(tf)``, or 0 when ``tf`` is not positive."""
    if not term_frequency > 0:
        return 0
    return 1 + math.log(term_frequency)

def calculate_idf(len_documents, document_count):
    """Return ``ln(N / (1 + df))`` for a corpus of ``len_documents`` documents.

    The ``+ 1`` in the denominator means the result is 0 when the term occurs
    in ``N - 1`` documents and negative when it occurs in all ``N``.
    """
    smoothed_document_count = 1 + document_count
    ratio = len_documents / smoothed_document_count
    return math.log(ratio)

def tfidf_vectorizer(documents):
    """Build a sparse tf-idf matrix with one row per document.

    Reads the module-level ``unique_terms_dic`` (term -> column),
    ``idf_values`` (term -> idf) and ``unique_terms`` (for the column count).
    Terms missing from ``unique_terms_dic`` are ignored and zero weights are
    not stored.

    Args:
        documents: Sequence of token lists.

    Returns:
        scipy.sparse.csr_matrix of shape ``(len(documents), len(unique_terms))``.
    """
    row_ids, col_ids, weights = [], [], []

    for doc_id, document in enumerate(documents):
        for term, freq in Counter(document).items():
            column = unique_terms_dic.get(term)
            if column is None:
                # Out-of-vocabulary term (e.g. a query word never seen in the corpus).
                continue
            weight = calculate_tf(freq) * idf_values[term]
            if weight == 0:
                continue
            row_ids.append(doc_id)
            col_ids.append(column)
            weights.append(weight)

    matrix_shape = (len(documents), len(unique_terms))
    return csr_matrix((weights, (row_ids, col_ids)), shape=matrix_shape)

In [None]:
# Globals consumed by tfidf_vectorizer: corpus size, sorted vocabulary,
# term -> column index, and term -> idf (document frequency taken from the
# positional index).
len_documents = len(preprocessed_docs)
unique_terms = sorted({token for tokens in preprocessed_docs for token in tokens})
unique_terms_dic = dict(zip(unique_terms, range(len(unique_terms))))
idf_values = {
    vocab_term: calculate_idf(len_documents, len(positional_index[vocab_term].frequency))
    for vocab_term in unique_terms
}
tfidf_matrix = tfidf_vectorizer(preprocessed_docs)

In [None]:
print("length of tf_idf vectors:", len(unique_terms))

# Corpus-wide idf extremes. max()/min() scan the whole vocabulary, so the
# result does not depend on a hard-coded seed term being present in
# idf_values. Ties resolve to the first term in iteration order.
if idf_values:
    term_max_idf = max(idf_values, key=idf_values.get)
    print("term with max idf :", term_max_idf, "and idf value is :", idf_values[term_max_idf])
    term_min_idf = min(idf_values, key=idf_values.get)
    print("term with min idf :", term_min_idf, "and idf value is :", idf_values[term_min_idf])

# tf-idf extremes inside the first document. Weights are computed exactly as
# in tfidf_vectorizer: out-of-vocabulary terms and zero weights are skipped.
first_doc_weights = {}
for term, freq in Counter(preprocessed_docs[0]).items():
    if term not in unique_terms_dic:
        continue
    tfidf_weight = calculate_tf(freq) * idf_values[term]
    if tfidf_weight != 0:
        first_doc_weights[term] = tfidf_weight

# Guarded so an empty first document prints nothing instead of raising, and
# labelled as tf-idf because the terms are selected by tf-idf weight.
if first_doc_weights:
    term_max_tfidf = max(first_doc_weights, key=first_doc_weights.get)
    term_min_tfidf = min(first_doc_weights, key=first_doc_weights.get)
    print(f"for document {contents_id[0]} term with max tf-idf :", term_max_tfidf, "and tf-idf value is :", first_doc_weights[term_max_tfidf])
    print(f"for document {contents_id[0]} term with min tf-idf :", term_min_tfidf, "and tf-idf value is :", first_doc_weights[term_min_tfidf])

In [None]:
# Dump the whole sorted vocabulary (one entry per tf-idf matrix column).
print(unique_terms)

## Create Champion List

In [None]:
import heapq


def create_champion_lists(matrix, champion_list_size):
    """Build a champion list for every term (column) of a tf-idf matrix.

    Args:
        matrix: SciPy sparse documents-by-terms matrix of tf-idf weights.
        champion_list_size: Maximum number of documents kept per term.

    Returns:
        dict: column index -> list of ``(weight, doc_id)`` tuples sorted from
        largest to smallest (ties broken by larger doc_id), holding at most
        ``champion_list_size`` entries. Columns with no non-zero weight map
        to an empty list.
    """
    # Column-compressed storage makes every term's postings a contiguous
    # slice, instead of slicing a row-compressed matrix once per term.
    columns = matrix.tocsc()
    indptr, doc_ids, weights = columns.indptr, columns.indices, columns.data

    champion_lists = {}
    for term_index in range(columns.shape[1]):
        start, end = indptr[term_index], indptr[term_index + 1]
        postings = [
            (weight, doc_id)
            for weight, doc_id in zip(weights[start:end], doc_ids[start:end])
            if weight != 0  # ignore explicitly stored zeros, like nonzero() does
        ]
        champion_lists[term_index] = heapq.nlargest(champion_list_size, postings)

    return champion_lists

In [None]:
# Keep the 5 highest-weighted documents per term.
champion_list_size = 5

# Module-level table read by calculate_cosine_similarities_with_champions_list.
champion_lists = create_champion_lists(tfidf_matrix, champion_list_size)

## Answer Queries

In [None]:
def cosine_similarity(vector_a, vector_b):
    """Return the cosine of the angle between two sparse vectors.

    Both arguments must support ``multiply`` and ``toarray`` (SciPy sparse
    matrices). The result is 0 when either vector has zero length.
    """
    dot_product = vector_a.multiply(vector_b).sum()

    # Norms are taken on the densified vectors.
    length_a = np.linalg.norm(vector_a.toarray())
    length_b = np.linalg.norm(vector_b.toarray())

    if not (length_a > 0 and length_b > 0):
        return 0
    return dot_product / (length_a * length_b)

def calculate_cosine_similarities_with_index_elimination(query_vector, document_vectors, k):
    """Rank documents by cosine similarity to the query and keep the best ``k``.

    Every row of ``document_vectors`` is scored with ``cosine_similarity``;
    only rows with a similarity above 0 are kept as candidates.

    Returns:
        tuple: ``(doc_ids, scores)`` where ``doc_ids`` lists the top ``k`` row
        indices from most to least similar and ``scores`` maps each of them
        to its similarity.
    """
    scores = {}
    for doc_id, document_vector in enumerate(document_vectors):
        score = cosine_similarity(query_vector, document_vector)
        if score > 0:
            scores[doc_id] = score

    ranked_doc_ids = heapq.nlargest(k, scores, key=scores.get)
    ranked_scores = {doc_id: scores[doc_id] for doc_id in ranked_doc_ids}
    return ranked_doc_ids, ranked_scores

def calculate_cosine_similarities_with_champions_list(query_vector, k, champions=None):
    """Score only the documents found in the champion lists of the query terms.

    The score of a document is the sum, over the query terms whose champion
    list contains it, of ``query weight * document weight``. It is an
    unnormalized dot product, not a length-normalized cosine.

    Args:
        query_vector: 1-row sparse tf-idf vector of the query.
        k: Number of documents to return.
        champions: Mapping term index -> list of ``(weight, doc_id)``.
            Defaults to the module-level ``champion_lists``.

    Returns:
        tuple: ``(doc_ids, scores)`` where ``doc_ids`` lists the top ``k``
        documents by descending score and ``scores`` maps each to its score.
    """
    if champions is None:
        champions = champion_lists

    similarities_dict = {}
    query_row = query_vector.getrow(0)
    # The query weight is looked up per *term*. Indexing the query vector by
    # doc_id reads an unrelated column, or fails when doc_id exceeds the
    # vocabulary size.
    for term_index, query_weight in zip(query_row.indices, query_row.data):
        for doc_weight, doc_id in champions.get(term_index, ()):
            similarities_dict[doc_id] = similarities_dict.get(doc_id, 0) + query_weight * doc_weight

    top_k_documents = heapq.nlargest(k, similarities_dict, key=similarities_dict.get)
    output = {doc_id: similarities_dict[doc_id] for doc_id in top_k_documents}
    return top_k_documents, output

In [None]:
def calculate_cosine_similarities_docs(query_vector, query_id, document_vectors, k):
    """Find the ``k`` documents most similar to another document of the corpus.

    Args:
        query_vector: tf-idf row of the document used as the query.
        query_id: Row index of that document; it is excluded from the results.
        document_vectors: Matrix whose rows are the candidate documents.
        k: Number of documents to return.

    Returns:
        dict: row index -> cosine similarity for the top ``k`` rows, ordered
        from most to least similar. Rows with similarity 0 are never included.
    """
    scores = {}
    for doc_id, document_vector in enumerate(document_vectors):
        if doc_id != query_id:
            score = cosine_similarity(query_vector, document_vector)
            if score > 0:
                scores[doc_id] = score

    best_doc_ids = heapq.nlargest(k, scores, key=scores.get)
    return {doc_id: scores[doc_id] for doc_id in best_doc_ids}

# Document most similar to document 0. The hit is read from the result rather
# than from a hard-coded row index, so this still prints the right document
# when the corpus or the preprocessing changes.
most_similar_docs = calculate_cosine_similarities_docs(tfidf_matrix[0], 0, tfidf_matrix, k=1)
print(most_similar_docs)
for similar_doc_id in most_similar_docs:
    print(contents[similar_doc_id], contents_id[similar_doc_id])

In [None]:
# Free-text query, pushed through the same preprocessing and tf-idf
# vectorization as the documents.
user_query = "نوواک جوکوویچ مسابقات گرند پری استرالیا"
print(user_query)

# tfidf_vectorizer returns one row per input document; [0] is the query row.
query_vector = tfidf_vectorizer([get_tokens(user_query, Normalizer(), Tokenizer(), FindStems(), stopwords)])[0]
# Ranking 1: cosine similarity against every document vector.
top_k_documents, values = calculate_cosine_similarities_with_index_elimination(query_vector, tfidf_matrix, k=5)
for rank, doc_id in enumerate(top_k_documents, start=1):
    # "Content" prints the whole news_data record of the document, not only its 'content' field.
    print(f"Rank {rank}: Document {contents_id[doc_id]}  \nContent: {news_data[contents_id[doc_id]]}\n")
# Ranking 2: only documents from the champion lists of the query terms.
top_k_documents, values = calculate_cosine_similarities_with_champions_list(query_vector, k=5)
for rank, doc_id in enumerate(top_k_documents, start=1):
    print(f"Rank {rank}: Document {contents_id[doc_id]} \nContent: {news_data[contents_id[doc_id]]}\n")

In [None]:
import pickle
 
def storeData():
    """Pickle the module-level positional index into the file ``./db``.

    The file is overwritten on every call. In append mode each call adds
    another pickle to the end of the file, while ``pickle.load`` only reads
    the first one, so a later load would keep returning the oldest index.
    """
    db = {'Positional_Index': positional_index}

    with open('./db', 'wb') as dbfile:
        # Serialize and store the data using pickle
        pickle.dump(db, dbfile)
 
def loadData():
    """Read the pickled database from ``./db`` and print each entry."""
    with open('./db', 'rb') as dbfile:
        # NOTE: unpickling can execute arbitrary code; only load a './db'
        # file that this notebook wrote itself.
        db = pickle.load(dbfile)
    for key in db:
        print(key, '=>', db[key])
 
# Round-trip check: persist the positional index, then read it back and print it.
storeData()
loadData()