<div class="alert alert-block alert-success">
    <h1 align="center">Information Retrieval Systems</h1>
    <h2 align="center">Simple search engine project</h2>
    <h3 align="center">Phase 2</h3>
</div>

## Loading Libraries

In [18]:
from parsivar import Normalizer, Tokenizer, FindStems
from hazm import stopwords_list
import pandas as pd
import pickle
import math

## Loading raw and processed Data
- Load the JSON file as a DataFrame and transpose it (so that each row is one document)
- Load the preprocessed content
- Load the inverted index
- Create the dictionary (list of terms) from the inverted index

In [16]:
# Read the raw news collection and flip it in one step so rows are documents.
df = pd.read_json('IR_data_news_12k.json').transpose()

In [6]:
def load_file(filename):
    """Unpickle and return the object stored in ``filename``.

    NOTE: ``pickle.load`` can execute arbitrary code, so only use this on
    files that were produced by a trusted source.
    """
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

In [7]:
# Unpickle the preprocessed documents; later cells index this object by doc id
# and iterate over the tokens of each entry.
preprocessed_content = load_file('./Processed data/preprocessed_content.pkl')

In [8]:
# Unpickle the inverted index; later cells read the 'doc_frequency',
# 'posting_list' and 'champion' fields of each term's entry.
inverted_index = load_file('./Processed data/inverted_index.pkl')

In [9]:
# The dictionary is simply the list of all terms that have an index entry.
dictionary = [*inverted_index.keys()]

# Step1)
## Modeling documents in vector space
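- Here we define a function that calculates the tf-idf score of a term in a document. The formula is given below (the term frequency part is taken as $0$ when the term does not occur in the document, i.e. $f_{t,d} = 0$):
$$tfidf(t, d, D) = tf(t, d) \times idf(t, D) = (1 + \log_{10} f_{t, d}) \times \log_{10}\left(\frac{N}{n_t}\right)$$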

In [10]:
def tfidf(term, doc, doc_collection, total_number_of_docs):
    """Compute the tf-idf weight of ``term`` in ``doc``.

    The weight is ``(1 + log10(f)) * log10(N / n_t)`` where ``f`` is the number
    of occurrences of ``term`` in ``doc``, ``N`` is ``total_number_of_docs`` and
    ``n_t`` is the term's document frequency taken from ``doc_collection``.

    Args:
        term: The term to weight.
        doc: Any object with a ``count`` method. For a list of tokens this is
            an exact token count; for a ``str`` it is a substring count, so a
            token list is the safer input.
        doc_collection: Mapping ``term -> {'doc_frequency': n_t, ...}``.
        total_number_of_docs: Number of documents in the collection (``N``).

    Returns:
        The tf-idf weight. It is ``0.0`` when the term is unknown to
        ``doc_collection``, when its document frequency or ``N`` is not
        positive, or when the term does not occur in ``doc``.
    """
    # An out-of-vocabulary term (e.g. an unseen query word) carries no weight;
    # returning 0 here avoids a KeyError in the lookup below.
    if term not in doc_collection:
        return 0.0

    doc_frequency = doc_collection[term]['doc_frequency']
    # Guard against division by zero / log of a non-positive number.
    if doc_frequency <= 0 or total_number_of_docs <= 0:
        return 0.0

    term_count = doc.count(term)
    if term_count == 0:
        return 0.0

    # idf depends only on the collection, tf only on this document.
    idf = math.log10(total_number_of_docs / doc_frequency)
    tf = 1 + math.log10(term_count)
    return tf * idf

In [19]:
# Sanity check: tf-idf of one term in the document at position 3.
# NOTE(review): this passes the raw 'content' field rather than
# preprocessed_content[3]; if that field is a plain string, tfidf counts
# substring matches instead of whole tokens -- confirm this is intended.
print(tfidf('خبرگزاری',df.iloc[3]['content'], inverted_index , len(df)))

0.007249774608743844


# Step2)
## Answering the queries in the vector space  

- Here we define a function that calculates the cosine similarity between a query and the documents. We also define a function that computes the length of a document vector.
- For this part we follow the pseudocode from the course slides.


In [20]:
def get_length(doc_id):
    """Return the SQUARED Euclidean length of a document's tf-idf vector.

    Despite the name, the value is the sum of squared tf-idf weights (no
    square root is taken); callers are expected to apply ``math.sqrt``.

    Each distinct term contributes exactly once. Iterating over every token
    occurrence would add the squared weight of a term that occurs ``c`` times
    ``c`` times over and inflate the norm.

    Args:
        doc_id: Key/index of the document in ``preprocessed_content``.

    Returns:
        Sum over the distinct terms of the document of ``tfidf(term) ** 2``;
        ``0`` for a document without tokens.
    """
    tokens = preprocessed_content[doc_id]
    total_docs = len(df)  # loop-invariant, so computed once
    squared_length = 0
    # dict.fromkeys de-duplicates while keeping first-seen order, which keeps
    # the floating-point summation order deterministic between runs.
    for term in dict.fromkeys(tokens):
        weight = tfidf(term=term, doc=tokens, doc_collection=inverted_index, total_number_of_docs=total_docs)
        squared_length += weight ** 2
    return squared_length

In [21]:
def cosine(query, k, inverted_index, is_champion=True):
    """Rank documents against a query by (document-normalised) cosine score.

    For every distinct query term the postings are walked and the product of
    the term's tf-idf weight in the query and in the document is accumulated
    per document. Each accumulated score is then divided by the length of the
    document vector and the ``k`` best documents are returned.

    Args:
        query: Sequence of query tokens (already preprocessed).
        k: Maximum number of results to return; values below 1 give ``[]``.
        inverted_index: Mapping ``term -> entry`` where the entry holds
            ``'doc_frequency'`` plus ``'champion'`` (postings whose first
            element is the doc id) and ``'posting_list'`` (postings that are
            dicts keyed by doc id).
        is_champion: Use the champion list of each term instead of its full
            posting list.

    Returns:
        A list of ``(doc_id, score)`` tuples sorted by descending score, at
        most ``k`` long. Ties keep the order in which documents were first
        encountered.
    """
    total_docs = len(df)
    dot_products = {}

    # Visit each distinct term once: the term's multiplicity is already part of
    # its query weight through tf, so looping over repeats would double count.
    for term in dict.fromkeys(query):
        # Terms the index has never seen cannot match any document.
        if term not in inverted_index:
            continue

        # step 1: weight of the term in the query
        weight_in_query = tfidf(term=term, doc=query, doc_collection=inverted_index, total_number_of_docs=total_docs)

        # step 2: postings for the term (champion list or full list)
        entry = inverted_index[term]
        postings_list = entry['champion'] if is_champion else entry['posting_list']

        for posting in postings_list:
            doc_id = posting[0] if is_champion else list(posting.keys())[0]

            # step 3: weight of the term in the document. The preprocessed
            # tokens are used so this weight comes from the same vector that
            # get_length() normalises by.
            weight_in_doc = tfidf(term=term, doc=preprocessed_content[doc_id], doc_collection=inverted_index, total_number_of_docs=total_docs)

            # step 4: accumulate the dot product for this document
            dot_products[doc_id] = dot_products.get(doc_id, 0.0) + weight_in_query * weight_in_doc

    # step 5: normalise by document length (get_length returns the SQUARED length)
    scored = []
    for doc_id, dot in dot_products.items():
        squared_length = get_length(doc_id)
        # A zero-length vector means every weight is 0; avoid dividing by zero.
        score = dot / math.sqrt(squared_length) if squared_length > 0 else 0.0
        scored.append((doc_id, score))

    # step 6: select the k best. A stable descending sort keeps first-seen
    # order among equal scores, matching repeated max()/remove() selection.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:max(k, 0)]