In [1]:
import json
from parsivar import Normalizer, Tokenizer, FindStems
from stopwordsiso import stopwords

In [2]:
# ------------------------- text-processing helpers -------------------------
# Build the parsivar helpers once, in order: normalizer, word tokenizer, stemmer.
my_normalizer, my_tokenizer, my_stemmer = Normalizer(), Tokenizer(), FindStems()

# Stop-word collection returned by stopwordsiso for the language code "fa".
persian_stopwords = stopwords("fa")

In [3]:
# Load the news collection from the JSON file into `documents`.
# The `with` block closes the file even if json.load raises, and the explicit
# UTF-8 encoding avoids depending on the platform's default text encoding
# (assumes the file is UTF-8, the standard JSON encoding -- confirm if it
# was produced by a tool that writes something else).
with open('IR_data_news_small.json', encoding='utf-8') as f:
    # parse the whole file into Python objects
    documents = json.load(f)

In [4]:
# Positional index keyed by stemmed term. Each entry has the layout
#   {'tot_freq': <occurrences over all documents>,
#    <docID>: {'doc_freq': <occurrences in that document>,
#              'positions': [<token offsets within that document>]}}
positional_index = {}

for docID in documents:
    # ----------------------------- preprocessing -----------------------------
    # normalize -> tokenize -> drop stop words -> stem
    normalized_text = my_normalizer.normalize(documents[docID]["content"])
    raw_tokens = my_tokenizer.tokenize_words(normalized_text)
    kept_tokens = [tok for tok in raw_tokens if tok not in persian_stopwords]
    stems = [my_stemmer.convert_to_stem(tok) for tok in kept_tokens]

    # ---------------------------- positional index ---------------------------
    # Offsets count positions in the filtered, stemmed token list.
    for offset, stem in enumerate(stems):
        entry = positional_index.get(stem)
        if entry is None:
            # term never seen before in any document
            positional_index[stem] = {'tot_freq': 1, docID: {'doc_freq': 1, 'positions': [offset]}}
            continue

        entry['tot_freq'] += 1
        posting = entry.get(docID)
        if posting is None:
            # term already known, but this is its first occurrence in this document
            entry[docID] = {'doc_freq': 1, 'positions': [offset]}
            continue

        # repeated occurrence inside the same document
        posting['doc_freq'] += 1
        posting['positions'].append(offset)

In [5]:
# Read a search query from the user (the Persian prompt asks them to enter
# their query) and echo it back.
QUERY_PROMPT = 'کوئری خود را وارد کنید: '
query = input(QUERY_PROMPT)
print(query)

کوئری خود را وارد کنید: کشور
کشور


13
