# Definitions

In [None]:
# !git clone https://github.com/Text-Mining/Useful-Corpora-for-Text-Mining-in-Persian-Language.git
# !unrar x '/content/Useful-Corpora-for-Text-Mining-in-Persian-Language/News/FarsNews 97/farsnews.part01.rar'
!pip install hazm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hazm
  Downloading hazm-0.7.0-py3-none-any.whl (316 kB)
[K     |████████████████████████████████| 316 kB 14.7 MB/s 
[?25hCollecting libwapiti>=0.2.1
  Downloading libwapiti-0.2.1.tar.gz (233 kB)
[K     |████████████████████████████████| 233 kB 59.8 MB/s 
[?25hCollecting nltk==3.3
  Downloading nltk-3.3.0.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 61.7 MB/s 
Building wheels for collected packages: nltk, libwapiti
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.3-py3-none-any.whl size=1394487 sha256=59cd0906adc04d7a0ac4c3aa8711113213eb1414d0b7c9cd54cd3ba2a7e6fb47
  Stored in directory: /root/.cache/pip/wheels/9b/fd/0c/d92302c876e5de87ebd7fc0979d82edb93e2d8d768bf71fac4
  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
  Created wheel for libwapiti: filename=libwapiti-0.2.1-cp37-cp37m-linux_x86

In [None]:
import pandas as pd
import numpy as np
import json
import ast
import math
from scipy import spatial
from threading import Thread
import hazm as hzm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
def normalizer(data):
    """
    Normalize one piece of text with hazm.

    data: the text to normalize (the pipeline passes a single NewsBody value)
    returns: the result of hzm.Normalizer().normalize(data)
    """
    return hzm.Normalizer().normalize(data)


def stemmer(data):
    """
    Stem every token of an iterable with hazm's Stemmer.

    data: iterable of tokens
    returns: list with the stem of each token, in the same order
    """
    word_stemmer = hzm.Stemmer()
    return [word_stemmer.stem(token) for token in data]


def lemma(text_tokens):
    """
    Lemmatize every token of an iterable with hazm's Lemmatizer.

    text_tokens: iterable of tokens
    returns: list with the lemma of each token, in the same order
    """
    word_lemmatizer = hzm.Lemmatizer()
    return [word_lemmatizer.lemmatize(token) for token in text_tokens]


def removeStopWords(text_tokens):
    """
    Drop hazm stop words from a token sequence.

    text_tokens: iterable of tokens
    returns: list of the tokens that are not in hzm.stopwords_list(),
             in their original order
    """
    # Evaluate the stop-word list once per call (it used to be re-evaluated
    # for every token) and keep it in a set for constant-time membership tests.
    stop_words = set(hzm.stopwords_list())
    return [word for word in text_tokens if word not in stop_words]


def remove_punctuations(text_tokens):
    """
    Remove punctuation from a token sequence.

    Tokens that are exactly one of the known punctuation marks are dropped.
    In every other token, each delimiter character is replaced by a space and
    the token is then split on whitespace, so punctuation in the middle of a
    token separates it into several words.

    text_tokens: iterable of string tokens
    returns: flat list of the resulting words, in order
    """
    # Tokens equal to one of these marks are discarded outright.
    punctuation_marks = {
        "،", ".", ":", "؛", "؟", "!", "'", "\\", "/", "-",
        "ـ", "+", "=", "*", ",", "٪", "$", "#", "@", "÷",
        "<", ">", "|", "}", "{", "[", "]", ")", "(", "…",
    }
    # Characters that split a token when they occur inside it. Note that
    # this is a subset of the marks above ("+", "=", "*", ... do not split).
    inner_delimiters = (
        "،", ".", ":", "؛", "؟", "!", "'", "\\", "/", "-",
        "ـ", ",", "|", "}", "{", "[", "]", ")", "(", "…",
    )
    # Every delimiter is a single character, so one translate() call does the
    # same job as replacing them one after another.
    to_space = str.maketrans({mark: " " for mark in inner_delimiters})

    words = []
    for token in text_tokens:
        if token in punctuation_marks:
            continue
        words.extend(piece.strip() for piece in token.translate(to_space).split())

    return words


def preprocess_pipeline(
    df,
    normalize_flag=True,
    remove_stop_words_flag=False,
    remove_punctuations_flag=False,
    lemmatize_flag=False,
    stemmer_flag=False,
    show_logs=False,
):
    """
    Preprocess the "NewsBody" column of df into a new "preprocessed" column.

    input text 
        ↳ [normalize]
            ↳ tokenize
                ↳ [remove punctuations] 
                    ↳ [remove stop words]
                        ↳ [lemmatize]
                            ↳ [stemmer]
                                ↳ output text

    Steps in brackets run only when their flag is set. The resulting tokens
    are stored as a single "/"-joined string per row.

    df: DataFrame with a "NewsBody" column; it is modified in place
    show_logs: print one line per processed row
    returns: the same df object, with the "preprocessed" column filled in
    """
    df["preprocessed"] = None
    for index in df.index:
        text = df.loc[index, "NewsBody"]
        if not isinstance(text, str):
            # e.g. NaN produced by an empty CSV cell; treat it as empty text
            # instead of letting the tokenizer fail on a float.
            text = ""
        if normalize_flag:
            text = normalizer(text)

        text_tokens = hzm.word_tokenize(text)

        if remove_punctuations_flag:
            text_tokens = remove_punctuations(text_tokens)

        if remove_stop_words_flag:
            text_tokens = removeStopWords(text_tokens)

        if lemmatize_flag:
            text_tokens = lemma(text_tokens)

        if stemmer_flag:
            text_tokens = stemmer(text_tokens)

        # Single .loc assignment: the previous chained form
        # df["preprocessed"][index] = ... may write to a temporary copy.
        df.loc[index, "preprocessed"] = "/".join(text_tokens)

        if show_logs:
            print(f"Preprocessed {index}")

    return df


def token_invert_indexing(df, tokens):
    """
    Build inverted-index entries for the given tokens only.

    df: DataFrame whose "preprocessed" column holds "/"-joined token strings
    tokens: the tokens to index, either as a "/"-joined string (the format of
            the "preprocessed" column) or as an iterable of tokens
    returns: dict of three parallel lists:
             "Term"            - the token
             "Total_Ferquency" - its number of occurrences over all rows
             "DocID_Ferquency" - {row index: occurrences in that row}
             Blank tokens are skipped; repeated tokens yield repeated entries.
    """
    inverted_index = {
        "Term": [],
        "Total_Ferquency": [],
        "DocID_Ferquency": []
    }
    # Index the tokens that were passed in (previously they were overwritten
    # with the tokens of row 0).
    if isinstance(tokens, str):
        tokens = tokens.split("/")

    # Split every document once instead of once per token.
    documents = {
        index: df.loc[index, "preprocessed"].split("/") for index in df.index
    }

    for token in tokens:
        if token.replace(" ", "") == "":
            continue
        each_term_per_document_frequency = {}
        for index, news_body_array in documents.items():
            count = news_body_array.count(token)
            if count:
                each_term_per_document_frequency[index] = count
        inverted_index["Term"].append(token)
        # The key previously carried an invisible Arabic diacritic before the
        # "T", which made this line raise KeyError.
        inverted_index["Total_Ferquency"].append(
            sum(each_term_per_document_frequency.values())
        )
        inverted_index["DocID_Ferquency"].append(each_term_per_document_frequency)
    return inverted_index


def invert_indexing(df, show_logs=False):
    """
    Build an inverted index over the "preprocessed" column of df.

    df: DataFrame whose "preprocessed" column holds "/"-joined token strings
    show_logs: when True, print a one-line summary after indexing
    returns: dict of three parallel lists, ordered by term:
             "Term"            - the token
             "Total_Ferquency" - its number of occurrences over all rows
             "DocID_Ferquency" - {row index: occurrences in that row}
             Tokens that are empty or consist only of spaces are skipped.
    """
    # One pass over the corpus: term -> {row index: count}. This replaces the
    # former loop that re-split every document for every distinct term.
    postings_by_term = {}
    for index in df.index:
        for token in df.loc[index, "preprocessed"].split("/"):
            if token.replace(" ", "") == "":
                continue
            postings = postings_by_term.setdefault(token, {})
            postings[index] = postings.get(index, 0) + 1

    inverted_index = {
        "Term": [],
        "Total_Ferquency": [],
        "DocID_Ferquency": []
    }
    # Sorted so that the output order is reproducible between runs.
    for token in sorted(postings_by_term):
        postings = postings_by_term[token]
        inverted_index["Term"].append(token)
        inverted_index["Total_Ferquency"].append(sum(postings.values()))
        inverted_index["DocID_Ferquency"].append(postings)

    if show_logs:
        print(f"Inverted indexing done: {len(postings_by_term)} terms, {len(df.index)} documents")

    return inverted_index



def retrieve_documents(preprocessed_df, inverted_index, query, weighting_model):
    """
    Look up the titles of the documents listed under `query`.

    preprocessed_df: DataFrame with a "NewsTitle" column
    inverted_index: object with a .get(key, default) method whose value for
                    `query` is an iterable of row labels of preprocessed_df
    query: the key to look up
    weighting_model: accepted but not used by this function
    returns: list of "NewsTitle" values, empty when the query is not found
    """
    return [
        preprocessed_df.loc[doc_index, "NewsTitle"]
        for doc_index in inverted_index.get(query, [])
    ]


def _ask_query_settings(query_prompt):
    """Prompt for one query and its settings; returns (query, weighting_model, k)."""
    query = input(query_prompt).strip()
    if query == "":
        # An empty query ends the session, so the other settings are not asked.
        return query, "lnc.ltc", 0

    weighting_model = input("Enter your Weighting model: (e.g. : ddd.qqq) ").strip()
    if len(weighting_model) != 7 or weighting_model[3] != ".":
        if weighting_model != "":
            print("Weighting model must look like ddd.qqq; using lnc.ltc")
        weighting_model = "lnc.ltc" # default weighting model

    while True:
        try:
            k = int(input("How many Document do you want to Retrive : ").strip())
            break
        except ValueError:
            print("Please enter a whole number.")

    return query, weighting_model, k


def get_query(preprocessed_df, inverted_index, number_of_documents):
    """
    Interactive retrieval loop: read queries from stdin and print the k rows
    of preprocessed_df most similar to each one.

    Similarity is the cosine similarity between sklearn TF-IDF vectors of the
    "preprocessed" column and of the preprocessed query. The loop ends when an
    empty query is entered.

    preprocessed_df: DataFrame with a "preprocessed" column of "/"-joined tokens
    inverted_index: accepted but not used yet
    number_of_documents: accepted but not used yet
    returns: None (results are printed)

    NOTE: the weighting model entered by the user is validated but does not
    influence the ranking yet; the custom tf/idf functions are not wired in.
    """
    query, weighting_model, k = _ask_query_settings("Enter your query: ")
    if query == "":
        return

    # Fit the vectorizer on the corpus once; each query is only transformed.
    matrix_vectorization = TfidfVectorizer()
    document_matrix = matrix_vectorization.fit_transform(preprocessed_df["preprocessed"])

    while query != "":
        # the end condition is when the
        # user enters an empty string for query
        query_df = pd.DataFrame({"NewsBody": [query], "preprocessed": [""],})
        preprocessed_query = preprocess_pipeline(
            query_df,
            normalize_flag=True,
            remove_stop_words_flag=True,
            remove_punctuations_flag=True,
            lemmatize_flag=True,
            stemmer_flag=True,
            show_logs=False,
        )
        processed_query = preprocessed_query.loc[0, "preprocessed"]

        query_vector = matrix_vectorization.transform([processed_query])
        similarity = cosine_similarity(query_vector, document_matrix)[0]

        # Positions of the k most similar documents, best first; the stable
        # sort keeps ties in document order.
        top_positions = np.argsort(-similarity, kind="stable")[:max(k, 0)]
        for i in top_positions:
            print(i, " :::::: ", preprocessed_df.iloc[i])
            print("----------------------------------------------------------")

        query, weighting_model, k = _ask_query_settings("\nEnter your query: ")



def term_frequency(method, word_dictionary, term='term', doc_Id='doc_Id'):
    """
    Term-frequency weight of one term in one document (SMART notation).

    method: 'n' natural, 'l' logarithmic, 'a' augmented, 'b' boolean,
            'L' log-average
    word_dictionary: mapping such that word_dictionary[term][doc_Id] is the raw
            count of the term in the document. For 'a' and 'L',
            word_dictionary[term]['*'] must also exist.
            NOTE(review): '*' is assumed to hold the reference count(s) - a
            single number or a sequence - from which the maximum ('a') or the
            mean ('L') is taken; confirm against the code that builds it.
    term, doc_Id: keys to look up; they default to the literal placeholder
            keys the function used before, so existing calls keep working
    returns: the weight as a number
    raises: ValueError for an unknown method
    """
    postings = word_dictionary[term]
    raw_tf = postings[doc_Id]

    if method == 'n':
        return raw_tf
    if method == 'l':
        # log(0) is undefined; an absent term simply weighs 0.
        return (1 + math.log(raw_tf, 10)) if raw_tf > 0 else 0
    if method == 'a':
        # np.max accepts a scalar as well as a sequence (the old `.max`
        # referenced the method without calling it).
        return 0.5 + (0.5 * raw_tf) / np.max(postings['*'])
    if method == 'b':
        # Boolean weight: 1 as soon as the term occurs at all.
        return 1 if raw_tf > 0 else 0
    if method == 'L':
        if raw_tf <= 0:
            return 0
        ave = np.mean(postings['*'])
        return (1 + math.log(raw_tf, 10)) / (1 + math.log(ave, 10))

    raise ValueError(f"Unknown term-frequency method: {method!r}")

def inverse_document_frequency(method, word_dictionary, number_of_documents, term='term'):
    """
    Inverse-document-frequency weight of one term (SMART notation).

    method: 'n' none (always 1), 't' idf = log10(N / df),
            'p' probabilistic idf = max(0, log10((N - df) / df))
    word_dictionary: mapping whose entry for `term` has one element per
            document containing the term; df is taken as its length.
            NOTE(review): if that entry also carries an aggregate key such as
            '*', it is counted too - confirm the schema.
    number_of_documents: N, the size of the collection
    term: key to look up; defaults to the literal placeholder key used before
    returns: the weight as a number (0 when the logarithm is undefined)
    raises: ValueError for an unknown method
    """
    if method == 'n':
        return 1

    document_frequency = len(word_dictionary[term])

    if method == 't':
        if document_frequency == 0:
            return 0
        return math.log(number_of_documents / document_frequency, 10)
    if method == 'p':
        if document_frequency == 0 or number_of_documents <= document_frequency:
            return 0
        # (N - df) / df, in base 10 like the 't' variant. The former expression
        # evaluated N - (df / df) and used the natural logarithm.
        return max(0, math.log((number_of_documents - document_frequency) / document_frequency, 10))

    raise ValueError(f"Unknown document-frequency method: {method!r}")

def normalization(method, w, pivot=5):
    """
    Normalization factor for a weight vector (SMART notation).

    method: 'n' none (1), 'c' cosine (1 / Euclidean norm of w),
            'u' pivoted unique (1 / pivot), 'b' byte size (currently 1)
    w: sequence of numeric weights; only read for method 'c'
    pivot: divisor for method 'u'; defaults to the value that was hard-coded
    returns: the factor the weights should be multiplied by
    raises: ValueError for an unknown method
    """
    if method == 'n':
        return 1
    if method == 'c':
        squared_norm = 0
        for weight in w:
            squared_norm += weight ** 2
        if squared_norm == 0:
            # Empty or all-zero vector: there is nothing to scale, and
            # dividing by the norm would raise ZeroDivisionError.
            return 1.0
        return 1 / math.sqrt(squared_norm)
    if method == 'u':
        return 1 / pivot
    if method == 'b':
        return 1

    raise ValueError(f"Unknown normalization method: {method!r}")
    

def tf_idf (word_dictionary, term, doc_Id, number_of_documents, tf_method, df_method):
    """
    Combine a term-frequency weight and an inverse-document-frequency weight.

    word_dictionary: passed through to term_frequency and
                     inverse_document_frequency
    term, doc_Id: accepted but not forwarded to either helper
    number_of_documents: passed to inverse_document_frequency
    tf_method, df_method: method codes for the two helpers
    returns: the product of the two weights
    """
    return (
        term_frequency(tf_method, word_dictionary)
        * inverse_document_frequency(df_method, word_dictionary, number_of_documents)
    )

def boolean_model (word_dictionary, number_of_documents):
    """
    Build a term-by-document incidence matrix.

    NOTE(review): the previous body could not run (`np.zero` does not exist and
    `keys` was undefined), so the intended input layout had to be assumed:
    word_dictionary['term'] is a sequence of terms and
    word_dictionary['doc_Id'] is a parallel sequence whose i-th element
    iterates over the ids of the documents containing term i (for example a
    {doc_id: count} dict). Confirm against the caller.

    word_dictionary: see the note above
    number_of_documents: number of columns; document ids are used as column
                         positions, ids outside [0, number_of_documents) and
                         non-integer keys are ignored
    returns: float array of shape (number of terms, number_of_documents) with
             1 where the term occurs in the document and 0 elsewhere
    """
    terms = list(word_dictionary['term'])
    postings_per_term = list(word_dictionary['doc_Id'])

    existance_matrix = np.zeros((len(terms), number_of_documents))
    for i, doc_ids in enumerate(postings_per_term[:len(terms)]):
        for doc_Id in doc_ids:  # iterating a dict yields its keys
            if isinstance(doc_Id, (int, np.integer)) and 0 <= doc_Id < number_of_documents:
                existance_matrix[i, doc_Id] = 1

    return existance_matrix


def ranking(vector_query, vector_doc_tf_idf, normalization_method, k):
    """
    Rank documents by cosine distance to the query and return the k closest.

    vector_query: 1-D query vector
    vector_doc_tf_idf: array whose first axis indexes documents; row i is
                       compared with the query
    normalization_method: accepted but not used by this function
    k: number of documents to return
    returns: (indices, distances) - row indices of the k smallest cosine
             distances in ascending order, and those distances
    """
    cosine_distances = []

    for i in range(vector_doc_tf_idf.shape[0]):
        try:
            distance = float(spatial.distance.cosine(vector_doc_tf_idf[i], vector_query))
        except Exception as e:
            print(f"document {i}: {e}")
            distance = math.inf
        if math.isnan(distance):
            # Undefined distance (e.g. an all-zero vector): rank it last.
            distance = math.inf
        # Always record one entry per row. Skipping failed rows used to shift
        # all later distances, so the returned indices pointed at the wrong
        # documents.
        cosine_distances.append(distance)

    cosine_distances = np.array(cosine_distances)

    top_matches_indices = np.argsort(cosine_distances, axis=0)[:k]
    return top_matches_indices, cosine_distances[top_matches_indices]




# Codes

Converting the JSON news file to a CSV file

In [None]:
# Column-oriented buffers for the news table; one list entry per news item.
data_dic = {
    "Id": [],
    "CategoryEn1": [],
    "CategoryFa1": [],
    "CategoryEn2": [],
    "CategoryFa2": [],
    "NewsDate": [],
    "NewsTitle": [],
    "NewsSummary": [],
    "NewsBody": [],
}

# Buffers for the comment table. TODO: comment extraction (from
# data["GetComments"]["CommentsJsonArray"]) is not implemented yet.
comment = {
    "newsID": [],
    "id": [],
    "parentID": [],
    "parentName": [],
    "name": [],
    "createDate": [],
    "persianCreateDate": [],
    "text": [],
}


def _text_field(record, key):
    """Return record[key] when it is a string, otherwise the placeholder "None"."""
    try:
        value = record[key]
    except (KeyError, IndexError, TypeError):
        return "None"
    return value if isinstance(value, str) else "None"


def _category_fields(record):
    """Return (CategoryEn1, CategoryEn2, CategoryFa1, CategoryFa2) for one record.

    Preference order: the first two entries of "CategoryPanel", then the
    top-level "CategoryEn"/"CategoryFa" pair, then "None" everywhere. All four
    values are decided before anything is stored, so the columns can never end
    up with different lengths.
    """
    try:
        first, second = record["CategoryPanel"][0], record["CategoryPanel"][1]
        fields = (
            first["CategoryEn"],
            second["CategoryEn"],
            first["CategoryFa"],
            second["CategoryFa"],
        )
        if all(isinstance(field, str) for field in fields):
            return fields
    except (KeyError, IndexError, TypeError):
        pass

    category_en = _text_field(record, "CategoryEn")
    category_fa = _text_field(record, "CategoryFa")
    if category_en != "None" and category_fa != "None":
        return category_en, "None", category_fa, "None"
    return "None", "None", "None", "None"


id_news = 0
with open('/content/farsnews.json', encoding='utf-8-sig') as document:
    for line in document:
        if not line.strip():
            # Tolerate blank lines between records.
            continue

        data = json.loads(line)

        # ID
        data_dic["Id"].append(id_news)

        # Category panel
        category_en1, category_en2, category_fa1, category_fa2 = _category_fields(data)
        data_dic["CategoryEn1"].append(category_en1)
        data_dic["CategoryEn2"].append(category_en2)
        data_dic["CategoryFa1"].append(category_fa1)
        data_dic["CategoryFa2"].append(category_fa2)

        # News Date
        try:
            data_dic["NewsDate"].append(str(data["NewsDate"]).rstrip())
        except (KeyError, TypeError):
            data_dic["NewsDate"].append("None")

        # News Title, Summary and Body.
        # json.loads already returns str objects. The former
        # .encode('utf-8-sig').decode('utf-8') round trip prepended a byte
        # order mark (U+FEFF) to every value, so the fields are stored as-is.
        data_dic["NewsTitle"].append(_text_field(data, "NewsTitle"))
        data_dic["NewsSummary"].append(_text_field(data, "NewsSummary"))
        data_dic["NewsBody"].append(_text_field(data, "NewsBody"))

        # Update id news for table
        id_news = id_news + 1

        # Show Status
        if id_news % 200 == 0:
            print(f"{id_news} completed!")

data_frame = pd.DataFrame(data_dic)
# NOTE(review): this writes to the working directory, while the next cell
# reads /content/gdrive/MyDrive/News.csv - confirm how the file gets there.
data_frame.to_csv("News.csv")

In [None]:
# Preprocess the raw news and persist the result. The next cell reads
# PreProcessedNews.csv back and expects its "preprocessed" column, so the
# preprocessed frame itself is what has to be stored here (building the
# inverted index is done in the next cell).
news_pd = pd.read_csv("/content/gdrive/MyDrive/News.csv")
preprocessed_news = preprocess_pipeline(news_pd)
preprocessed_news.to_csv("/content/gdrive/MyDrive/PreProcessedNews.csv", index=False)

Creating Inverted Index

In [None]:
# Load the preprocessed news, keep only the first 1000 rows, build the
# inverted index over them and store it as the posting lists file.
news_df = pd.read_csv("/content/gdrive/MyDrive/PreProcessedNews.csv").iloc[:1000]
inverted_index_dict = invert_indexing(news_df)

inverted_index_df = pd.DataFrame.from_dict(inverted_index_dict)
inverted_index_df.to_csv("/content/gdrive/MyDrive/Posting_Lists.csv")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Inverted indexing 78.15122377622377 %
Inverted indexing 78.15559440559441 %
Inverted indexing 78.15996503496504 %
Inverted indexing 78.16433566433567 %
Inverted indexing 78.1687062937063 %
Inverted indexing 78.17307692307692 %
Inverted indexing 78.17744755244756 %
Inverted indexing 78.18181818181819 %
Inverted indexing 78.1861888111888 %
Inverted indexing 78.19055944055944 %
Inverted indexing 78.19493006993007 %
Inverted indexing 78.1993006993007 %
Inverted indexing 78.20367132867133 %
Inverted indexing 78.20804195804196 %
Inverted indexing 78.21241258741259 %
Inverted indexing 78.21678321678321 %
Inverted indexing 78.22115384615384 %
Inverted indexing 78.22552447552448 %
Inverted indexing 78.2298951048951 %
Inverted indexing 78.23426573426573 %
Inverted indexing 78.23863636363636 %
Inverted indexing 78.24300699300699 %
Inverted indexing 78.24737762237763 %
Inverted indexing 78.25174825174825 %
Inverted indexing 78.256118

Showing inverted index

In [None]:
# inverted_index_df = pd.read_csv("/content/gdrive/MyDrive/Posting_Lists.csv")
# Displays the raw body of the news item at position 284652.
# NOTE(review): the previous code cell truncates news_df to its first 1000
# rows, so on a fresh top-to-bottom run this position is out of range; the
# recorded output appears to come from an untruncated frame - confirm.
news_df.iloc[284652]["NewsBody"]


'\ufeffمسوول فضای مجازی سراج چهارمحال و بختیاری گفت: فریب عملیات روانی دشمن را نخوریم.'

Running a Query and Retrieving Documents

In [None]:
# Start the interactive retrieval session.
# To resume from the files saved on Drive instead of the in-memory frames:
# news_df = pd.read_csv("/content/gdrive/MyDrive/PreProcessedNews.csv")
# inverted_index_df = pd.read_csv("/content/gdrive/MyDrive/Posting_Lists.csv")
get_query(
    preprocessed_df=news_df,
    inverted_index=inverted_index_df,
    number_of_documents=1000,
)
# inverted_index_df.head(10)

Enter your query: فریب
Enter your Weighting model: (e.g. : ddd.qqq) 
How many Document do you want to Retrive : 10
Preprocessed 0
0  ::::::  0         ﻿محمد نوری در گفت\r\nوگو\r\nبا خبرنگار ورزشی خ...
1         ﻿به گزارش گروه بین‌الملل خبرگزاری فارس، با امض...
2         ﻿به گزارش خبرگزاری فارس از شیراز، سیدابراهیم ح...
3         ﻿حجت‌الاسلام جعفر جنتی امروز در گفت‌وگو با خبر...
4         ﻿به گزارش خبرنگار گروه علمی و دانشگاهی خبرگزار...
                                ...                        
285690    ﻿به گزارش خبرگزاری فارس از همدان، آیت‌الله غیا...
285691    ﻿به گزارش خبرگزاری فارس، علیرضا جهانبخش پس از ...
285692    ﻿به گزارش خبرگزاری فارس از قرچک، حجت‌الاسلام و...
285693    ﻿به گزارش خبرگزاری فارس از پاکدشت، حجت‌الاسلام...
285694    ﻿احمدامیرآبادی فراهانی در گفت‌وگو با خبرنگار ف...
Name: NewsBody, Length: 285695, dtype: object
----------------------------------------------------------


NameError: ignored