# Definitions

In [None]:
# !git clone https://github.com/Text-Mining/Useful-Corpora-for-Text-Mining-in-Persian-Language.git
# !unrar x '/content/Useful-Corpora-for-Text-Mining-in-Persian-Language/News/FarsNews 97/farsnews.part01.rar'
!pip install hazm

In [None]:
import pandas as pd
import numpy as np
import json
import ast
import math
from scipy import spatial
from threading import Thread
import hazm as hzm
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
def normalizer(data):
    """
    Normalize a piece of text with hazm.

    data: the text to normalize (the pipeline passes one "NewsBody" value)
    returns: whatever hazm's Normalizer.normalize returns for that text
    """
    hazm_normalizer = hzm.Normalizer()
    normalized_text = hazm_normalizer.normalize(data)
    return normalized_text


def stemmer(data):
    """
    Stem every token of an iterable with hazm's Stemmer.

    data: iterable of tokens
    returns: list with the stem of each token, in the same order
    """
    hazm_stemmer = hzm.Stemmer()
    return [hazm_stemmer.stem(token) for token in data]


def lemma(text_tokens):
    """
    Lemmatize every token of an iterable with hazm's Lemmatizer.

    text_tokens: iterable of tokens
    returns: list with the lemma of each token, in the same order
    """
    hazm_lemmatizer = hzm.Lemmatizer()
    return [hazm_lemmatizer.lemmatize(token) for token in text_tokens]


def removeStopWords(text_tokens):
    """
    Drop the tokens that appear in hazm's stop-word list.

    text_tokens: iterable of tokens
    returns: list of the tokens that are not stop words, in the same order
    """
    # Fetch the stop words once and keep them in a set: the previous version
    # called hzm.stopwords_list() again for every token and then scanned the
    # returned list linearly.
    stop_words = set(hzm.stopwords_list())
    return [word for word in text_tokens if word not in stop_words]


def remove_punctuations(text_tokens):
    """
    Remove punctuation from a list of tokens.

    A token that is exactly one punctuation mark is dropped. In every other
    token each delimiter character is turned into a space, and the token is
    then split on whitespace so that the words glued together by a delimiter
    are returned separately.

    text_tokens: iterable of string tokens
    returns: list of the resulting words, in the original order
    """
    # A token equal to one of these marks is discarded entirely.
    standalone_marks = {
        "،",
        ".",
        ":",
        "؛",
        "؟",
        "!",
        "'",
        "\\",
        "/",
        "-",
        "ـ",
        "+",
        "=",
        "*",
        ",",
        "٪",
        "$",
        "#",
        "@",
        "÷",
        "<",
        ">",
        "|",
        "}",
        "{",
        "[",
        "]",
        ")",
        "(",
        "…",
    }
    # Marks that act as word separators when found inside a longer token.
    splitting_marks = (
        "،",
        ".",
        ":",
        "؛",
        "؟",
        "!",
        "'",
        "\\",
        "/",
        "-",
        "ـ",
        ",",
        "|",
        "}",
        "{",
        "[",
        "]",
        ")",
        "(",
        "…",
    )
    # Every separator is a single character, so one translation table maps
    # all of them to a space in a single pass.
    separators_to_space = str.maketrans({mark: " " for mark in splitting_marks})

    cleaned_words = []
    for raw_token in text_tokens:
        if raw_token in standalone_marks:
            continue
        cleaned_words.extend(raw_token.translate(separators_to_space).split())

    return cleaned_words


def preprocess_pipeline(
    df,
    normalize_flag=True,
    remove_stop_words_flag=False,
    remove_punctuations_flag=False,
    lemmatize_flag=False,
    stemmer_flag=False,
    show_logs=False,
):
    """
    Preprocess the "NewsBody" column of a dataframe.

    input text 
        ↳ [normalize]
            ↳ tokenize
                ↳ [remove punctuations] 
                    ↳ [remove stop words]
                        ↳ [lemmatize]
                            ↳ [stemmer]
                                ↳ output text

    Steps in brackets run only when the matching flag is true. The resulting
    tokens of every row are joined with "/" and stored in a "preprocessed"
    column.

    df: dataframe with a "NewsBody" column (modified in place)
    show_logs: print one line per processed row
    returns: the same dataframe, with the "preprocessed" column added
    raises: KeyError if df has no "NewsBody" column
    """
    preprocessed_texts = []
    for index, text in df["NewsBody"].items():
        if text is None or (isinstance(text, float) and math.isnan(text)):
            # A missing body (e.g. an empty CSV cell read back as NaN) is
            # treated as empty text instead of crashing the tokenizer.
            text = ""
        elif not isinstance(text, str):
            text = str(text)

        if normalize_flag:
            text = normalizer(text)

        text_tokens = hzm.word_tokenize(text)

        if remove_punctuations_flag:
            text_tokens = remove_punctuations(text_tokens)

        if remove_stop_words_flag:
            text_tokens = removeStopWords(text_tokens)

        if lemmatize_flag:
            text_tokens = lemma(text_tokens)

        if stemmer_flag:
            text_tokens = stemmer(text_tokens)

        preprocessed_texts.append("/".join(text_tokens))

        if show_logs:
            print(f"Preprocessed {index}")

    # Assign the whole column at once. The previous per-row chained
    # assignment (df["preprocessed"][index] = ...) may write to a temporary
    # copy and is a no-op under pandas Copy-on-Write.
    df["preprocessed"] = preprocessed_texts

    return df


def invert_indexing(df, show_logs=True):
    """
    Build an inverted index from the "preprocessed" column of a dataframe.

    Each "preprocessed" value is expected to be a "/"-joined token string.
    Rows whose value is not a string (e.g. NaN from an empty CSV cell) are
    skipped.

    df: dataframe with a "preprocessed" column
    show_logs: print the number of distinct terms and one progress line per
        term (defaults to True, which matches the previous always-on output)
    returns: dict with two parallel lists:
        "Term"            - the distinct terms, in order of first appearance
        "DocID_Ferquency" - for each term, a dict {row label: occurrences}
    raises: KeyError if df has no "preprocessed" column
    """
    # Single pass over the documents: count the tokens of each document once
    # and append to the postings, instead of re-splitting and re-scanning
    # every document for every term.
    postings = {}
    for doc_id, text_tokens in df["preprocessed"].items():
        if not isinstance(text_tokens, str):
            continue
        counts_in_doc = {}
        for token in text_tokens.split("/"):
            counts_in_doc[token] = counts_in_doc.get(token, 0) + 1
        for token, count in counts_in_doc.items():
            postings.setdefault(token, {})[doc_id] = count

    total_terms = len(postings)
    if show_logs:
        print(total_terms)

    inverted_index = {
        "Term": [],
        "DocID_Ferquency": []
    }
    for done, (token, doc_frequencies) in enumerate(postings.items(), start=1):
        inverted_index["Term"].append(token)
        inverted_index["DocID_Ferquency"].append(doc_frequencies)
        if show_logs:
            print(f"Inverted indexing {(done/total_terms*100)} %")

    return inverted_index



def retrieve_documents(preprocessed_df, inverted_index, query):
    """
    Look up a query in the index and return the titles of the matching rows.

    preprocessed_df: dataframe with a "title" column
    inverted_index: mapping whose .get(query) yields an iterable of row labels
    query: the key to look up
    returns: list of "title" values, one per matching row label; empty when
        the query is not a key of the index

    NOTE(review): this expects a term -> row-labels mapping and a "title"
    column, whereas invert_indexing returns {"Term": [...],
    "DocID_Ferquency": [...]} and the news frame built in this notebook uses
    "NewsTitle" - confirm what callers actually pass.
    """
    matching_doc_ids = inverted_index.get(query, [])
    return [preprocessed_df.loc[doc_id, "title"] for doc_id in matching_doc_ids]


def get_query(preprocessed_df, inverted_index, args):
    """
    Interactive query loop: read queries from stdin until an empty line.

    For every query the text is run through preprocess_pipeline with all
    steps enabled and the processed form is printed; then the matching
    document titles are retrieved and printed together with their count.

    preprocessed_df: dataframe handed to retrieve_documents
    inverted_index: index handed to retrieve_documents
    args: currently unused; kept so existing callers keep working
    """
    query = input("Enter your query: ").strip()
    # the end condition is when the
    # user enters an empty string
    while query:
        # preprocess_pipeline reads the text from the "NewsBody" column, so
        # the one-row query frame has to use that column name.
        query_df = pd.DataFrame({"NewsBody": [query]})
        preprocessed_query = preprocess_pipeline(query_df, True, True, True, True, True, True)
        # Row label 0 of the "preprocessed" column; indexing the frame with
        # [0] would look for a column named 0 and fail.
        processed_query = preprocessed_query.loc[0, "preprocessed"]
        print(f"Processed query: {processed_query}")

        # NOTE(review): the lookup still uses the raw query, as before, not
        # the processed tokens - confirm which one the index is keyed by.
        docs_titles = retrieve_documents(preprocessed_df, inverted_index, query)
        print(docs_titles)
        print(f"Retrieved {len(docs_titles)} documents")

        query = input("\nEnter your query: ").strip()


def term_frequency(method, word_dictionary, term='term', doc_Id='doc_Id'):
    """
    Term-frequency weight of `term` in document `doc_Id`.

    method: weighting scheme
        'n' - natural: the raw count
        'l' - logarithmic: 1 + log10(count)
        'a' - augmented: 0.5 + 0.5 * count / (largest count of any term in
              the same document)
    word_dictionary: mapping term -> {document id: raw count}
    term, doc_Id: keys to look up; they default to the literal keys 'term'
        and 'doc_Id' that the previous two-argument form read
    returns: the weight as a number
    raises: ValueError for an unknown method; KeyError when the term or the
        document is not in word_dictionary

    NOTE(review): the previous 'a' branch divided by an uncalled `.max`
    attribute and could not run; the maximum is taken here over the terms of
    the same document, following the usual augmented-tf definition - confirm.
    """
    if method not in ('n', 'l', 'a'):
        raise ValueError(f"Unknown term-frequency method: {method!r}")

    raw_count = word_dictionary[term][doc_Id]

    if method == 'n':
        return raw_count
    if method == 'l':
        return 1 + math.log(raw_count, 10)

    # method == 'a'
    max_count_in_doc = max(
        postings.get(doc_Id, 0) for postings in word_dictionary.values()
    )
    if max_count_in_doc == 0:
        # Only possible when every stored count for this document is zero.
        return 0.5
    return 0.5 + (0.5 * raw_count) / max_count_in_doc


def tf_idf(word_dictionary, term, doc_Id, number_of_documents, method='n'):
    """
    tf-idf weight of `term` in document `doc_Id`.

    word_dictionary: mapping term -> {document id: raw count}
    term: the term to weight
    doc_Id: the document in which the term is weighted
    number_of_documents: total number of documents in the collection
    method: term-frequency scheme passed to term_frequency ('n', 'l' or 'a')
    returns: tf * log10(number_of_documents / document frequency of term)
    raises: KeyError when the term or document is not in word_dictionary
    """
    tf = term_frequency(method, word_dictionary, term, doc_Id)
    # Document frequency = number of documents in the term's postings.
    document_frequency = len(word_dictionary[term])
    idf = math.log(number_of_documents / document_frequency, 10)
    return tf * idf

def boolean_model(word_dictionary, number_of_documents):
    """
    Build the term-document incidence matrix of the boolean model.

    word_dictionary: mapping term -> {document id: raw count}; document ids
        are used directly as column indices, so they must be integers in
        range(number_of_documents)
    number_of_documents: number of columns of the matrix
    returns: float array of shape (number of terms, number_of_documents)
        holding 1 where the term occurs in the document and 0 elsewhere;
        rows follow the iteration order of word_dictionary
    raises: IndexError when a document id is outside the column range

    NOTE(review): the previous body could not run (`np.zero` and `keys` do
    not exist), so the input shape is taken from how tf_idf reads
    word_dictionary - confirm against the real caller.
    """
    existance_matrix = np.zeros((len(word_dictionary), number_of_documents))
    for i, postings in enumerate(word_dictionary.values()):
        for doc_Id in postings:
            existance_matrix[i, doc_Id] = 1

    return existance_matrix


def ranking(vector_query, vector_doc_tf_idf, k):
    cosine_distances = []

    for i in range(vector_doc_tf_idf.shape[0]):
        try:
            z = spatial.distance.cosine(vector_doc_tf_idf[i], vector_query)
            cosine_distances.append(z.item())
        except Exception as e:
            print(e)

    cosine_distances = np.array(cosine_distances)

    top_matches_indices = np.argsort(cosine_distances, axis=0)[:k]
    return top_matches_indices, cosine_distances[top_matches_indices]




# Codes

In [None]:
# Column-oriented buffer for the news table: one list per output column,
# filled with exactly one value per input record.
data_dic = {
    "Id": [],
    "CategoryEn1": [],
    "CategoryFa1": [],
    "CategoryEn2": [],
    "CategoryFa2": [],
    "NewsDate": [],
    "NewsTitle": [],
    "NewsSummary": [],
    "NewsBody": [],
}

# Column layout for a comments table; nothing below fills it yet.
comment = {
    "newsID": [],
    "id": [],
    "parentID": [],
    "parentName": [],
    "name": [],
    "createDate": [],
    "persianCreateDate": [],
    "text": [],
}

# Placeholder stored whenever a field is absent or not a string.
_MISSING_FIELD = "None"


def _text_field(record, key):
    """Return record[key] when it is a string, otherwise the placeholder."""
    value = record.get(key)
    return value if isinstance(value, str) else _MISSING_FIELD


def _category_fields(record):
    """Return (CategoryEn1, CategoryFa1, CategoryEn2, CategoryFa2) for a record.

    Preference order: the first two "CategoryPanel" entries; else the
    record-level "CategoryEn"/"CategoryFa" with the second level missing;
    else all four missing. The values are resolved before anything is
    stored, so a half-readable panel can no longer append to one column
    twice and leave the columns with different lengths.
    """
    try:
        panel = record["CategoryPanel"]
        fields = (
            panel[0]["CategoryEn"],
            panel[0]["CategoryFa"],
            panel[1]["CategoryEn"],
            panel[1]["CategoryFa"],
        )
    except (KeyError, IndexError, TypeError):
        fields = None
    if fields is not None and all(isinstance(field, str) for field in fields):
        return fields

    category_en = record.get("CategoryEn")
    category_fa = record.get("CategoryFa")
    if isinstance(category_en, str) and isinstance(category_fa, str):
        return category_en, category_fa, _MISSING_FIELD, _MISSING_FIELD

    return (_MISSING_FIELD,) * 4


id_news = 0
# The file is decoded as utf-8-sig on open, so every value is already a
# proper str. The former .encode('utf-8-sig').decode('utf-8') round trip
# prepended a U+FEFF character to each field and has been removed.
with open('/content/farsnews.json', encoding='utf-8-sig') as document:
    for line in document:
        if not line.strip():
            # Blank lines (e.g. a trailing newline) are not JSON records.
            continue

        data = json.loads(line)
        record = data if isinstance(data, dict) else {}

        # ID
        data_dic["Id"].append(id_news)

        # Category panel
        category_en1, category_fa1, category_en2, category_fa2 = _category_fields(record)
        data_dic['CategoryEn1'].append(category_en1)
        data_dic['CategoryFa1'].append(category_fa1)
        data_dic['CategoryEn2'].append(category_en2)
        data_dic['CategoryFa2'].append(category_fa2)

        # News Date (any value type is stringified, trailing whitespace removed)
        if "NewsDate" in record:
            data_dic["NewsDate"].append(str(record["NewsDate"]).rstrip())
        else:
            data_dic["NewsDate"].append(_MISSING_FIELD)

        # News Title, Summary and Body
        data_dic["NewsTitle"].append(_text_field(record, "NewsTitle"))
        data_dic["NewsSummary"].append(_text_field(record, "NewsSummary"))
        data_dic["NewsBody"].append(_text_field(record, "NewsBody"))

        # TODO: fill `comment` from data["GetComments"]["CommentsJsonArray"].

        # Update id news for table
        id_news = id_news + 1

        # Show Status
        if id_news % 200 == 0:
            print(f"{id_news} completed!")


data_frame = pd.DataFrame(data_dic)
data_frame.to_csv("News.csv")

In [None]:
# Load /content/data.csv and preview its first 10 rows.
datas = pd.read_csv("/content/data.csv")
datas.head(10)

In [None]:
# Load the news table from Drive and run the preprocessing pipeline with its
# default flags; the returned dataframe is kept in `x`.
news = pd.read_csv("/content/gdrive/MyDrive/News.csv")
x = preprocess_pipeline(news)
# data_frame.to_csv("News.csv")

In [None]:
# Build the inverted index of the preprocessed news and save it as CSV.
# invert_indexing is called with the dataframe only; the extra positional
# argument that used to be passed here is not needed for its progress output.
# NOTE(review): this writes to "News.csv", the same file name the news table
# is saved under earlier in this notebook - confirm the overwrite is intended.
data_dic = invert_indexing(x)
data_frame = pd.DataFrame(data_dic)
data_frame.to_csv("News.csv")

In [None]:
# Index only the first 1000 rows of the already preprocessed news table and
# store the resulting posting lists on Drive.
news = pd.read_csv("/content/gdrive/MyDrive/PreProcessedNews.csv").iloc[:1000, :]
data_dic = invert_indexing(news)

data_frame = pd.DataFrame.from_dict(data_dic)
data_frame.to_csv("/content/gdrive/MyDrive/Posting_Lists.csv")

In [None]:
# Read a posting-list CSV from Drive and display its first 100 rows.
# NOTE(review): the file name "DocIPosting_Lists.csv" differs from the
# "Posting_Lists.csv" written elsewhere in this notebook - confirm the path.
news = pd.read_csv("/content/gdrive/MyDrive/DocIPosting_Lists.csv")
news.head(100)
