# Importing packages

In [1]:

!pip install -qU hazm


!mkdir resources
!wget -q "https://github.com/sobhe/hazm/releases/download/v0.5/resources-0.5.zip" -P resources
!unzip -qq resources/resources-0.5.zip -d resources

!pip install faiss-cpu

!rm -rf /content/4ccae468eb73bf6c4f4de3075ddb5336
!rm -rf /content/preproc
!rm preprocessing.py utils.py
!mkdir -p /content/preproc
!git clone https://gist.github.com/4ccae468eb73bf6c4f4de3075ddb5336.git /content/preproc/
!mv /content/preproc/* /content/
!rm -rf /content/preproc




[K     |████████████████████████████████| 317kB 9.0MB/s 
[K     |████████████████████████████████| 1.4MB 15.8MB/s 
[K     |████████████████████████████████| 235kB 30.6MB/s 
[?25h  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
Collecting faiss-cpu
[?25l  Downloading https://files.pythonhosted.org/packages/1e/a8/ed1601e6e94702ad691465bd1bead221dd2984f741bf384011b4dc59130e/faiss_cpu-1.7.0-cp36-cp36m-manylinux2014_x86_64.whl (8.1MB)
[K     |████████████████████████████████| 8.2MB 9.6MB/s 
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.0
rm: cannot remove 'preprocessing.py': No such file or directory
rm: cannot remove 'utils.py': No such file or directory
Cloning into '/content/preproc'...
remote: Enumerating objects: 7, done.[K
remote: Total 7 (delta 0), reused 0 (delta 0), pack-reused 7[K
Unpacking objects: 100% (7/7), done.


In [3]:
import numpy as np 
import pandas as pd
import re
from tqdm import tqdm 
import os
# import yake
from hazm import stopwords_list
from __future__ import unicode_literals
from hazm import *
import pickle
import requests
from termcolor import colored

import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import plotly.express as px
import plotly.graph_objects as go
from itertools import chain


# Loading the dataset


In [4]:
from google.colab import drive
drive.mount('/content/drive')

data_address = '/content/drive/MyDrive/COVID-PSS.xls'
keys_address = '/content/drive/MyDrive/keywords_final_distilled_NE (1).pickle'
cleaned_titles_address = '/content/drive/MyDrive/title_cleaned_without_corona_2.pkl'

df = pd.read_csv(data_address)
list_t = pd.read_pickle(cleaned_titles_address)

keywords = pd.read_pickle(keys_address)
keywords = [v for k,v in keywords.items()]



assert len(keywords) == len(df)
df['keywords'] = keywords
df.drop(columns=['img', 'link'], inplace=True)
bm_selected = pd.read_pickle('/content/drive/MyDrive/CoPer paper-Models/Results/BM25_PrimaryRanking.pkl')

questions = pd.read_pickle('/content/drive/MyDrive/CoPer paper-Models/Sample Queries/Titles_with_Corona.pkl')


Mounted at /content/drive


In [5]:
top_n = 20

# Helper

In [6]:
def preprocess(title, body=None):
    """ Preprocess the input, i.e. lowercase, remove html tags, special character and digits."""
    text = ''
    if body is None:
        text = title
    else:
        text = title + ' ' + body

    return text
    # we don't need this one



def create_tfidf_features(corpus, max_features=5000, max_df=0.95, min_df=2):
    """ Creates a tf-idf matrix for the `corpus` using sklearn. """
    tfidf_vectorizor = TfidfVectorizer(decode_error='replace', strip_accents='unicode', analyzer='word',
                                       ngram_range=(1, 1),
                                       norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True,
                                       max_df=max_df, min_df=min_df)
    
    
    # add tokenizer = bert_tokenizer
    # also I don't think we need min_df
    
    
    
    X = tfidf_vectorizor.fit_transform(corpus)
    print('tfidf matrix successfully created.')
    return X, tfidf_vectorizor


def calculate_similarity(X, vectorizor, query, top_k):
    """ Vectorizes the `query` via `vectorizor` and calculates the cosine similarity of
    the `query` and `X` (all the documents) and returns the `top_k` similar documents."""

    # Vectorize the query to the same length as documents
    query_vec = vectorizor.transform(query)
    # Compute the cosine similarity between query_vec and all the documents
    cosine_similarities = cosine_similarity(X, query_vec).flatten()
    # Sort the similar documents from the most similar to less similar and return the indices
    most_similar_doc_indices = np.argsort(cosine_similarities, axis=0)[:-top_k - 1:-1]
    return (most_similar_doc_indices, cosine_similarities)


def show_similar_documents(df_inp, cosine_similarities, similar_doc_indices):
    """ Prints the most similar documents using indices in the `similar_doc_indices` vector."""
    counter = 1
    for index in similar_doc_indices:
        print('Index = {}, Top = {}, Similarity = {}'.format(index, counter, cosine_similarities[index]))
        print('title: {}'.format(df_inp.iloc[index]['title']))
        print('=' * 100)
        counter += 1




def get_results(df, questions, ranked=False,  top_n=50):
    """Gets a dataframe and seiries of questions and creates a matrix and
     outputs the similar record indices and scores """

    results = []
    
    # print(len(features))
    if ranked ==False:
        for i in range(len(questions)):
            data = []
            for j in range(len(df)):
                each = df.iloc[j]
                title = each['title']
                body = each['body']
                data.append(preprocess(title, body))

            print('creating tfidf matrix...', len(data))
            X, v = create_tfidf_features(data)
            features = v.get_feature_names()
            user_question = [questions[i]]
            sim_vecs, cosine_similarities = calculate_similarity(X, v, user_question, top_k=top_n)
            results.append({'question':questions[i],
                            'index':sim_vecs})
            
    else:
        for i in range(len(questions)):
            # check that questions match
            assert bm_selected[i]['question'] == questions[i]
            # make a df out of the selected indices for that question
            df_primary = df.loc[bm_selected[i]['index']]
        
            data = []
            # for each question take out the dataset that is related with bm25 
            # and concatenate the title and body then make the tfidf matrix for
            # each
            for j in range(len(df_primary)):
                each = df_primary.iloc[j]
                title = each['title']
                body = each['body']
                data.append(preprocess(title, body))

            print('creating tfidf matrix...', len(data))
            X, v = create_tfidf_features(data)
            features = v.get_feature_names()
            user_question = [questions[i]]
            sim_vecs, cosine_similarities = calculate_similarity(X, v, user_question, top_k=len(bm_selected[i]['index']))
            results.append({'question':questions[i],
                            'score':cosine_similarities,
                            'index':sim_vecs})

    return results


In [None]:
len(bm_selected[0]['index'])

3517

# Normal TFIDF

In [None]:
tfidf_plain_results = get_results(df, questions, ranked=False, top_n=top_n)

creating tfidf matrix... 3536
tfidf matrix successfully created.
creating tfidf matrix... 3536
tfidf matrix successfully created.
creating tfidf matrix... 3536
tfidf matrix successfully created.
creating tfidf matrix... 3536
tfidf matrix successfully created.
creating tfidf matrix... 3536
tfidf matrix successfully created.
creating tfidf matrix... 3536
tfidf matrix successfully created.
creating tfidf matrix... 3536
tfidf matrix successfully created.
creating tfidf matrix... 3536
tfidf matrix successfully created.
creating tfidf matrix... 3536
tfidf matrix successfully created.
creating tfidf matrix... 3536
tfidf matrix successfully created.
creating tfidf matrix... 3536
tfidf matrix successfully created.
creating tfidf matrix... 3536
tfidf matrix successfully created.
creating tfidf matrix... 3536
tfidf matrix successfully created.
creating tfidf matrix... 3536
tfidf matrix successfully created.
creating tfidf matrix... 3536
tfidf matrix successfully created.
creating tfidf matrix... 

# Ranked TFIDF

In [7]:
tfidf_ranked_results = get_results(df, questions, ranked=True)

creating tfidf matrix... 3517
tfidf matrix successfully created.
creating tfidf matrix... 3517
tfidf matrix successfully created.
creating tfidf matrix... 3516
tfidf matrix successfully created.
creating tfidf matrix... 3517
tfidf matrix successfully created.
creating tfidf matrix... 3517
tfidf matrix successfully created.
creating tfidf matrix... 3517
tfidf matrix successfully created.
creating tfidf matrix... 3517
tfidf matrix successfully created.
creating tfidf matrix... 3517
tfidf matrix successfully created.
creating tfidf matrix... 3516
tfidf matrix successfully created.
creating tfidf matrix... 3517
tfidf matrix successfully created.
creating tfidf matrix... 3517
tfidf matrix successfully created.
creating tfidf matrix... 3517
tfidf matrix successfully created.
creating tfidf matrix... 3517
tfidf matrix successfully created.
creating tfidf matrix... 3517
tfidf matrix successfully created.
creating tfidf matrix... 3517
tfidf matrix successfully created.
creating tfidf matrix... 