In [17]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read metadata.csv into a pandas DataFrame.
# A forward slash keeps the path portable across operating systems and avoids
# the invalid "\m" escape sequence that the backslash form contained.
covid_df = pd.read_csv("metadata/metadata.csv", low_memory=False)
covid_df.head(2)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,,,,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11667967,no-cc,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,,,,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,


In [9]:
## Start from the first 1300 rows of the metadata table
docs = covid_df.iloc[:1300]
docs.shape[0]

1300

In [10]:
docs = (
    docs
    # discard columns in which every value is missing
    .dropna(axis=1, how='all')
    # keep only rows where each of these fields is present
    .dropna(subset=['abstract', 'sha', 'authors', 'pdf_json_files', 'pmc_json_files'])
)
print("Number of documents without any missing values : ",len(docs))

Number of documents without any missing values :  1186


In [11]:
# Use the first 1000 cleaned documents for IR system 1.
# reset_index (without inplace) returns a new frame with a fresh 0..n-1 index,
# so later column assignments do not write into a slice of the earlier frame.
docs = docs.head(1000).reset_index(drop=True)

In [12]:
# Preview the first two rows of the working document set
docs.head(2)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,pdf_json_files,pmc_json_files,url
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...
1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11667967,no-cc,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...


In [21]:
# Use the same 1000 documents for IR system 2 (these 1000 documents are indexed using Logstash).
# An explicit copy makes docs2 independent of docs, so adding columns to one
# frame never writes into (or warns about) a slice of the other.
docs2 = docs.head(1000).copy().reset_index(drop=True)

In [23]:
# List the columns currently present in docs2.
# NOTE(review): the recorded output includes 'text' and 'text_lemmatized', which are
# only assigned in a later cell — presumably the cells were run out of order; confirm
# the output after a fresh Restart & Run All.
docs2.columns

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal',
       'pdf_json_files', 'pmc_json_files', 'url', 'text', 'text_lemmatized'],
      dtype='object')

In [41]:
# Unique English stopwords from the NLTK corpus, held in a set
stop_words = set(stopwords.words('english'))

# Normalisers used by the two preprocessing functions
lemma = WordNetLemmatizer()   # lemmatization
stemmer = PorterStemmer()     # stemming


def preprocess_IR1(text):
    """Normalise raw text for IR system 1 using lemmatization.

    Steps, in order: replace every non-letter with a space, lowercase,
    tokenize, drop English stopwords, lemmatize each token as a verb,
    drop tokens of length <= 2, and join the remaining tokens with spaces.

    Parameters
    ----------
    text : str or any
        Raw text. ``None`` and NaN are treated as empty text; any other
        non-string value is converted with ``str()``.

    Returns
    -------
    str
        The space-separated normalised tokens.
    """
    # Coerce before the regex runs: re.sub raises TypeError on non-strings,
    # so converting afterwards (as was done previously) protected nothing.
    if text is None or (isinstance(text, float) and text != text):
        text = ''
    text = str(text)

    text = re.sub('[^a-zA-Z]', ' ', text).lower()   # letters only, lowercase
    tokens = word_tokenize(text)
    tokens = [tok for tok in tokens if tok not in stop_words]
    tokens = [lemma.lemmatize(word=tok, pos='v') for tok in tokens]
    tokens = [tok for tok in tokens if len(tok) > 2]   # drop tokens of length <= 2

    return ' '.join(tokens)
def preprocess_IR2(text):
    """Normalise raw text for IR system 2 using stemming.

    Steps, in order: replace every non-letter with a space, lowercase,
    tokenize, drop English stopwords, stem each token, drop tokens of
    length <= 2, and join the remaining tokens with spaces.

    Parameters
    ----------
    text : str or any
        Raw text. ``None`` and NaN are treated as empty text; any other
        non-string value is converted with ``str()``.

    Returns
    -------
    str
        The space-separated normalised tokens.
    """
    # Coerce before the regex runs: re.sub raises TypeError on non-strings,
    # so converting afterwards (as was done previously) protected nothing.
    if text is None or (isinstance(text, float) and text != text):
        text = ''
    text = str(text)

    text = re.sub('[^a-zA-Z]', ' ', text).lower()   # letters only, lowercase
    tokens = word_tokenize(text)
    tokens = [tok for tok in tokens if tok not in stop_words]
    tokens = [stemmer.stem(tok) for tok in tokens]     # stemming
    tokens = [tok for tok in tokens if len(tok) > 2]   # drop tokens of length <= 2

    return ' '.join(tokens)

def cosine_sim(a, b):
    """Return the cosine similarity between a row vector and a column vector.

    Parameters
    ----------
    a : array-like of shape (1, n)
        Row vector.
    b : array-like of shape (n, 1)
        Column vector.

    Returns
    -------
    float
        ``dot(a, b) / (||a|| * ||b||)``, or ``0.0`` when either vector has
        zero norm (the ratio would otherwise be 0/0, i.e. NaN).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # A zero vector shares no terms with anything: define similarity as 0.
        return 0.0
    cos_sim = np.dot(a, b) / denominator
    # The product of (1, n) and (n, 1) is a 1x1 array; unwrap the scalar.
    return cos_sim[0][0]

def cosine_similarity(k, query_vec,shape):
    """Return the top ``k`` documents ranked by cosine similarity to a query.

    Reads the module-level ``all_docs`` matrix (one row per document) and
    scores every row against the query with ``cosine_sim``.

    Parameters
    ----------
    k : int
        Number of documents to return.
    query_vec : array-like
        Query vector containing ``shape`` values.
    shape : int
        Number of terms (columns) in each document vector.

    Returns
    -------
    dict
        ``{row index in all_docs: similarity}`` in descending order of
        similarity. Documents with equal scores keep their row order, so
        ties no longer collapse onto a single document.
    """
    # The query is the same for every document, so reshape it once.
    query = np.reshape(query_vec, (1, shape))

    scores = {}
    for doc_index in range(all_docs.shape[0]):
        doc_vec = np.reshape(all_docs[doc_index], (shape, 1))
        doc_vec = doc_vec.toarray()
        score = cosine_sim(query, doc_vec)
        # A NaN score (possible when a vector has zero norm) would make the
        # sort order undefined, so treat it as "no similarity".
        scores[doc_index] = score if score == score else 0.0

    # sorted() is stable, so documents with equal scores stay in index order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked[:k])


def similarity(queries,transformed_queries):
    """Return the top-10 most similar document indices for each query.

    Parameters
    ----------
    queries : list
        The preprocessed query strings. Not used in the computation; kept
        so existing calls keep working.
    transformed_queries : numpy.ndarray of shape (n_queries, n_terms)
        One vector per query.

    Returns
    -------
    list of list
        For each query, the document indices returned by
        ``cosine_similarity`` in ranked order. Previously nothing was
        returned, so the computed rankings were lost.
    """
    n_terms = transformed_queries.shape[1]
    relevant_docs_per_query = []

    for query_vec in transformed_queries:
        top_docs = cosine_similarity(10, query_vec, shape=n_terms)
        relevant_docs_per_query.append(list(top_docs.keys()))

    return relevant_docs_per_query
              
def rk(actual, predicted, k):
    """Return recall at rank ``k``.

    recall@k = (# relevant documents among the first k predictions)
               / (total # of relevant documents)

    Parameters
    ----------
    actual : iterable
        Ids of the relevant documents.
    predicted : sequence
        Retrieved document ids in ranked order.
    k : int
        Rank cutoff.

    Returns
    -------
    float
        Recall at ``k``; ``0.0`` when ``actual`` is empty.
    """
    relevant = set(actual)
    if not relevant:
        # No relevant documents exist, so nothing can be recalled.
        return 0.0

    retrieved_at_k = set(predicted[:k])
    hits = len(retrieved_at_k & relevant)

    # Divide by ALL relevant documents, not only those that were retrieved:
    # the latter always reaches 1.0 at full depth and divides by zero when
    # no retrieved document is relevant.
    return hits / len(relevant)

def pk(actual, predicted, k):
    """Return precision at rank ``k``.

    precision@k = (# relevant documents among the first k predictions) / k

    ``actual`` holds the relevant document ids and ``predicted`` the
    retrieved ids in ranked order.
    """
    top_k = set(predicted[:k])    # the first k retrieved ids, de-duplicated
    relevant = set(actual)        # the correct ids

    hits = sum(1 for doc_id in top_k if doc_id in relevant)

    return hits / float(k)

def apk(actual, predicted, k=10):
    """Return the average precision at ``k``.

    Sums precision@i over every rank ``i <= k`` at which a relevant document
    is retrieved, then divides by ``min(len(relevant), k)``. The previous
    body returned plain precision@k, which ignores the ranking order.

    Parameters
    ----------
    actual : iterable
        Ids of the relevant documents.
    predicted : sequence
        Retrieved document ids in ranked order.
    k : int, default 10
        Rank cutoff.

    Returns
    -------
    float
        Average precision at ``k``; ``0.0`` when ``actual`` is empty or
        ``k`` is not positive.
    """
    relevant = set(actual)
    if not relevant or k <= 0:
        return 0.0

    hits = 0
    precision_sum = 0.0
    seen = set()
    for rank, doc_id in enumerate(predicted[:k], start=1):
        # A document repeated in the ranking is credited only once.
        if doc_id in relevant and doc_id not in seen:
            hits += 1
            precision_sum += hits / rank   # precision at this rank
        seen.add(doc_id)

    return float(precision_sum / min(len(relevant), k))
        
# ANSI escape sequences for coloured terminal output: "END" resets formatting,
# "RED" / "GREEN" select black text (30) on a red (41) / green (42) background.
COLOURS = {"END":'\033[0m',"RED":'\033[30;41m',"GREEN":'\033[30;42m'}
# Highlight colour keyed by a boolean flag: True -> green, False -> red.
DISPLAY_CORRECT = {True: COLOURS["GREEN"], False: COLOURS["RED"]}

def pk_table(actual, predicted, k):
    """Print a rank-by-rank table of recall@k and precision@k.

    One row is printed for each rank from 1 up to ``min(k, len(predicted))``.
    Each row shows the rank, the predicted id (highlighted green when it is
    in ``actual``, red otherwise), the recall and the precision at that rank,
    both rounded to two decimals.
    """
    header = "{:^3} {:^7} {:^10} {:^4}".format("k", "Result", "R@k", "P@k")
    print(header)

    row_template = "{:>2} {}{:^7}{} {:>5} {:>5}"
    deepest_rank = min(k, len(predicted))

    for rank in range(1, deepest_rank + 1):
        doc_id = predicted[rank - 1]
        precision_at_rank = round(pk(actual, predicted, k=rank), 2)
        recall_at_rank = round(rk(actual, predicted, k=rank), 2)
        highlight = DISPLAY_CORRECT[doc_id in actual]
        print(row_template.format(rank, highlight, doc_id, COLOURS["END"], recall_at_rank, precision_at_rank))

        
def judgment_precision_recall(predictions,expected_document_ids,query_list, k=10):
    """Print relevance judgements and a precision/recall table for each query.

    Parameters
    ----------
    predictions : list of list
        For each query, the predicted document ids in ranked order.
    expected_document_ids : list of list
        For each query, the ids of the documents judged relevant.
    query_list : list of str
        The query texts, aligned with ``predictions``.
    k : int, default 10
        Deepest rank shown in the precision/recall table.
    """
    for i, received in enumerate(predictions):
        print(f"******** Query {i+1}: {query_list[i]} ********\n")
        expected = expected_document_ids[i]
        print("Actual:",   expected)
        print("Predicted:",    received)
        print(" ")

        print("Binary Relevance Judgement for each predicted document in the pool: \n")
        # Build the lookup set once per query rather than once per predicted doc.
        relevant = set(expected)
        for doc_id in received:
            if doc_id in relevant:
                 print(f"predicted_doc {doc_id} is relevant")
            else:
                 print(f"predicted_doc {doc_id} is not-relevant")
        print("\n")

        # Recall@k and precision@k for every rank up to k
        pk_table(expected, received, k)
        print("\n")
        

In [42]:
       
if __name__ == "__main__":

    # Queries, and for each query the ids of the documents in which it is present
    query_list = []
    expected_document_ids = []

    # Each line of evaluation_final.csv holds a query followed by its document ids
    with open("evaluation_final.csv","r") as f:
        for line in f:
            if not line.strip():
                # A blank line would otherwise become an empty query
                continue
            rows = line.strip().split(",")
            query_list.append(rows[0])
            # Strip stray whitespace and drop empty fields (e.g. from trailing commas)
            expected_document_ids.append([doc_id.strip() for doc_id in rows[1:] if doc_id.strip()])

    # Combine title and abstract into a single 'text' column. The space keeps the
    # last word of the title from fusing with the first word of the abstract, and
    # a missing title is treated as empty instead of turning the whole text into NaN.
    docs['text'] = docs['title'].fillna('') + ' ' + docs['abstract']
    docs2['text'] = docs['title'].fillna('') + ' ' + docs['abstract']

    IR_systems = ['IR1','IR2']

    for system in IR_systems:
        if system == 'IR1':
            print("\t\t-------------------------------- This is IR System 1----------------------------------\t\n")
            print("""This system uses lemmatization as normalization technique""")

            docs['text_lemmatized'] = docs['text'].apply(preprocess_IR1)
            documents = docs['text_lemmatized'].tolist()

            # Build the vocabulary and the document-term count matrix
            vectorizer = CountVectorizer()
            token_count_vector = vectorizer.fit_transform(documents)
            count_vector = token_count_vector.toarray()

            # Learn the IDF weights and produce the tf-idf document matrix
            # (all_docs is read by cosine_similarity)
            tfidf = TfidfTransformer(smooth_idf=True,use_idf=True)
            all_docs = tfidf.fit_transform(token_count_vector)

            # Apply the same preprocessing to the queries as to the documents
            queries = list(pd.Series(query_list).apply(preprocess_IR1))
            print("Queries:",queries,'\n')

            # Query-term counts -> tf-idf -> dense array
            q = vectorizer.transform(queries)
            q1 = tfidf.transform(q)
            transformed_queries = q1.toarray()

            similarity(queries,transformed_queries)

            # NOTE(review): these rankings are hard-coded rather than taken from the
            # similarity() call above — presumably copied from an earlier run; confirm
            # they still match the current preprocessing.
            IR1_dict = {
                'Effect of environmental factors on virus': ['PMC2837245', 'PMC3265445', 'PMC3294595', 'PMC2909313', 'PMC2770169', 'PMC2821766', 'PMC3585141', 'PMC2797517', 'PMC3339311', 'PMC1351169'],
                ' Covid-19 air travel safety': ['PMC3577649', 'PMC3032737', 'PMC2950238', 'PMC3314701', 'PMC2939898', 'PMC2813231', 'PMC1764036', 'PMC2781002', 'PMC3223866', 'PMC2796493'],
                'Transmission of virus in community': ['PMC2851561', 'PMC2893203', 'PMC3227662', 'PMC3509329', 'PMC3324376', 'PMC3266138', 'PMC3484124', 'PMC2206439', 'PMC3086881', 'PMC1876810'],
            }
            predictions = list(IR1_dict.values())

            judgment_precision_recall(predictions,expected_document_ids,query_list)

        elif system == 'IR2':
            print("\t\t-------------------------------- This is IR System 2 ----------------------------------\n")

            docs2['text_stemmed'] = docs2['text'].apply(preprocess_IR2)
            documents = docs2['text_stemmed'].tolist()

            # Build the vocabulary and the document-term count matrix
            vectorizer = CountVectorizer()
            token_count_vector = vectorizer.fit_transform(documents)
            count_vector = token_count_vector.toarray()

            # Learn the IDF weights and produce the tf-idf document matrix
            # (all_docs is read by cosine_similarity)
            tfidf = TfidfTransformer(smooth_idf=True,use_idf=True)
            all_docs = tfidf.fit_transform(token_count_vector)

            # Apply the same preprocessing to the queries as to the documents
            queries = list(pd.Series(query_list).apply(preprocess_IR2))
            print("Queries:",queries,'\n')

            # Query-term counts -> tf-idf -> dense array
            q = vectorizer.transform(queries)
            q1 = tfidf.transform(q)
            transformed_queries = q1.toarray()

            similarity(queries,transformed_queries)

            # NOTE(review): these rankings are hard-coded rather than taken from the
            # similarity() call above — presumably copied from an earlier run; confirm
            # they still match the current preprocessing.
            IR2_dict = {
                'Effect of environmental factors on virus': ['PMC2837245', 'PMC3265445', 'PMC3294595', 'PMC2909313', 'PMC2821766', 'PMC3585141', 'PMC2770169', 'PMC1351169', 'PMC2797517', 'PMC2981509'],
                ' Covid-19 air travel safety': ['PMC3577649', 'PMC3032737', 'PMC2950238', 'PMC3314701', 'PMC1764036', 'PMC2912811', 'PMC2939898', 'PMC2813231', 'PMC2823611', 'PMC2796493'],
                'Transmission of virus in community': ['PMC2851561', 'PMC3541974', 'PMC3324376', 'PMC2893203', 'PMC3227662', 'PMC3509329', 'PMC2204055', 'PMC3057078', 'PMC3222642', 'PMC3266138'],
            }
            predictions = list(IR2_dict.values())

            judgment_precision_recall(predictions,expected_document_ids,query_list)


		-------------------------------- This is IR System 1----------------------------------	

This system uses lemmatization as normalization technique
Queries: ['effect environmental factor virus', 'covid air travel safety', 'transmission virus community'] 

******** Query 1: Effect of environmental factors on virus ********

Actual: ['PMC3294595', 'PMC3265445', 'PMC2797517', 'PMC2821766', 'PMC1779785', 'PMC2837245', 'PMC3585141', 'PMC2909313', 'PMC2770169', 'PMC1351169']
Predicted: ['PMC2837245', 'PMC3265445', 'PMC3294595', 'PMC2909313', 'PMC2770169', 'PMC2821766', 'PMC3585141', 'PMC2797517', 'PMC3339311', 'PMC1351169']
 
Binary Relevance Judgement for each predicted document in the pool: 

predicted_doc PMC2837245 is relevant
predicted_doc PMC3265445 is relevant
predicted_doc PMC3294595 is relevant
predicted_doc PMC2909313 is relevant
predicted_doc PMC2770169 is relevant
predicted_doc PMC2821766 is relevant
predicted_doc PMC3585141 is relevant
predicted_doc PMC2797517 is relevant
predi

In [None]:
# Document pool created for each query using IR1 and IR2 systems

In [29]:
# Results of system IR1, with every query column grouped under the top-level label 'A'
A = pd.DataFrame(IR1_dict)
A.columns = pd.MultiIndex.from_product([['A'], A.columns])

# Results of system IR2, grouped under the top-level label 'B'
B = pd.DataFrame(IR2_dict)
B.columns = pd.MultiIndex.from_product([['B'], B.columns])

# Place the two result tables side by side
pool = pd.concat([A, B], axis=1)
pool


Unnamed: 0_level_0,A,A,A,B,B,B
Unnamed: 0_level_1,Effect of environmental factors on virus,Covid-19 air travel safety,Transmission of virus in community,Effect of environmental factors on virus,Covid-19 air travel safety,Transmission of virus in community
0,PMC2837245,PMC3577649,PMC2851561,PMC2837245,PMC3577649,PMC2851561
1,PMC3265445,PMC3032737,PMC2893203,PMC3265445,PMC3032737,PMC3541974
2,PMC3294595,PMC2950238,PMC3227662,PMC3294595,PMC2950238,PMC3324376
3,PMC2909313,PMC3314701,PMC3509329,PMC2909313,PMC3314701,PMC2893203
4,PMC2770169,PMC2939898,PMC3324376,PMC2821766,PMC1764036,PMC3227662
5,PMC2821766,PMC2813231,PMC3266138,PMC3585141,PMC2912811,PMC3509329
6,PMC3585141,PMC1764036,PMC3484124,PMC2770169,PMC2939898,PMC2204055
7,PMC2797517,PMC2781002,PMC2206439,PMC1351169,PMC2813231,PMC3057078
8,PMC3339311,PMC3223866,PMC3086881,PMC2797517,PMC2823611,PMC3222642
9,PMC1351169,PMC2796493,PMC1876810,PMC2981509,PMC2796493,PMC3266138


In [30]:
# Distinct document ids from the hand-listed entries
# (presumably the documents pooled from both systems for the first query — TODO confirm)
{
    'PMC1351169', 'PMC2770169', 'PMC2797517', 'PMC2821766', 'PMC2837245',
    'PMC2909313', 'PMC2981509', 'PMC3265445', 'PMC3294595', 'PMC3585141',
}

{'PMC1351169',
 'PMC2770169',
 'PMC2797517',
 'PMC2821766',
 'PMC2837245',
 'PMC2909313',
 'PMC2981509',
 'PMC3265445',
 'PMC3294595',
 'PMC3585141'}

In [297]:
# Distinct document ids from the hand-listed entries
# (presumably the documents pooled from both systems for the second query — TODO confirm)
{
    'PMC2781002', 'PMC2813231', 'PMC2823611', 'PMC2939898', 'PMC2950238',
    'PMC3032737', 'PMC3223866', 'PMC3314701', 'PMC3577649',
}

{'PMC2781002',
 'PMC2813231',
 'PMC2823611',
 'PMC2939898',
 'PMC2950238',
 'PMC3032737',
 'PMC3223866',
 'PMC3314701',
 'PMC3577649'}

In [298]:
# Distinct document ids from the hand-listed entries
# (presumably the documents pooled from both systems for the third query — TODO confirm)
{
    'PMC2851561', 'PMC2893203', 'PMC3222642', 'PMC3227662',
    'PMC3266138', 'PMC3324376', 'PMC3484124',
}

{'PMC2851561',
 'PMC2893203',
 'PMC3222642',
 'PMC3227662',
 'PMC3266138',
 'PMC3324376',
 'PMC3484124'}