In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import words
from sentence_transformers import SentenceTransformer
from sklearn.metrics import ndcg_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
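# One-time setup: the NLTK resources below must be downloaded before first use.
# Uncomment to fetch them (this also requires `import nltk`).
#nltk.download('words')
#nltk.download('stopwords')
#nltk.download('punkt')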

# Root folder of the datasets and sub-folder of the dataset used in this notebook.
# Both end with "/" because file paths are built by plain string concatenation.
data_dir="datasets/"
dataset_name_dir="trec-covid/"

In [2]:
# Load the documents and the queries; both files are read as JSON Lines
# (one JSON object per line).
corpus=pd.read_json(f"{data_dir}{dataset_name_dir}corpus.jsonl", lines=True)
queries=pd.read_json(f"{data_dir}{dataset_name_dir}queries.jsonl", lines=True)

In [3]:
# Summarise the corpus frame: columns, dtypes, non-null counts and memory use.
corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171332 entries, 0 to 171331
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   _id       171332 non-null  object
 1   title     171332 non-null  object
 2   text      171332 non-null  object
 3   metadata  171332 non-null  object
dtypes: object(4)
memory usage: 5.2+ MB


In [4]:
# Summarise the queries frame: columns, dtypes, non-null counts and memory use.
queries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   _id       50 non-null     int64 
 1   text      50 non-null     object
 2   metadata  50 non-null     object
dtypes: int64(1), object(2)
memory usage: 1.3+ KB


### Sparse representation

In [5]:
def compute_sparse_repr(vocab: np.array, corpus: pd.DataFrame, queries: pd.DataFrame):
    """Build bag-of-words representations of documents and queries over a fixed vocabulary.

    Documents are vectorised with ``TfidfVectorizer`` and queries with
    ``CountVectorizer``. Both vectorisers receive the same options: lower-casing,
    the given vocabulary, no stop-word removal, and ``\\w+`` as token pattern.

    Args:
        vocab: terms that make up the vocabulary shared by both vectorisers.
        corpus: frame whose "text" column holds the document texts.
        queries: frame whose "text" column holds the query texts.

    Returns:
        A ``(sparse_doc, sparse_q)`` tuple: the ``fit_transform`` output of the
        TF-IDF vectoriser on the documents and of the count vectoriser on the
        queries.
    """
    # One option set for both vectorisers so documents and queries are
    # tokenised and indexed identically.
    shared_options = dict(lowercase=True, vocabulary=vocab, stop_words=None, token_pattern=r'\w+')

    document_matrix = TfidfVectorizer(**shared_options).fit_transform(corpus["text"])
    query_matrix = CountVectorizer(**shared_options).fit_transform(queries["text"])

    return document_matrix, query_matrix

In [6]:
# Vocabulary: the lower-cased, de-duplicated NLTK word list.
vocab=np.unique(np.char.lower(words.words()))
sparse_doc, sparse_q=compute_sparse_repr(vocab, corpus, queries)

# sparse_score = <q_sparse, d_sparse> for every (query, document) pair.
# The `@` operator is used instead of np.dot: np.dot is not sparse-aware and only
# gives a matrix product when `*` on the operands happens to mean matrix
# multiplication, whereas `@` is always the matrix product.
# The result is densified and stored as one array of per-document scores per query.
queries["sparse_scores"]=list((sparse_q @ sparse_doc.transpose()).toarray())

### Dense representation

In [7]:
def compute_dense_repr(corpus: pd.DataFrame, queries: pd.DataFrame):
    """Embed the document and query texts with a sentence-embedding model.

    Args:
        corpus: frame whose "text" column holds the document texts.
        queries: frame whose "text" column holds the query texts.

    Returns:
        A ``(dense_c, dense_q)`` tuple: the outputs of ``encode`` (called with
        ``convert_to_numpy=True``) for the document texts and the query texts.
    """
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    # Pass plain lists rather than pandas Series: integer indexing on a Series
    # is label-based, so an encoder that indexes its input by position would
    # fail (or pick the wrong rows) whenever the frame has a non-default index.
    dense_c=model.encode(corpus["text"].tolist(), convert_to_numpy = True)
    dense_q=model.encode(queries["text"].tolist(), convert_to_numpy = True)

    return dense_c, dense_q

In [8]:
dense_c, dense_q=compute_dense_repr(corpus, queries)

# dense_score = <q_dense, d_dense>: the query embeddings are multiplied by the
# transposed document embeddings, and one row of per-document inner products
# is stored per query.
queries["dense_scores"]=list(dense_q.dot(dense_c.T))

In [9]:
# Preview the first two documents.
corpus.head(2)

Unnamed: 0,_id,title,text,metadata
0,ug7v899j,Clinical features of culture-proven Mycoplasma...,OBJECTIVE: This retrospective chart review des...,{'url': 'https://www.ncbi.nlm.nih.gov/pmc/arti...
1,02tnwd4m,Nitric oxide: a pro-inflammatory mediator in l...,Inflammatory diseases of the respiratory tract...,{'url': 'https://www.ncbi.nlm.nih.gov/pmc/arti...


In [10]:
# Preview the first two queries, now carrying the sparse_scores and dense_scores columns.
queries.head(2)

Unnamed: 0,_id,text,metadata,sparse_scores,dense_scores
0,1,what is the origin of COVID-19,"{'query': 'coronavirus origin', 'narrative': '...","[0.33793836903942276, 0.5684452940252585, 0.51...","[0.18737775, 0.05722889, 0.10594031, 0.0413222..."
1,2,how does the coronavirus respond to changes in...,{'query': 'coronavirus response to weather cha...,"[0.5107639433480153, 0.6254632143228629, 0.662...","[0.11220726, 0.05681237, 0.07420538, 0.0760865..."


In [11]:
# Checkpoint both frames to Parquet files in the working directory.
corpus.to_parquet(path="corpus_dump.parquet")
queries.to_parquet(path="queries_dump.parquet")

## Top-k retrieval (exact case)

In [12]:
# Reload both frames from the Parquet checkpoints.
corpus=pd.read_parquet(path="corpus_dump.parquet")
queries=pd.read_parquet(path="queries_dump.parquet")

In [15]:
# Number of documents kept per query.
k=5
# Combined score: row by row, the sparse and the dense score arrays are added.
queries["total_score_top_k"]=queries["sparse_scores"].add(queries["dense_scores"])

In [16]:
# Exact top-k: rank every document by the combined score and keep the k best.
# argsort returns positions, so document ids are looked up by position as well
# (label-based lookup would only be correct for a default 0..n-1 index).
corpus_ids=corpus["_id"].to_numpy()


def _exact_top_k(scores):
    """Return [(score, doc_id), ...] for the k highest scores, best first."""
    scores=np.asarray(scores)
    # Sort once (ascending), reverse, and cut to k; scores and ids share the
    # same positions, which keeps each pair consistent.
    best=np.argsort(scores)[::-1][:k]
    return list(zip(scores[best], corpus_ids[best]))


queries["exact_docs-score"]=queries["total_score_top_k"].apply(_exact_top_k)
queries["exact_docs"]=queries["exact_docs-score"].apply(lambda pairs: [doc_id for _, doc_id in pairs])

## Top k' retrieval (approximate case)

In [18]:
# Number of candidates kept per representation (set equal to k here).
k_prime=k

# Positions of the k' highest-scoring documents under each score; argsort is
# ascending, so these are its last k' entries.
queries["index_sparse_top_k_prime"]=queries["sparse_scores"].apply(lambda scores: np.argsort(scores)[-k_prime:])
queries["index_dense_top_k_prime"]=queries["dense_scores"].apply(lambda scores: np.argsort(scores)[-k_prime:])

# Candidate pool per query: both position sets joined, then sorted and de-duplicated.
queries["index_union_top_k_prime"]=queries.apply(
    lambda row: np.unique(
        np.concatenate((row["index_sparse_top_k_prime"], row["index_dense_top_k_prime"]))
    ),
    axis=1,
)

In [19]:
# Re-rank only the candidate pool: pair each candidate's combined score with its
# document id, sort the pairs in descending order and keep the first k.
queries["approx_docs-score"]=queries.apply(
    lambda row: sorted(
        (
            (row["total_score_top_k"][pos], corpus["_id"].iloc[pos])
            for pos in row["index_union_top_k_prime"]
        ),
        reverse=True,
    )[:k],
    axis=1,
)

# Split the (score, doc_id) pairs into two parallel columns.
queries["approx_docs"]=queries["approx_docs-score"].apply(lambda pairs: [doc_id for _, doc_id in pairs])
queries["approx_scores"]=queries["approx_docs-score"].apply(lambda pairs: [score for score, _ in pairs])

## Evaluations

In [20]:
# Per query: number of document ids shared by the exact and the approximate
# lists, divided by the length of the exact list.
queries["recall"]=[
    len(np.intersect1d(exact_ids, approx_ids, assume_unique=True))/len(exact_ids)
    for exact_ids, approx_ids in zip(queries["exact_docs"], queries["approx_docs"])
]
# Mean over all queries.
np.mean(queries["recall"])

0.5880000000000001

In [47]:
# Load the test qrels: a tab-separated file whose first line is the header row.
true_lab_df=pd.read_csv(f"{data_dir}{dataset_name_dir}qrels/test.tsv", sep='\t', header=0)

In [48]:
# Summarise the qrels frame: columns, dtypes, non-null counts and memory use.
true_lab_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66336 entries, 0 to 66335
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   query-id   66336 non-null  int64 
 1   corpus-id  66336 non-null  object
 2   score      66336 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.5+ MB


In [49]:
# Disabled nDCG evaluation draft: the code below sits inside a string literal and is never executed.
# NOTE(review): before re-enabling, check the filter on the `drop` line. `&` binds tighter than
# `==`, so it parses as `query-id == (row[0] & ~(...))`; the first comparison needs parentheses.
# NOTE(review): scikit-learn's signature is ndcg_score(y_true, y_score); here the predicted
# scores are passed first and the qrels scores second. Confirm the intended order.
""" for row in queries.itertuples(index=False):
    for approx_doc in row.approx_docs:
        true_lab_df=true_lab_df.drop(true_lab_df[true_lab_df["query-id"]==row[0] & ~(true_lab_df["corpus-id"]==approx_doc)].index)

for row in queries.itertuples(index=False):
    scores=true_lab_df[true_lab_df["query-id"]==row[0]]["score"][:k]
    if len(scores)>0:
        print(ndcg_score([row.approx_scores], [scores], k=k)) """

In [None]:
# Scratch inspection expressions kept as an inert string literal; none of them is executed.
""" queries["recall"].unique()
queries['exact_docs'][0]
queries["approx_docs"][0]
queries['exact_docs-score'][0]
queries["approx_docs-score"][0]
queries.explode("approx_docs").shape
queries["recall"][0] """