In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import words
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
#nltk.download('words')
#nltk.download('stopwords')
#nltk.download('punkt')

data_dir="datasets/"
dataset_name_dir="nfcorpus/"
#pd.read_csv(data_dir+dataset_name_dir+qrels/test.tsv", sep='\t', header=0)

In [2]:
corpus=pd.read_json(data_dir+dataset_name_dir+"corpus.jsonl", lines=True)
queries=pd.read_json(data_dir+dataset_name_dir+"queries.jsonl", lines=True)

In [3]:
corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3633 entries, 0 to 3632
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   _id       3633 non-null   object
 1   title     3633 non-null   object
 2   text      3633 non-null   object
 3   metadata  3633 non-null   object
dtypes: object(4)
memory usage: 113.7+ KB


In [4]:
queries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3237 entries, 0 to 3236
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   _id       3237 non-null   object
 1   text      3237 non-null   object
 2   metadata  3237 non-null   object
dtypes: object(3)
memory usage: 76.0+ KB


### Sparse representation

In [5]:
def compute_sparse_repr(vocab: np.array, corpus: pd.DataFrame, queries: pd.DataFrame):
    doc_tfidf=TfidfVectorizer(lowercase=True, vocabulary=vocab, stop_words=None, token_pattern=r'\w+')
    q_counter=CountVectorizer(lowercase=True, vocabulary=vocab, stop_words=None, token_pattern=r'\w+')

    sparse_doc=doc_tfidf.fit_transform(corpus["text"])
    sparse_q=q_counter.fit_transform(queries["text"])

    return sparse_doc, sparse_q

In [6]:
vocab=np.unique(np.char.lower(words.words()))
sparse_doc, sparse_q=compute_sparse_repr(vocab, corpus, queries)

#Here it's basically computed sparse_score=<q_sparse, d_sparse>
queries["sparse_scores"]=list(np.dot(sparse_q, sparse_doc.transpose()).toarray())

### Dense representation

In [7]:
def compute_dense_repr(corpus: pd.DataFrame, queries: pd.DataFrame):
    transformers = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    dense_c=transformers.encode(corpus["text"], convert_to_numpy = True)
    dense_q=transformers.encode(queries["text"], convert_to_numpy = True)
    
    return dense_c, dense_q

In [8]:
dense_c, dense_q=compute_dense_repr(corpus, queries)

#Here it's basically computed dense_score=<q_dense, d_dense>
queries["dense_scores"]=list(np.dot(dense_q, dense_c.transpose()))

In [9]:
corpus.head(2)

Unnamed: 0,_id,title,text,metadata
0,MED-10,Statin Use and Breast Cancer Survival: A Natio...,"Recent studies have suggested that statins, an...",{'url': 'http://www.ncbi.nlm.nih.gov/pubmed/25...
1,MED-14,Statin use after diagnosis of breast cancer an...,BACKGROUND: Preclinical studies have shown tha...,{'url': 'http://www.ncbi.nlm.nih.gov/pubmed/25...


In [10]:
queries.head(2)

Unnamed: 0,_id,text,metadata,sparse_scores,dense_scores
0,PLAIN-3,Breast Cancer Cells Feed on Cholesterol,{'url': 'http://nutritionfacts.org/2015/07/14/...,"[0.7687392942920818, 1.0108225877486743, 0.0, ...","[0.34855187, 0.37134382, 0.21230528, -0.013602..."
1,PLAIN-4,Using Diet to Treat Asthma and Eczema,{'url': 'http://nutritionfacts.org/2015/07/09/...,"[0.13288792638980879, 0.19449192345801758, 0.2...","[0.09630603, 0.067714095, 0.21574868, 0.107180..."


## Top k retrieval

In [11]:
k=5
queries["total_score_top_k"]=queries["sparse_scores"]+queries["dense_scores"]

In [12]:
queries["ground_truth_docs-score"]=queries.apply(lambda x: (np.sort(x["total_score_top_k"])[-k:], corpus.loc[np.argsort(x["total_score_top_k"])[-k:], "_id"].values), axis=1)
queries["ground_truth_docs"]=queries["ground_truth_docs-score"].apply(lambda x: x[1])
queries["ground_truth_docs"]=queries["ground_truth_docs"].apply(set)

## Top k' retrieval (approximate case)

In [13]:
k_prime=50

queries["index_sparse_top_k_prime"]=queries["sparse_scores"].apply(lambda x: np.argsort(x)[-k_prime:])
queries["index_dense_top_k_prime"]=queries["dense_scores"].apply(lambda x: np.argsort(x)[-k_prime:])

In [14]:
queries["index_union_top_k_prime"]=queries.apply(lambda x: np.concatenate((x["index_sparse_top_k_prime"], x["index_dense_top_k_prime"])), axis=1)
queries["index_union_top_k_prime"]=queries["index_union_top_k_prime"].apply(np.unique)

In [15]:
queries["approx_docs-score"]=queries.apply(
    lambda x: (np.sort(x["total_score_top_k"][x["index_union_top_k_prime"]])[-k:], 
               corpus.loc[np.argsort(x["total_score_top_k"]), "_id"].values[x["index_union_top_k_prime"]][-k:]), axis=1)

queries["approx_docs"]=queries["approx_docs-score"].apply(lambda x: x[1])
queries["approx_docs"]=queries["approx_docs"].apply(set)

In [16]:
queries["index_union_top_k_prime"][0]

array([   0,    1,   82,  530,  661,  886,  887,  890,  938,  943, 1103,
       1104, 1130, 1144, 1237, 1363, 1369, 1371, 1373, 1374, 1377, 1378,
       1379, 1380, 1382, 1383, 1433, 1514, 1596, 1786, 1787, 1795, 1798,
       1864, 1980, 2078, 2200, 2202, 2207, 2276, 2284, 2294, 2314, 2315,
       2317, 2318, 2319, 2323, 2324, 2327, 2333, 2336, 2338, 2473, 2495,
       2501, 2503, 2518, 2560, 2605, 2614, 2769, 2772, 2776, 2789, 2790,
       2912, 2933, 2940, 2961, 2963, 2977, 2997, 3029, 3035, 3057, 3097,
       3166, 3255, 3319, 3361, 3366, 3429, 3431, 3581, 3591, 3592],
      dtype=int64)

In [30]:

corpus.loc[np.argsort(queries["total_score_top_k"][0]), "_id"][queries["index_union_top_k_prime"][0]][queries["index_union_top_k_prime"][0]]

0         MED-10
1         MED-14
82       MED-838
530     MED-1414
661     MED-1564
          ...   
3429    MED-5182
3431    MED-5184
3581    MED-5341
3591    MED-5351
3592    MED-5352
Name: _id, Length: 87, dtype: object

In [18]:
corpus[corpus["_id"]=="MED-2439"]

Unnamed: 0,_id,title,text,metadata
1382,MED-2439,Plant Sterols as Anticancer Nutrients: Evidenc...,While many factors are involved in the etiolog...,{'url': 'http://www.ncbi.nlm.nih.gov/pubmed/23...


In [19]:
queries["index_union_top_k_prime"][0]

array([   0,    1,   82,  530,  661,  886,  887,  890,  938,  943, 1103,
       1104, 1130, 1144, 1237, 1363, 1369, 1371, 1373, 1374, 1377, 1378,
       1379, 1380, 1382, 1383, 1433, 1514, 1596, 1786, 1787, 1795, 1798,
       1864, 1980, 2078, 2200, 2202, 2207, 2276, 2284, 2294, 2314, 2315,
       2317, 2318, 2319, 2323, 2324, 2327, 2333, 2336, 2338, 2473, 2495,
       2501, 2503, 2518, 2560, 2605, 2614, 2769, 2772, 2776, 2789, 2790,
       2912, 2933, 2940, 2961, 2963, 2977, 2997, 3029, 3035, 3057, 3097,
       3166, 3255, 3319, 3361, 3366, 3429, 3431, 3581, 3591, 3592],
      dtype=int64)

## Evaluations

In [20]:
queries["recall"]=queries.apply(lambda x: len(x['ground_truth_docs'].intersection(x["approx_docs"]))/len(x['ground_truth_docs']), axis=1)
np.mean(queries["recall"])

0.08168056842755637

In [21]:
np.unique(queries["recall"])

array([0. , 0.2, 0.4, 0.6])

In [22]:
queries['ground_truth_docs'][0]

{'MED-14', 'MED-2103', 'MED-2437', 'MED-2439', 'MED-3551'}

In [23]:
queries["approx_docs"][0]

{'MED-1414', 'MED-1826', 'MED-2153', 'MED-3447', 'MED-4714'}

In [24]:
queries['ground_truth_docs-score'][0]

(array([1.32654511, 1.33828799, 1.38216641, 1.3916565 , 1.42346402]),
 array(['MED-2439', 'MED-2103', 'MED-14', 'MED-2437', 'MED-3551'],
       dtype=object))

In [25]:
queries["approx_docs-score"][0]

(array([1.32654511, 1.33828799, 1.38216641, 1.3916565 , 1.42346402]),
 array(['MED-2153', 'MED-1414', 'MED-3447', 'MED-4714', 'MED-1826'],
       dtype=object))