In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from tqdm import tqdm
from nltk.corpus import stopwords
from transformers import BertTokenizer, BertModel
import torch
import os
from sklearn.metrics.pairwise import cosine_similarity
# no lowercasing here, as we do it ourselves later.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=False)
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
tqdm.pandas()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vitaf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_json("../data/trec-medline.json", lines=True)

In [3]:
id_rows = df.iloc[::2].reset_index(drop=True)[["index"]]
id_rows["index"] = id_rows["index"].apply(lambda x: int(x["_id"]))
content_rows = df.iloc[1::2].reset_index(drop=True).drop(labels=["index"], axis=1)
combined_df = pd.concat([id_rows, content_rows], axis=1)

combined_df.head()

Unnamed: 0,index,AB,AD,CY,DA,DCOM,DP,EDAT,ID,IP,...,CON,CIN,RPF,RPI,SPIN,RIN,ROF,ORI,UOF,UIN
0,1,We present an evaluation of the accuracy and p...,Department of Molecular Biology and Skaggs Ins...,Netherlands,20011105.0,20020401.0,2001 Sep,2001/11/06 10:00,GM56879/GM/NIGMS,1,...,,,,,,,,,,
1,2,An analysis is presented of experimental versu...,"Department of Medical Biosciences, Medical Bio...",Netherlands,20011105.0,20020401.0,2001 Sep,2001/11/06 10:00,,1,...,,,,,,,,,,
2,3,The global fold of maltose binding protein in ...,Protein Engineering Network Center of Excellen...,Netherlands,20011105.0,20020401.0,2001 Sep,2001/11/06 10:00,,1,...,,,,,,,,,,
3,4,A general method is presented for magnetic fie...,"Molecular Structure Division, National Institu...",Netherlands,20011105.0,20020401.0,2001 Sep,2001/11/06 10:00,,1,...,,,,,,,,,,
4,5,The dependence between the anomeric carbon che...,"Department of Chemistry & Biochemistry, Univer...",Netherlands,20011105.0,20020401.0,2001 Sep,2001/11/06 10:00,,1,...,,,,,,,,,,


In [4]:
docs = combined_df[["index", "AB", "PMID"]]
docs = docs.astype({"index": int, "PMID": int})
docs.head(5)

Unnamed: 0,index,AB,PMID
0,1,We present an evaluation of the accuracy and p...,11693564
1,2,An analysis is presented of experimental versu...,11693565
2,3,The global fold of maltose binding protein in ...,11693566
3,4,A general method is presented for magnetic fie...,11693567
4,5,The dependence between the anomeric carbon che...,11693568


In [5]:
# load queries
queries = pd.DataFrame(columns=["index", "query"])

with open("../data/training-queries-simple.txt", "r") as f:
    lines = f.readlines()

data = []
for line in lines:
    x = line.strip().split("\t")
    if len(x) >= 2:  
        data.append({"index": int(x[0]), "query": x[1]})
    else:
        raise ValueError("wtf")
queries = pd.concat([queries, pd.DataFrame(data)], ignore_index=True)
queries.head(5)
print(queries.isna().sum())

index    0
query    0
dtype: int64


In [6]:
# drop missings
print(docs.isna().sum())
docs = docs.dropna()
def remove_short_strings(df, column_name):
    pattern = re.compile(r'\W+')
    filtered_df = df[df[column_name].apply(
        lambda x: isinstance(x, str) and len(pattern.sub('', x)) >= 20
    )].copy()
    filtered_df.reset_index(drop=True, inplace=True)
    
    return filtered_df
print(docs.shape[0])
docs = remove_short_strings(docs, "AB")
print(docs.shape[0])

index         0
AB       123568
PMID          0
dtype: int64
402369
401929


In [7]:
# find max words
max_words = docs['AB'].apply(lambda x: len(x.split())).max()
print(max_words)

1529


In [8]:
# load query results
query_res = pd.DataFrame(columns=["query_index", "doc_index", "relevant"])

with open("../data/training-qrels.txt", "r") as f:
    lines = f.readlines()

data = []
for line in lines:
    x = line.strip().split("\t")
    if len(x) >= 4:  
        data.append({"query_index": int(x[0]), "doc_index": int(x[2]), "relevant": int(x[3])})
    else:
        raise ValueError("wtf")
query_res = pd.concat([query_res, pd.DataFrame(data)], ignore_index=True)
print(query_res.head(5))
print(query_res.isna().sum())

  query_index doc_index relevant
0           1  11642719        1
1           1  11695244        1
2           1  11700040        1
3           1  11733969        1
4           1  11741909        1
query_index    0
doc_index      0
relevant       0
dtype: int64


In [9]:
# combine queries and results

filtered_df = query_res[query_res["relevant"] == 1]
grouped_df = filtered_df.groupby('query_index')['doc_index'].apply(list).reset_index()
grouped_df = grouped_df.rename(columns={'doc_index': 'relevant_docs'})
queries_training = pd.concat([queries, grouped_df], axis=1)
queries_training = queries_training.drop(columns=["query_index"])
queries_training.head(5)

Unnamed: 0,index,query,relevant_docs
0,1,"""cyclin-dependent kinase inhibitor 1A (p21, Ci...","[11642719, 11695244, 11700040, 11733969, 11741..."
1,2,"""DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide ...","[12101238, 12527917]"
2,3,ets variant gene 6 (TEL oncogene) in Homo sapiens,"[11731410, 11861293, 11861295, 12080468, 12091..."
3,4,fibroblast growth factor 7 (keratinocyte growt...,"[11937263, 11943656, 11973338, 12008951, 12016..."
4,5,"""glycine receptor, alpha 1 (startle disease/hy...","[11580237, 11781706, 11973623, 11981020, 11981..."


In [10]:
# inspect final datasetsts
queries_training.head(5)

Unnamed: 0,index,query,relevant_docs
0,1,"""cyclin-dependent kinase inhibitor 1A (p21, Ci...","[11642719, 11695244, 11700040, 11733969, 11741..."
1,2,"""DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide ...","[12101238, 12527917]"
2,3,ets variant gene 6 (TEL oncogene) in Homo sapiens,"[11731410, 11861293, 11861295, 12080468, 12091..."
3,4,fibroblast growth factor 7 (keratinocyte growt...,"[11937263, 11943656, 11973338, 12008951, 12016..."
4,5,"""glycine receptor, alpha 1 (startle disease/hy...","[11580237, 11781706, 11973623, 11981020, 11981..."


In [11]:
docs.head(5)

Unnamed: 0,index,AB,PMID
0,1,We present an evaluation of the accuracy and p...,11693564
1,2,An analysis is presented of experimental versu...,11693565
2,3,The global fold of maltose binding protein in ...,11693566
3,4,A general method is presented for magnetic fie...,11693567
4,5,The dependence between the anomeric carbon che...,11693568


In [12]:
# check if all doc ids from queries are in the dataset (after removing missings)
unique_relevant_docs = set(queries_training['relevant_docs'].explode())
existing_docs = unique_relevant_docs.intersection(docs.PMID)
missing_docs = unique_relevant_docs.difference(docs.PMID)

print(missing_docs)
print(f"Number of relevant docs: {len(unique_relevant_docs)}")
print(f"Number of existing docs in 'docs' DataFrame: {len(existing_docs)}")
print(f"Number of missing docs: {len(missing_docs)}")

{12147208, 12147209, 11861518, 11688978, 11822867, 11714840, 12027934, 11374883, 11406125, 11042116, 11717190, 11700040, 11781193, 11781706, 11564874, 11580237, 11882578, 11846485, 11642719, 11685227, 11466351, 11841916, 11752574, 11752575, 11779460, 11740559, 11727760, 12412576, 11686318, 11441070, 11809712, 11743158, 11701948, 11749055, 11842244, 11748297, 11733969, 11731410, 11741909, 11752172, 11751405, 12161015}
Number of relevant docs: 327
Number of existing docs in 'docs' DataFrame: 285
Number of missing docs: 42


In [13]:
# a smaller dataset
existing_docs = set(unique_relevant_docs).intersection(set(docs['PMID']))

In [14]:
# ofc there are missings relevant texts with no abstract. nice dataset:)
def filter_missing_docs(doc_list):
    return [doc for doc in doc_list if doc in existing_docs]
    
queries_training['relevant_docs'] = queries_training['relevant_docs'].apply(filter_missing_docs)
# remove quries with no docs

In [15]:
queries_training = queries_training[queries_training['relevant_docs'].apply(lambda x: len(x) > 0)]

In [16]:
queries_training = queries_training.drop(columns=["index"])
docs = docs.drop(columns=["index"])

In [17]:
def preprocess_text(text, remove_stopwords=True):
    copy = text
    copy = copy.lower()
    # remove punctuation
    copy = re.sub(r"[^\w\s]", '', copy)
    # remove double whitespaces.
    copy = re.sub(r'\s+', ' ', copy).strip()
    
    if remove_stopwords:
        copy = ' '.join(w for w in copy.split() if w not in stop_words)
    return copy

def tokenize_text(text):
    copy = tokenizer.tokenize(text)
    return copy

In [18]:
def precision_at_k(row, k=10):
    retrieved = row['retrieved_docs'][:k]
    relevant = set(row['relevant_docs'])
    if k == 0:
        return 0.0
    num_relevant_in_retrieved = len(set(retrieved) & relevant)
    precision = (num_relevant_in_retrieved / k) * 100
    return precision


In [19]:
model = BertModel.from_pretrained('bert-base-uncased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
def get_embeddings_words(tokens, window=512):
    window = window - 2
    embeddings = []
    for i in range(0, len(tokens), window):
        subset = tokens[i:i+window]
        subset = [tokenizer.cls_token] + subset + [tokenizer.sep_token]
        ids = tokenizer.convert_tokens_to_ids(subset)
        masks = [1] * len(ids)
        ids_tensor = torch.tensor([ids]).to(device)
        mask_tensor = torch.tensor([masks]).to(device)
        
        with torch.no_grad():
            outputs = model(ids_tensor, attention_mask=mask_tensor)
            token_embeddings = outputs[0].squeeze().cpu().numpy()
            embeddings.extend(token_embeddings)
        
    return embeddings

  state_dict = torch.load(resolved_archive_file, map_location='cpu')


In [20]:
# tf-idf embedding function
def get_embeddings_words_tfidf(tokens, doc_list):
    # this function is called on a dataframe, once for each doc/query in the dataframe. tokens is a list of 
    # all the tokens in that doc/query, passed from the dataframe for each doc. doc_list does not change, so 
    # it's not changed. doc_list is a dataframe this function is called on:))))), but not row-by-row.
    # I would personally look into tf-idf and precompute idf before this function, passing the idf dataframe
    # instead in place of doc_list, as idf is the same for the entire corspus. This would prevent a lot of
    # unnecessary computations.
    # tokens = list of tokens for one doc/query
    # return matrix of embeddings = a list of vectors, where each vector - embedding of a word in a sequence.
    # vectors should be of the same dimension for all input sequences, 
    # the number of vectors in the matrix list doens't matter.
    # e.g. [[1, 2, 3], [1, 2, 3]] returned for one set of tokens and [[1, 2, 3]] for another, but
    # [[1, 2, 3], [1, 2, 3]] and [[1, 2, 3, 4], [1, 2, 3, 4]] is bad. Each token in any input sequence
    # should get a vector feature of a fixed size. See the example of returned docs.
    # when doing tf-idf, please only use token column, do not use text column.
    # to speed up computing, you can move parts of tf-idf to a separate function and set it to a variable
    # (e.g. parts which are the same, like idf).
    return [[1, 2], [1, 2], [3, 2]]

In [21]:
# try word-level
# Select the subset of the docs present in the query answers.
selected_docs = docs[docs['PMID'].isin(existing_docs)]
selected_docs_copy = selected_docs.copy(deep=True)

# preprocess and tokenize docs
print("Preprocessing docs")
selected_docs_copy["tokens"] = selected_docs_copy["AB"].progress_apply(preprocess_text)
print("Tokenizing docs")
selected_docs_copy["tokens"] = selected_docs_copy["tokens"].progress_apply(tokenize_text)

# preprocess and tokenize queries
selected_queries = queries_training.copy(deep=True)
print("Preprocessing queries")
selected_queries["tokens"] = selected_queries["query"].progress_apply(preprocess_text)
print("Tokenizing queriess")
selected_queries["tokens"] = selected_queries["tokens"].progress_apply(tokenize_text)

Preprocessing docs


100%|██████████| 285/285 [00:00<00:00, 13335.75it/s]


Tokenizing docs


100%|██████████| 285/285 [00:00<00:00, 672.83it/s]


Preprocessing queries


100%|██████████| 47/47 [00:00<?, ?it/s]


Tokenizing queriess


100%|██████████| 47/47 [00:00<00:00, 8630.26it/s]


In [22]:
# copies for tf-idf
selected_docs_tfidf = selected_docs_copy.copy(deep=True)
selected_queries_tfidf = selected_queries.copy(deep=True)

In [23]:
# calculate embedding matrices using bert
selected_docs_copy["embeddings"] = selected_docs_copy["tokens"].progress_apply(get_embeddings_words)
selected_queries["embedding"] = selected_queries["tokens"].progress_apply(get_embeddings_words)
selected_queries.head(2)

100%|██████████| 285/285 [00:02<00:00, 121.96it/s]
100%|██████████| 47/47 [00:00<00:00, 141.54it/s]


Unnamed: 0,query,relevant_docs,tokens,embedding
0,"""cyclin-dependent kinase inhibitor 1A (p21, Ci...","[11695244, 11751903, 11756412, 11762751, 11872...","[cy, ##cl, ##ind, ##ep, ##end, ##ent, kinase, ...","[[-0.79326123, -0.11772958, -0.56752604, 0.037..."
1,"""DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide ...","[12101238, 12527917]","[dead, ##h, as, ##pg, ##lu, ##ala, ##as, ##phi...","[[-0.6105297, 0.026877305, -0.14515096, -0.001..."


In [24]:
# calculate embedding matrices using tf-idf
selected_docs_tfidf["embeddings"] = selected_docs_tfidf["tokens"].progress_apply(
    get_embeddings_words_tfidf, doc_list=selected_docs_tfidf)
selected_queries_tfidf["embedding"] = selected_queries_tfidf["tokens"].progress_apply(
    get_embeddings_words_tfidf, doc_list=selected_docs_tfidf)
selected_queries.head(2)

100%|██████████| 285/285 [00:00<00:00, 22700.33it/s]
100%|██████████| 47/47 [00:00<00:00, 46669.58it/s]


Unnamed: 0,query,relevant_docs,tokens,embedding
0,"""cyclin-dependent kinase inhibitor 1A (p21, Ci...","[11695244, 11751903, 11756412, 11762751, 11872...","[cy, ##cl, ##ind, ##ep, ##end, ##ent, kinase, ...","[[-0.79326123, -0.11772958, -0.56752604, 0.037..."
1,"""DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide ...","[12101238, 12527917]","[dead, ##h, as, ##pg, ##lu, ##ala, ##as, ##phi...","[[-0.6105297, 0.026877305, -0.14515096, -0.001..."


In [25]:
def retrieve_all_docs_df(query_embeddings, dataset, top_n=10):
    # Convert query embeddings to a NumPy array
    query_embeddings = np.array(query_embeddings).astype(np.float32)

    # Extract document embeddings and PMIDs
    vectors_list = dataset["embeddings"].values
    pmids = dataset["PMID"].values.astype(np.int64)

    # Initialize a list to store similarity scores
    scores = []

    # Iterate over each document's embeddings
    for doc_emb in vectors_list:
        # Convert document embeddings to a NumPy array
        doc_emb = np.array(doc_emb).astype(np.float32)
        
        # Compute the cosine similarity matrix between query and document embeddings
        cos_sim_matrix = cosine_similarity(query_embeddings, doc_emb)
        
        # For each query embedding, find the maximum similarity with document embeddings
        max_similarities = np.max(cos_sim_matrix, axis=1)
        
        # Average the maximum similarities to get a single score for the document
        score = np.mean(max_similarities)
        scores.append(score)
    
    # Create a DataFrame with 'doc_id' and 'similarity_score' columns
    df = pd.DataFrame({
        'doc_id': pmids,
        'similarity_score': scores
    })

    # Sort the DataFrame by 'similarity_score' in descending order
    df_sorted = df.sort_values(by='similarity_score', ascending=False).reset_index(drop=True)
    top_doc_ids = df_sorted['doc_id'].head(top_n).tolist()
    
    return top_doc_ids

In [26]:
# bert result
selected_queries["retrieved_docs"] = selected_queries["embedding"].progress_apply(retrieve_all_docs_df, 
                                                                                  dataset=selected_docs_copy)
selected_queries["precission_at_k"] = selected_queries.apply(precision_at_k, axis=1)
print(selected_queries["precission_at_k"].mean())

100%|██████████| 47/47 [00:07<00:00,  5.93it/s]

28.085106382978722





In [28]:
# tf-idf result
selected_queries_tfidf["retrieved_docs"] = selected_queries_tfidf["embedding"].progress_apply(retrieve_all_docs_df, 
                                                                                  dataset=selected_docs_tfidf)
selected_queries_tfidf["precission_at_k"] = selected_queries_tfidf.apply(precision_at_k, axis=1)
print(selected_queries_tfidf["precission_at_k"].mean())

100%|██████████| 47/47 [00:02<00:00, 19.71it/s]

2.127659574468085





In [31]:
# paired t-test
from scipy.stats import ttest_rel
from scipy import stats

bert_scores = selected_queries["precission_at_k"].to_numpy()
tfidf_scores = selected_queries_tfidf["precission_at_k"].to_numpy()

t_stat, p_two_tailed = ttest_rel(bert_scores, tfidf_scores)

p_one_tailed = stats.t.sf(t_stat, df=len(bert_scores - 1))

print(f"t-statistic: {t_stat}")
print(f"p-value (one-tailed): {p_one_tailed}")



t-statistic: 7.095797745078743
p-value (one-tailed): 2.9252322812446207e-09
