Load necessary packages

In [1]:
import pandas as pd
import numpy as np


Data loading and inspection

In [2]:
# Read the raw export; lines=True tells pandas that the file holds one JSON object per line.
df = pd.read_json(
    "../data/trec-medline.json",
    lines=True,
)


In [3]:
# List every column label, then preview the first five rows of the raw frame.
print(df.columns)
df.head(n=5)

Index(['index', 'AB', 'AD', 'CY', 'DA', 'DCOM', 'DP', 'EDAT', 'ID', 'IP', 'IS',
       'JID', 'LA', 'MHDA', 'PG', 'PMID', 'PST', 'PT', 'SB', 'SO', 'TA', 'TI',
       'UI', 'VI', 'MH', 'RN', 'FAU', 'LR', 'SI', 'RF', 'PS', 'EIN', 'CN',
       'AID', 'PHST', 'TT', 'CI', 'CON', 'CIN', 'RPF', 'RPI', 'SPIN', 'RIN',
       'ROF', 'ORI', 'UOF', 'UIN'],
      dtype='object')


Unnamed: 0,index,AB,AD,CY,DA,DCOM,DP,EDAT,ID,IP,...,CON,CIN,RPF,RPI,SPIN,RIN,ROF,ORI,UOF,UIN
0,{'_id': '1'},,,,,,,,,,...,,,,,,,,,,
1,,We present an evaluation of the accuracy and p...,Department of Molecular Biology and Skaggs Ins...,Netherlands,20011105.0,20020401.0,2001 Sep,2001/11/06 10:00,GM56879/GM/NIGMS,1.0,...,,,,,,,,,,
2,{'_id': '2'},,,,,,,,,,...,,,,,,,,,,
3,,An analysis is presented of experimental versu...,"Department of Medical Biosciences, Medical Bio...",Netherlands,20011105.0,20020401.0,2001 Sep,2001/11/06 10:00,,1.0,...,,,,,,,,,,
4,{'_id': '3'},,,,,,,,,,...,,,,,,,,,,


In [44]:
# The raw frame alternates between two kinds of rows: even positions carry only an
# {'_id': ...} dict in the "index" column, odd positions carry the remaining fields.
# Split the two kinds, renumber both from 0 and place them side by side.
id_rows = df.iloc[0::2][["index"]].reset_index(drop=True)
id_rows["index"] = id_rows["index"].map(lambda id_cell: int(id_cell["_id"]))

content_rows = df.iloc[1::2].drop(columns=["index"]).reset_index(drop=True)

# Positional pairing: row i of id_rows belongs to row i of content_rows.
combined_df = pd.concat([id_rows, content_rows], axis=1)

combined_df.head()

Unnamed: 0,index,AB,AD,CY,DA,DCOM,DP,EDAT,ID,IP,...,CON,CIN,RPF,RPI,SPIN,RIN,ROF,ORI,UOF,UIN
0,1,We present an evaluation of the accuracy and p...,Department of Molecular Biology and Skaggs Ins...,Netherlands,20011105.0,20020401.0,2001 Sep,2001/11/06 10:00,GM56879/GM/NIGMS,1,...,,,,,,,,,,
1,2,An analysis is presented of experimental versu...,"Department of Medical Biosciences, Medical Bio...",Netherlands,20011105.0,20020401.0,2001 Sep,2001/11/06 10:00,,1,...,,,,,,,,,,
2,3,The global fold of maltose binding protein in ...,Protein Engineering Network Center of Excellen...,Netherlands,20011105.0,20020401.0,2001 Sep,2001/11/06 10:00,,1,...,,,,,,,,,,
3,4,A general method is presented for magnetic fie...,"Molecular Structure Division, National Institu...",Netherlands,20011105.0,20020401.0,2001 Sep,2001/11/06 10:00,,1,...,,,,,,,,,,
4,5,The dependence between the anomeric carbon che...,"Department of Chemistry & Biochemistry, Univer...",Netherlands,20011105.0,20020401.0,2001 Sep,2001/11/06 10:00,,1,...,,,,,,,,,,


In [102]:
# Keep only the "index", "AB" and "PMID" columns and cast both id columns to int.
docs = combined_df[["index", "AB", "PMID"]].astype({"index": int, "PMID": int})
docs.head(n=5)

Unnamed: 0,index,AB,PMID
0,1,We present an evaluation of the accuracy and p...,11693564
1,2,An analysis is presented of experimental versu...,11693565
2,3,The global fold of maltose binding protein in ...,11693566
3,4,A general method is presented for magnetic fie...,11693567
4,5,The dependence between the anomeric carbon che...,11693568


In [103]:
# load queries
# Every line is expected to hold at least two tab-separated fields: the query id
# followed by the query text. Any further fields are ignored. The lines are split by
# hand so that the query text is taken over exactly as it appears in the file.
with open("../data/training-queries-simple.txt", "r") as f:
    lines = f.readlines()

data = []
for line_number, line in enumerate(lines, start=1):
    fields = line.strip().split("\t")
    if len(fields) < 2:
        # Fail loudly and say where: a malformed line would otherwise be hard to find.
        raise ValueError(
            f"training-queries-simple.txt, line {line_number}: expected at least "
            f"2 tab-separated fields, got {len(fields)}: {line!r}"
        )
    data.append({"index": int(fields[0]), "query": fields[1]})

# Build the frame in one step. Concatenating the records onto an empty, column-only
# frame can leave the integer id column with object dtype; constructing the frame
# directly lets pandas infer the dtypes from the records themselves.
queries = pd.DataFrame(data, columns=["index", "query"])
print(queries.isna().sum())

index    0
query    0
dtype: int64


In [104]:
# drop missings
# Report the number of missing values per column first, then keep only the rows
# in which every column has a value.
print(docs.isna().sum())
docs = docs.dropna(axis=0, how="any")

index         0
AB       123568
PMID          0
dtype: int64


In [105]:
# find max words
# Word count = number of whitespace-separated tokens in each "AB" entry.
words_per_abstract = docs["AB"].map(lambda abstract: len(abstract.split()))
max_words = words_per_abstract.max()
print(max_words)

1529


In [106]:
# load query results
# Every line is expected to hold at least four tab-separated fields. Fields 0, 2 and 3
# are read as query id, document id and relevance flag; field 1 is not used.
with open("../data/training-qrels.txt", "r") as f:
    lines = f.readlines()

data = []
for line_number, line in enumerate(lines, start=1):
    fields = line.strip().split("\t")
    if len(fields) < 4:
        # Fail loudly and say where: a malformed line would otherwise be hard to find.
        raise ValueError(
            f"training-qrels.txt, line {line_number}: expected at least "
            f"4 tab-separated fields, got {len(fields)}: {line!r}"
        )
    data.append(
        {
            "query_index": int(fields[0]),
            "doc_index": int(fields[2]),
            "relevant": int(fields[3]),
        }
    )

# Build the frame in one step. Concatenating the records onto an empty, column-only
# frame can leave the integer columns with object dtype; constructing the frame
# directly lets pandas infer integer dtypes from the records themselves.
query_res = pd.DataFrame(data, columns=["query_index", "doc_index", "relevant"])
print(query_res.head(5))
print(query_res.isna().sum())

  query_index doc_index relevant
0           1  11642719        1
1           1  11695244        1
2           1  11700040        1
3           1  11733969        1
4           1  11741909        1
query_index    0
doc_index      0
relevant       0
dtype: int64


In [107]:
# combine queries and results

# Collect, per query id, the list of document ids whose relevance flag is 1.
filtered_df = query_res[query_res["relevant"] == 1]
grouped_df = (
    filtered_df.groupby("query_index")["doc_index"]
    .apply(list)
    .reset_index()
    .rename(columns={"doc_index": "relevant_docs"})
)

# Join on the query id. Placing the two frames side by side by row position only
# lines up when both have exactly one row per query in the same order; a single
# query without relevant documents would shift every following list onto the
# wrong query without any error.
queries_training = queries.merge(
    grouped_df, how="left", left_on="index", right_on="query_index"
).drop(columns=["query_index"])

# Queries without any relevant document come out of the left join as NaN;
# store an empty list for them so that the column contains lists only.
queries_training["relevant_docs"] = queries_training["relevant_docs"].apply(
    lambda doc_ids: doc_ids if isinstance(doc_ids, list) else []
)
queries_training.head(5)

Unnamed: 0,index,query,relevant_docs
0,1,"""cyclin-dependent kinase inhibitor 1A (p21, Ci...","[11642719, 11695244, 11700040, 11733969, 11741..."
1,2,"""DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide ...","[12101238, 12527917]"
2,3,ets variant gene 6 (TEL oncogene) in Homo sapiens,"[11731410, 11861293, 11861295, 12080468, 12091..."
3,4,fibroblast growth factor 7 (keratinocyte growt...,"[11937263, 11943656, 11973338, 12008951, 12016..."
4,5,"""glycine receptor, alpha 1 (startle disease/hy...","[11580237, 11781706, 11973623, 11981020, 11981..."


In [108]:
# inspect final datasets: first five training queries with their relevant documents
queries_training.head(n=5)

Unnamed: 0,index,query,relevant_docs
0,1,"""cyclin-dependent kinase inhibitor 1A (p21, Ci...","[11642719, 11695244, 11700040, 11733969, 11741..."
1,2,"""DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide ...","[12101238, 12527917]"
2,3,ets variant gene 6 (TEL oncogene) in Homo sapiens,"[11731410, 11861293, 11861295, 12080468, 12091..."
3,4,fibroblast growth factor 7 (keratinocyte growt...,"[11937263, 11943656, 11973338, 12008951, 12016..."
4,5,"""glycine receptor, alpha 1 (startle disease/hy...","[11580237, 11781706, 11973623, 11981020, 11981..."


In [109]:
# first five documents that remain after dropping rows with missing values
docs.head(n=5)

Unnamed: 0,index,AB,PMID
0,1,We present an evaluation of the accuracy and p...,11693564
1,2,An analysis is presented of experimental versu...,11693565
2,3,The global fold of maltose binding protein in ...,11693566
3,4,A general method is presented for magnetic fie...,11693567
4,5,The dependence between the anomeric carbon che...,11693568


In [110]:
# Check whether every document id referenced by the queries is still present in
# `docs` (i.e. after the rows with missing values were removed).
unique_relevant_docs = set(queries_training["relevant_docs"].explode())
pmid_column = docs["PMID"]
existing_docs = unique_relevant_docs.intersection(pmid_column)
missing_docs = unique_relevant_docs.difference(pmid_column)

print(missing_docs)
for message in (
    f"Number of relevant docs: {len(unique_relevant_docs)}",
    f"Number of existing docs in 'docs' DataFrame: {len(existing_docs)}",
    f"Number of missing docs: {len(missing_docs)}",
):
    print(message)

{12147208, 11861518, 11688978, 11822867, 11714840, 12027934, 11374883, 11406125, 11042116, 11717190, 11700040, 11781193, 11781706, 11564874, 11580237, 11882578, 11846485, 11642719, 11685227, 11466351, 11752574, 11752575, 11779460, 11740559, 11727760, 12412576, 11686318, 11441070, 11809712, 11743158, 11701948, 11749055, 11842244, 11748297, 11733969, 11731410, 11741909, 11752172, 11751405, 12161015}
Number of relevant docs: 327
Number of existing docs in 'docs' DataFrame: 287
Number of missing docs: 40


In [114]:
# Some relevant documents are no longer in `docs` (their rows were dropped because of
# missing values), so their ids have to be removed from the relevance lists.
def filter_missing_docs(doc_list):
    """Return the entries of ``doc_list`` that are in ``existing_docs``.

    The order of ``doc_list`` is kept and repeated entries are not collapsed.
    ``existing_docs`` is read from the notebook's global namespace.
    """
    kept = []
    for doc in doc_list:
        if doc in existing_docs:
            kept.append(doc)
    return kept
    
# Strip the unavailable document ids from every query's relevance list.
queries_training["relevant_docs"] = queries_training["relevant_docs"].map(filter_missing_docs)
# remove queries with no docs (done in the next cell)


In [118]:
# Keep only the queries that still have at least one relevant document.
non_empty = queries_training["relevant_docs"].apply(lambda doc_ids: len(doc_ids) > 0)
queries_training = queries_training[non_empty]

In [120]:
# final results
# `queries_training` and `docs` are the two datasets produced by this notebook.
# The former self-assignments did nothing, so they are replaced by checks of the
# invariants established by the cells above, followed by an explicit preview.

# Every remaining query has at least one relevant document ...
assert queries_training["relevant_docs"].map(len).gt(0).all()
# ... and every referenced document id is present in `docs`.
assert set(queries_training["relevant_docs"].explode()) <= set(docs["PMID"])

queries_training.head(10)


Unnamed: 0,index,query,relevant_docs
0,1,"""cyclin-dependent kinase inhibitor 1A (p21, Ci...","[11695244, 11751903, 11756412, 11762751, 11872..."
1,2,"""DEAD/H (Asp-Glu-Ala-Asp/His) box polypeptide ...","[12101238, 12527917]"
2,3,ets variant gene 6 (TEL oncogene) in Homo sapiens,"[11861293, 11861295, 12080468, 12091359, 12127..."
3,4,fibroblast growth factor 7 (keratinocyte growt...,"[11937263, 11943656, 11973338, 12008951, 12016..."
4,5,"""glycine receptor, alpha 1 (startle disease/hy...","[11973623, 11981020, 11981021, 11994009, 12080..."
5,6,"""major histocompatibility complex, class II, D...","[11841486, 11914751, 11916169, 11953202, 11972..."
6,7,Janus kinase 2 (a protein tyrosine kinase) in ...,"[11923474, 11940567, 12106016, 12223098, 12351..."
7,8,luteinizing hormone/choriogonadotropin recepto...,"[11857565, 11943741, 12040016, 12088926, 12091..."
8,9,metallothionein 3 (growth inhibitory factor (n...,"[11849386, 12067712, 12111700, 12111700, 12538..."
9,10,protein C (inactivator of coagulation factors ...,"[11994010, 12029084, 12052963, 12063259, 12067..."
