In [1]:
from re import L
from sklearn import metrics
import rank_metric as metrics
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import TfidfVectorizer
import xml.etree.ElementTree as ET
import matplotlib as plt
import pandas as pd
import numpy as np
import pprint as pp
import tarfile
from sklearn.feature_extraction.text import CountVectorizer
import math
import plotly.express as px

In [4]:
#parsing real patient cases
import pickle
import os
trials = "data/qrels-clinical_trials.txt"
trials = pd.read_csv(trials, sep='\t', names=["query_id", "dummy", "docid", "rel"])
trials = trials.drop("dummy", axis = 1)
trials_to_use = set(trials['docid'])

#parsing clinical trials
tar = tarfile.open("data/clinicaltrials.gov-16_dec_2015.tgz", "r:gz")
clinical_trials = {}
docs = []
ids = []
i = 0
for element in tar:
    if element.size > 500:
        txt = tar.extractfile(element).read().decode("utf-8", "strict")
        root = ET.fromstring(txt)
        for doc_id in root.iter('nct_id'):
            temp_id = doc_id.text
        if temp_id not in trials_to_use:
            continue
        i+=1
        for brief_title in root.iter('brief_title'):
            docs.append(brief_title.text)
            ids.append(temp_id)
            clinical_trials[temp_id] = brief_title.text
print(f"Number of trials to use: {i}")

pickle.dump(docs, open( "data/documents.bin", "wb" ) )
pickle.dump(ids, open( "data/doc_ids.bin", "wb" ) )


Number of trials to use: 3626


In [5]:
Queries = "data/topics-2014_2015-summary.topics"
with open(Queries, 'r') as queries_reader:
    txt = queries_reader.read()
root = ET.fromstring(txt)
cases = {}
for q in root.iter('TOP'):
    q_number = q.find("NUM").text
    q_title = q.find('TITLE').text
    cases[q_number] = q_title

## Class for functions used by Language Model with Jelineck-Mercer smooting and Vector Space Model

In [6]:
class NLP:
    def __init__(self, eval_table: pd.DataFrame, cases: dict, ngram_range, docs, ids):
        # self.docs = documents
        self.eval_table = eval_table
        self.docs = docs
        self.ids = ids
        self.cases = cases
        self.vsm = TfidfVectorizer(ngram_range=ngram_range, analyzer='word', stop_words = None)
        self.X = self.vsm.fit_transform(docs)
        self.vectorizer = CountVectorizer(ngram_range=(1,1), analyzer='word', stop_words = None)
        self.Y=self.vectorizer.fit_transform(docs)

    def test_vsm(self, query):
        tfidf = self.vsm.transform([query])
        score = 1 - pairwise_distances(self.X, tfidf, metric='cosine')
        results = pd.DataFrame(list(zip(self.ids, score)), columns=['_id', 'score'])
        ordered_results = results.sort_values(by=['score'], ascending=False).reset_index(drop=True)
        return ordered_results

    def eval(self, result: pd.DataFrame, id):
        def_sim = self.eval_table[self.eval_table["query_id"] == int(id)]
        def_sim = def_sim[def_sim["rel"] != 0]
        rel_ids = def_sim["docid"]
        num_of_rel = def_sim["rel"].count()
        num_of_results = result["_id"].count()

        if num_of_rel == 0:
            return [0, 0, 0, 0, 0]

        #precision@10
        top10 = result['_id'][:10]
        p10 = np.intersect1d(top10, rel_ids).size / 10

        #recall
        recall = np.size(np.intersect1d(result["_id"][:100], rel_ids)) / num_of_rel
        rel_res_vector = np.zeros((num_of_results,))
        for index, row in def_sim.iterrows():
            rel_res_vector = rel_res_vector + ((result['_id'] == row.docid)*row.rel).to_numpy()
        #ndcg5
        ndcg5 = metrics.ndcg_at_k(r = rel_res_vector, k = 5, method = 1)
        ap = metrics.average_precision(rel_res_vector, num_of_rel)
        mrr = metrics.mean_reciprocal_rank(rel_res_vector)

        return [p10, 1 - recall, ndcg5, ap, mrr]

    def evalPR(self, scores, query_id):

        aux = self.eval_table.loc[self.eval_table['query_id'] == int(query_id)]
        idx_rel_docs = aux.loc[aux['rel'] != (0)]

        [dummyA, rank_rel, dummyB] = np.intersect1d(scores['_id'], idx_rel_docs['docid'], return_indices=True)
        rank_rel = np.sort(rank_rel) + 1
        total_relv_ret = rank_rel.shape[0]
        if total_relv_ret == 0:
            return [np.zeros(11, ), [], total_relv_ret]

        recall = np.arange(1, total_relv_ret + 1) / idx_rel_docs.shape[0]
        precision = np.arange(1, total_relv_ret + 1) / rank_rel

        precision_interpolated = np.maximum.accumulate(precision)
        recall_11point = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        precision_11point = np.interp(recall_11point, recall, precision)

        if False:
            print(total_relv_ret)
            print(rank_rel)
            print(recall)
            print(precision)
            plt.plot(recall, precision, color='b', alpha=1)  # Raw precision-recall
            plt.plot(recall, precision_interpolated, color='r', alpha=1)  # Interpolated precision-recall
            plt.plot(recall_11point, precision_11point, color='g', alpha=1)  # 11-point interpolated precision-recall

        return [precision_11point, recall_11point, total_relv_ret]


    def words_and_indexes_association(self,query):
        q_words=[]
        word_idx=[]
        all_words=self.vectorizer.get_feature_names_out()
        for x in range(len(query)):
            for y in range(len(all_words)):
                if query[x] == all_words[y]:
                    q_words.append(query[x])
                    word_idx.append(y)
                    break
        return word_idx

    def JMS(self):
        x_array=self.Y.toarray()
        x_array=np.array(x_array)
        #sum of rows
        p1=np.sum(x_array,axis=0)
        #sum of cols
        p2=np.sum(x_array,axis=1)
        #number of words in whole corpus
        p3=np.sum(x_array)
        #probability of term in corpus
        matrix_corpus=np.divide(p1,p3)
        #probability of term in document
        p2=np.reshape(p2,(p2.size,1))
        matrix_documents=np.divide(x_array,p2)  
        return matrix_corpus,matrix_documents

    def query_formating(self,query):
        query=query[:-11]
        for y in query:
                if y in ".,":
                    query=query.replace(y,'')
                
        return query 

    def prob_with_JMS(self,index_word,index_doc,lamb,matrix_corp,matrix_doc):
        prob=0
        for x in index_word:
            p_md=matrix_doc[index_doc][x]
            p_mc=matrix_corp[x]
            prob+=math.log(lamb*p_md+(1-lamb)*p_mc)
        return prob

    def JSM_test(self,query,lamb,matrix_corpus,matrix_documents):
        scores_now=[]
        q=query.split()
        word_idx=self.words_and_indexes_association(q)

        for x in range(len(self.ids)):
            prob=self.prob_with_JMS(word_idx,x,lamb,matrix_corpus,matrix_documents)
            scores_now.append(prob)
        results = pd.DataFrame(list(zip(ids, scores_now)), columns=['_id', 'score'])
        ordered_results = results.sort_values(by=['score'], ascending=False).reset_index(drop=True)
        return ordered_results
    



# Dataset split

In [7]:
import random
ids_split = list(cases.keys())
random.shuffle(ids_split)
ids_training = ids_split[:math.ceil(0.8*len(ids_split))]
ids_test = ids_split[math.ceil(0.8*len(ids_split)):]
cases_training = {}
cases_test = {}
for id in ids_training:
    cases_training[id] = cases[id]
for id in ids_test:
    cases_test[id] = cases[id]

# Vector Space Model with different range of n-grams

In [8]:
records = []
statistics = {}
for ngram in [(1,1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3)]:
    myNLP = NLP(trials, cases_training, ngram, docs, ids)
    prec = np.zeros(11,)
    test = np.zeros(5,)
    for x in myNLP.cases.items():
        results = myNLP.test_vsm(x[1])
        p10, recall, ndcg5, ap, mrr = myNLP.eval(results, x[0])
        test = test + np.array([p10, ndcg5, mrr, ap, recall])
        prec_11point, recal_11point, tot_relv_ret = myNLP.evalPR(results, x[0])
        prec = prec + prec_11point
        
    prec = prec / len(cases.keys())
    test = test / len(cases.keys())
    for p, r in zip(prec, recal_11point):
        records.append((ngram, p, r))
    statistics[ngram] = test

df = pd.DataFrame.from_records(records, columns=["ngram", "precision_11", "recall_11"])
stats = pd.DataFrame.from_dict(statistics, orient="index", columns=["p10", "ndcg5", "mrr", "ap", "recall"])


In [9]:
stats

Unnamed: 0,p10,ndcg5,mrr,ap,recall
"(1, 1)",0.06,0.050604,0.004311,0.037722,0.619257
"(1, 2)",0.05,0.045838,0.004311,0.035708,0.626312
"(2, 2)",0.031667,0.037646,0.004311,0.019977,0.727448
"(1, 3)",0.048333,0.042775,0.004311,0.033755,0.623876
"(2, 3)",0.031667,0.035369,0.004311,0.019659,0.726547
"(3, 3)",0.006667,0.010705,0.004311,0.008469,0.750652


In [10]:
fig = px.line(df, x="recall_11", y="precision_11", color='ngram')
fig.show()

# Jelineck-Mercer smoothing with different lambdas(smoothing variables)


In [11]:
myNLP = NLP(trials, cases_training, (1, 1), docs, ids)
records_jms = []
statistics_jms = {}
matrix_corpus,matrix_documents=myNLP.JMS()
for lamb in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99]:
    prec = np.zeros(11,)
    test = np.zeros(5,)
    

    for x in myNLP.cases.items():
        query=myNLP.query_formating(x[1])
        results=myNLP.JSM_test(query,lamb,matrix_corpus,matrix_documents)
        p10,recall,ndcg5,ap,mrr=myNLP.eval(results,x[0])
        test = test + np.array([p10, ndcg5, mrr, ap, recall])
        prec_11point, recal_11point, tot_relv_ret = myNLP.evalPR(results, x[0])
        prec = prec + prec_11point
    
    prec = prec / len(cases.keys())
    test = test / len(cases.keys())
    for p, r in zip(prec, recal_11point):
        records_jms.append((lamb, p, r))
    statistics_jms[lamb] = test
df_jms = pd.DataFrame.from_records(records_jms, columns=["lambda", "precision_11", "recall_11"])
stats = pd.DataFrame.from_dict(statistics_jms, orient="index", columns=["p10", "ndcg5", "mrr", "ap", "recall"])

In [12]:
stats

Unnamed: 0,p10,ndcg5,mrr,ap,recall
0.0,0.001667,0.005653,0.004311,0.006251,0.753521
0.1,0.06,0.050654,0.004311,0.041675,0.614117
0.2,0.058333,0.052337,0.004311,0.041529,0.611727
0.3,0.056667,0.052342,0.004311,0.040904,0.61452
0.4,0.058333,0.055916,0.004311,0.040221,0.619632
0.5,0.056667,0.053753,0.004311,0.039135,0.621924
0.6,0.053333,0.049373,0.004311,0.037526,0.624772
0.7,0.053333,0.049307,0.004311,0.036721,0.628207
0.8,0.051667,0.048195,0.004311,0.035847,0.638779
0.9,0.041667,0.046643,0.004311,0.032979,0.656997


In [13]:
fig = px.line(df_jms, x="recall_11", y="precision_11", color='lambda')
fig.show()

In [14]:
myNLP = NLP(trials, cases_training, (1, 1), docs, ids)
records_jms = []
statistics_jms = {}
matrix_corpus,matrix_documents=myNLP.JMS()
lamb = 0
for i in range(30):
    lamb += 0.01
    lamb = round(lamb, 2)
    prec = np.zeros(11,)
    test = np.zeros(5,)
    

    for x in myNLP.cases.items():
        query=myNLP.query_formating(x[1])
        results=myNLP.JSM_test(query,lamb,matrix_corpus,matrix_documents)
        p10,recall,ndcg5,ap,mrr=myNLP.eval(results,x[0])
        test = test + np.array([p10, ndcg5, mrr, ap, recall])
        prec_11point, recal_11point, tot_relv_ret = myNLP.evalPR(results, x[0])
        prec = prec + prec_11point
    
    prec = prec / len(cases.keys())
    test = test / len(cases.keys())
    for p, r in zip(prec, recal_11point):
        records_jms.append((lamb, p, r))
    statistics_jms[lamb] = test
df_jms = pd.DataFrame.from_records(records_jms, columns=["lambda", "precision_11", "recall_11"])
stats = pd.DataFrame.from_dict(statistics_jms, orient="index", columns=["p10", "ndcg5", "mrr", "ap", "recall"])

In [15]:
stats

Unnamed: 0,p10,ndcg5,mrr,ap,recall
0.01,0.05,0.043693,0.004311,0.038414,0.609927
0.02,0.053333,0.04392,0.004311,0.039411,0.611453
0.03,0.053333,0.048275,0.004311,0.04055,0.610276
0.04,0.053333,0.046815,0.004311,0.040943,0.609879
0.05,0.055,0.047407,0.004311,0.041587,0.611379
0.06,0.051667,0.046897,0.004311,0.041603,0.609393
0.07,0.056667,0.050341,0.004311,0.042199,0.610384
0.08,0.058333,0.052141,0.004311,0.042257,0.611297
0.09,0.06,0.051771,0.004311,0.042313,0.614117
0.1,0.06,0.050654,0.004311,0.041675,0.614117


In [16]:
# fig = px.line(df_jms, x="recall_11", y="precision_11", color='lambda')
# fig.show()

# ngram_range = (1,1)
# Lambda = 0.12

## Testing

In [17]:
records = []
statistics = {}
for ngram in [(1,1)]:
    myNLP = NLP(trials, cases_test, ngram, docs, ids)
    prec = np.zeros(11,)
    test = np.zeros(5,)
    for x in myNLP.cases.items():
        results = myNLP.test_vsm(x[1])
        p10, recall, ndcg5, ap, mrr = myNLP.eval(results, x[0])
        test = test + np.array([p10, ndcg5, mrr, ap, recall])
        prec_11point, recal_11point, tot_relv_ret = myNLP.evalPR(results, x[0])
        prec = prec + prec_11point
        
    prec = prec / len(cases.keys())
    test = test / len(cases.keys())
    for p, r in zip(prec, recal_11point):
        records.append((ngram, p, r))
    statistics[ngram] = test

df = pd.DataFrame.from_records(records, columns=["ngram", "precision_11", "recall_11"])
stats = pd.DataFrame.from_dict(statistics, orient="index", columns=["p10", "ndcg5", "mrr", "ap", "recall"])


In [18]:
stats

Unnamed: 0,p10,ndcg5,mrr,ap,recall
"(1, 1)",0.008333,0.012273,0.000772,0.005494,0.174859


In [19]:
fig = px.line(df, x="recall_11", y="precision_11", color='ngram')
fig.show()

In [20]:
myNLP = NLP(trials, cases_test, (1, 1), docs, ids)
# myNLP = NLP(trials, cases, (1, 1), docs, ids)
records_jms = []
statistics_jms = {}
matrix_corpus,matrix_documents=myNLP.JMS()
for lamb in [0.12]:
    prec = np.zeros(11,)
    test = np.zeros(5,)
    

    for x in myNLP.cases.items():
        query=myNLP.query_formating(x[1])
        results=myNLP.JSM_test(query,lamb,matrix_corpus,matrix_documents)
        p10,recall,ndcg5,ap,mrr=myNLP.eval(results,x[0])
        test = test + np.array([p10, ndcg5, mrr, ap, recall])
        prec_11point, recal_11point, tot_relv_ret = myNLP.evalPR(results, x[0])
        prec = prec + prec_11point
    
    prec = prec / len(cases.keys())
    test = test / len(cases.keys())
    for p, r in zip(prec, recal_11point):
        records_jms.append((lamb, p, r))
    statistics_jms[lamb] = test
df_jms = pd.DataFrame.from_records(records_jms, columns=["lambda", "precision_11", "recall_11"])
stats = pd.DataFrame.from_dict(statistics_jms, orient="index", columns=["p10", "ndcg5", "mrr", "ap", "recall"])

In [21]:
stats

Unnamed: 0,p10,ndcg5,mrr,ap,recall
0.12,0.008333,0.008546,0.000772,0.004503,0.174859


In [22]:
fig = px.line(df_jms, x="recall_11", y="precision_11", color='lambda')
fig.show()