In [65]:
import pandas as pd
import numpy  as np

from scipy   import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

import re
import nltk
from nltk import *
nltk.download('stopwords')

from adarank import AdaRank

[nltk_data] Downloading package stopwords to /home/lucas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [66]:
def process(text):
    
    if pd.isna(text):
        return ' - '
        
    text = text.lower() + ' '

    text = text.replace('/' , ' / ')
    text = text.replace('.' , ' . ')
    text = text.replace(',' , ' , ')
    text = text.replace('-' , ' - ')
    text = text.replace('^' , ' ^ ')

    text = re.sub(r'[  ]', r' ', text)
    
    text = text.replace('ser ' , 'serum ')
    text = text.replace('plas ', 'plasma ')
    text = text.replace('bld ' , 'blood ')
    text = text.replace('fld ' , 'fluid ')
    text = text.replace('synv ', 'synovial ')
    text = text.replace('plr ' , 'pleural ')
    text = text.replace('bpu ' , 'blood product unit ')

    text = [word for word in text.split() if word not in corpus.stopwords.words('english')]
    return ' '.join([word for word in text])

In [67]:
df = pd.read_excel("data_original.xlsx", header=2)
print(df.shape)
df.head(100)

(67, 5)


Unnamed: 0,loinc_num,long_common_name,component,system,property
0,1988-5,C reactive protein [Mass/volume] in Serum or P...,C reactive protein,Ser/Plas,MCnc
1,1959-6,Bicarbonate [Moles/volume] in Blood,Bicarbonate,Bld,SCnc
2,10331-7,Rh [Type] in Blood,Rh,Bld,Type
3,18998-5,Trimethoprim+Sulfamethoxazole [Susceptibility],Trimethoprim+Sulfamethoxazole,Isolate,Susc
4,1975-2,Bilirubin.total [Mass/volume] in Serum or Plasma,Bilirubin,Ser/Plas,MCnc
...,...,...,...,...,...
62,54439-5,Calcium bilirubinate/Total in Stone,Calcium bilirubinate/Total,Calculus,MFr
63,18878-9,Cefazolin [Susceptibility],Cefazolin,Isolate,Susc
64,18928-2,Gentamicin [Susceptibility],Gentamicin,Isolate,Susc
65,29265-6,Calcium [Moles/volume] corrected for albumin i...,Calcium^^corrected for albumin,Ser/Plas,SCnc


In [68]:
queries = ["glucose in blood", "bilirubin in plasma", "white blood cells count"]

df = pd.read_excel("base_data_large.xlsx")[['Query id','f0','f1','f2','f3','Y']]
print(df.shape)
df.head(100)

(786, 6)


Unnamed: 0,Query id,f0,f1,f2,f3,Y
0,1,Indirect antiglobulin test complement specific...,Indirect antiglobulin test complement specific...,Ser / Plas,ACnc,0
1,1,Rh Type in Blood,Rh,Bld,Type,2
2,1,Major crossmatch interpretation,Major crossmatch,Ser / Plas,Imp,0
3,1,Methicillin resistant Staphylococcus aureus Pr...,Staphylococcus aureus methicillin resistant is...,XXX,ACnc,0
4,1,Bilirubin total Mass / volume in Synovial fluid,Bilirubin,Synv fld,MCnc,0
...,...,...,...,...,...,...
95,1,Glucose [Mass/volume] in DBS,Glucose,Bld.dot,MCnc,4
96,1,Glucose [Mass/volume] in Capillary blood by Gl...,Glucose,BldC,MCnc,4
97,1,Glucose [Moles/volume] in Capillary blood by G...,Glucose,BldC,SCnc,4
98,1,Glucose [Mass/volume] in Blood --post meal,Glucose^post meal,Bld,MCnc,4


In [69]:
df['f0'] = df['f0'].apply(lambda f: process(f))
df['f1'] = df['f1'].apply(lambda f: process(f))
df['f2'] = df['f2'].apply(lambda f: process(f))
df['f3'] = df['f3'].apply(lambda f: process(f))
df.head(100)

Unnamed: 0,Query id,f0,f1,f2,f3,Y
0,1,indirect antiglobulin test complement specific...,indirect antiglobulin test complement specific...,serum / plasma,acnc,0
1,1,rh type blood,rh,blood,type,2
2,1,major crossmatch interpretation,major crossmatch,serum / plasma,imp,0
3,1,methicillin resistant staphylococcus aureus pr...,staphylococcus aureus methicillin resistant is...,xxx,acnc,0
4,1,bilirubin total mass / volume synovial fluid,bilirubin,synovial fluid,mcnc,0
...,...,...,...,...,...,...
95,1,glucose [mass / volume] dbs,glucose,blood . dot,mcnc,4
96,1,glucose [mass / volume] capillary blood glucom...,glucose,bldc,mcnc,4
97,1,glucose [moles / volume] capillary blood gluco...,glucose,bldc,scnc,4
98,1,glucose [mass / volume] blood - - post meal,glucose ^ post meal,blood,mcnc,4


In [70]:
def get_l_norm(doc, query_word_ids):
    doc_l_norm = 0
    
    for word_id in query_word_ids:
        word_tfidf  = doc[(0,word_id)]
        doc_l_norm += pow(word_tfidf,2)
        
    doc_l_norm = np.sqrt(doc_l_norm)
    return doc_l_norm

def normalize_vector(doc, query_word_ids):
    l_norm = get_l_norm(doc, query_word_ids)
    return [doc[(0,word_id)] / l_norm if l_norm else 0 for word_id in query_word_ids]

def get_doc_cos_score(doc, query):
    return sum([doc[i] * query[i] for i in range(len(doc))]) 

In [71]:
for i in range(1, len(queries) + 1):

    query = queries[i-1]

    tfidf_f0 = TfidfVectorizer()
    tfidf_f1 = TfidfVectorizer()
    tfidf_f2 = TfidfVectorizer()
    tfidf_f3 = TfidfVectorizer()
    
    vectorized_f0 = tfidf_f0.fit_transform(df[df["Query id"] == i]['f0'])
    vectorized_f1 = tfidf_f1.fit_transform(df[df["Query id"] == i]['f1'])
    vectorized_f2 = tfidf_f2.fit_transform(df[df["Query id"] == i]['f2'])
    vectorized_f3 = tfidf_f3.fit_transform(df[df["Query id"] == i]['f3'])

    vectorized_features = [vectorized_f0, vectorized_f1, vectorized_f2, vectorized_f3]

    query_f0 = tfidf_f0.transform([query])
    query_f1 = tfidf_f1.transform([query])
    query_f2 = tfidf_f2.transform([query])
    query_f3 = tfidf_f3.transform([query])

    vectorized_query_per_feature = [query_f0, query_f1, query_f2, query_f3]

    for vectorized_feature, feature_vectorized_query, feature_id in zip(vectorized_features, vectorized_query_per_feature, ["f0","f1","f2","f3"]):
        query_word_ids   = feature_vectorized_query.indices
        norm_query       = normalize_vector(feature_vectorized_query, query_word_ids)
        documents_scores = [get_doc_cos_score(normalize_vector(vectorized_feature[doc_id], query_word_ids), norm_query) for doc_id in range(0,vectorized_features[0].shape[0])]
        
        df.loc[df["Query id"] == i, feature_id] = documents_scores
        
    df['score'] = df.apply(lambda x: x['f0'] + x['f1'] + x['f2'] + x['f3'], axis=1)


df.head()

Unnamed: 0,Query id,f0,f1,f2,f3,Y,score
0,1,0.0,0.0,0.0,0,0,0.0
1,1,0.522644,0.0,0.268347,0,2,0.790991
2,1,0.0,0.0,0.0,0,0,0.0
3,1,0.0,0.0,0.0,0,0,0.0
4,1,0.0,0.0,0.0,0,0,0.0


In [72]:
df[['Query id','f0','f1','f2','f3','Y']].head()

Unnamed: 0,Query id,f0,f1,f2,f3,Y
0,1,0.0,0.0,0.0,0,0
1,1,0.522644,0.0,0.268347,0,2
2,1,0.0,0.0,0.0,0,0
3,1,0.0,0.0,0.0,0,0
4,1,0.0,0.0,0.0,0,0


In [73]:
df["f0"] = pd.to_numeric(df["f0"])
df["f1"] = pd.to_numeric(df["f1"])
df["f2"] = pd.to_numeric(df["f2"])
df["f3"] = pd.to_numeric(df["f3"])
df.dtypes

Query id      int64
f0          float64
f1          float64
f2          float64
f3            int64
Y             int64
score       float64
dtype: object

In [74]:
from sklearn.model_selection import train_test_split

x1, x1_te, y1, y1_te =  train_test_split(df[df["Query id"]==1][["f0", "f1", "f2", "f3"]], df[df["Query id"]==1]["Y"], test_size = .15)
x2, x2_te, y2, y2_te =  train_test_split(df[df["Query id"]==2][["f0", "f1", "f2", "f3"]], df[df["Query id"]==2]["Y"], test_size = .15)
x3, x3_te, y3, y3_te =  train_test_split(df[df["Query id"]==3][["f0", "f1", "f2", "f3"]], df[df["Query id"]==3]["Y"], test_size = .15)


In [75]:
x = pd.concat([x1,x2,x3], axis=0)
x_te = pd.concat([x1_te,x2_te,x3_te], axis=0) 
y = pd.concat([y1,y2,y3], axis=0) 
y_te = pd.concat([y1_te,y2_te,y3_te], axis=0) 

In [76]:
qid = np.array([1]*len(x1) + [2]*len(x2) + [3]*len(x3))
y   = np.array(y)
x_sparse = sparse.csr_matrix(np.array(x)) 

In [77]:
print(np.unique(df['Y']))

[0 1 2 3 4]


In [78]:
model = AdaRank(verbose=True)
model.fit(x_sparse,y,qid)

1	1.635851665127633	0	[0.91241622 1.         0.86825503]	train 0.9255	valid 0.9255
2	1.614639446753552	0	[0.91241622 1.         0.86825503]	train 0.9269	valid 0.9269
3	1.6152088927304031	0	[0.91241622 1.         0.86825503]	train 0.9269	valid 0.9269


AdaRank(scorer=<metrics.NDCGScorer object at 0x7f940f7033a0>, verbose=True)

In [79]:
qid_t = np.array([1]*len(x1_te) + [2]*len(x2_te) + [3]*len(x3_te))
y_te  = np.array(y_te)
x_te_sparse = sparse.csr_matrix(np.array(x_te)) 

In [80]:
pred = model.predict(x_te_sparse, None)

In [81]:
y_te

array([0, 0, 4, 0, 0, 2, 0, 0, 4, 0, 2, 4, 2, 2, 0, 0, 2, 4, 0, 2, 2, 3,
       3, 4, 1, 2, 4, 4, 0, 4, 2, 3, 2, 3, 2, 2, 0, 0, 2, 4, 4, 4, 0, 2,
       0, 2, 0, 0, 2, 4, 0, 2, 2, 0, 2, 0, 0, 2, 2, 0, 2, 0, 0, 0, 0, 4,
       2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 3, 2, 0, 2, 1, 1, 1, 2, 2,
       1, 0, 3, 1, 1, 2, 2, 2, 1, 2, 3, 0, 4, 0, 1, 1, 1, 4, 2, 2, 2, 2,
       2, 1, 4, 4, 4, 2, 2, 1, 2, 2])

In [82]:
pred

array([0.        , 0.        , 3.25049111, 0.        , 0.        ,
       1.69884944, 0.        , 0.        , 3.25049111, 0.        ,
       1.69884944, 3.25049111, 1.69884944, 1.69884944, 0.        ,
       0.        , 1.69884944, 3.25049111, 0.        , 1.69884944,
       1.69884944, 3.25049111, 3.25049111, 3.25049111, 0.        ,
       1.69884944, 3.25049111, 1.69884944, 0.        , 3.25049111,
       1.69884944, 1.69884944, 1.69884944, 3.25049111, 1.69884944,
       1.69884944, 0.        , 0.        , 1.69884944, 3.25049111,
       3.14717565, 3.25049111, 0.        , 1.70092691, 0.        ,
       1.70092691, 0.        , 0.        , 1.70092691, 3.14717565,
       0.        , 1.70092691, 1.70092691, 0.        , 1.70092691,
       0.        , 0.        , 1.70092691, 1.70092691, 0.        ,
       1.70092691, 0.        , 0.        , 0.        , 0.        ,
       3.25049111, 1.70092691, 3.25049111, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [83]:
x_te['Y_real'] = y_te
x_te['Y_pred'] = pred

In [84]:
x_te.sort_values(by=['Y_pred'], ascending=False)

Unnamed: 0,f0,f1,f2,f3,Y_real,Y_pred
117,1.0,0.466190,0.000000,0,3,3.250491
93,1.0,0.466190,0.268347,0,4,3.250491
88,1.0,0.466190,0.000000,0,4,3.250491
127,1.0,0.466190,0.000000,0,3,3.250491
330,1.0,0.501281,1.000000,0,4,3.250491
...,...,...,...,...,...,...
701,0.0,0.000000,0.000000,0,1,0.000000
544,0.0,0.000000,0.000000,0,0,0.000000
457,0.0,0.000000,0.000000,0,0,0.000000
703,0.0,0.000000,0.000000,0,1,0.000000


In [85]:
model.evaluate(y_te, pred, qid_t)

array([0.80958425, 1.        , 0.94672313])