In [71]:
import pandas as pd
import numpy  as np

from scipy   import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk import *
from adarank import AdaRank

In [72]:
def process(text):
    text = text.lower()
    text = text.replace('ser ' , 'serum')
    text = text.replace('plas ', 'plasma')
    text = text.replace('bld ' , 'blood')
    text = text.replace('fld ' , 'fluid')
    text = text.replace('synv ', 'synovial')
    text = text.replace('plr ' , 'pleural')
    text = text.replace('bpu ' , 'blood product unit')
    text = [word for word in text.split() if word not in corpus.stopwords.words('english')]
    return ' '.join([word for word in text])

In [73]:
queries = ["glucose in blood", "bilirubin in plasma", "white blood cells count"]

df = pd.read_excel("base_data.xlsx")
print(df.shape)
df.head(100)

(201, 9)


Unnamed: 0,Query id,loinc_num,f0,f1,f2,f3,Y,id,Y2
0,1,1003-3,Indirect antiglobulin test complement specific...,Indirect antiglobulin test complement specific...,Ser / Plas,ACnc,30,1,0
1,1,10331-7,Rh Type in Blood,Rh,Bld,Type,17,2,2
2,1,1250-0,Major crossmatch interpretation,Major crossmatch,Ser / Plas,Imp,33,3,0
3,1,13317-3,Methicillin resistant Staphylococcus aureus Pr...,Staphylococcus aureus methicillin resistant is...,XXX,ACnc,56,4,0
4,1,14423-8,Bilirubin total Mass / volume in Synovial fluid,Bilirubin,Synv fld,MCnc,58,5,0
...,...,...,...,...,...,...,...,...,...
95,2,1975-2,Bilirubin total Mass / volume in Serum or Plasma,Bilirubin,Ser / Plas,MCnc,1,29,4
96,2,1988-5,C reactive protein Mass / volume in Serum or P...,C reactive protein,Ser / Plas,MCnc,36,30,2
97,2,1994-3,Calcium ionized Moles / volume in Blood,Calcium ionized,Bld,SCnc,41,31,0
98,2,1995-0,Calcium ionized Moles / volume in Serum or Plasma,Calcium ionized,Ser / Plas,SCnc,35,32,2


In [74]:
df['f0'] = df['f0'].apply(lambda f: process(f))
df['f1'] = df['f1'].apply(lambda f: process(f))
df['f2'] = df['f2'].apply(lambda f: process(f))
df['f3'] = df['f3'].apply(lambda f: process(f))
df.head()

Unnamed: 0,Query id,loinc_num,f0,f1,f2,f3,Y,id,Y2
0,1,1003-3,indirect antiglobulin test complement specific...,indirect antiglobulin test complement specific...,serum/ plas,acnc,30,1,0
1,1,10331-7,rh type blood,rh,bld,type,17,2,2
2,1,1250-0,major crossmatch interpretation,major crossmatch,serum/ plas,imp,33,3,0
3,1,13317-3,methicillin resistant staphylococcus aureus pr...,staphylococcus aureus methicillin resistant is...,xxx,acnc,56,4,0
4,1,14423-8,bilirubin total mass / volume synovial fluid,bilirubin,synovialfld,mcnc,58,5,0


In [75]:
def get_l_norm(doc, query_word_ids):
    doc_l_norm = 0
    
    for word_id in query_word_ids:
        word_tfidf  = doc[(0,word_id)]
        doc_l_norm += pow(word_tfidf,2)
        
    doc_l_norm = np.sqrt(doc_l_norm)
    return doc_l_norm

def normalize_vector(doc, query_word_ids):
    l_norm = get_l_norm(doc, query_word_ids)

    return [doc[(0,word_id)] / l_norm if l_norm else 0 for word_id in query_word_ids]

def get_doc_cos_score(doc, query):
    return sum([doc[i] * query[i] for i in range(len(doc))]) 

In [76]:
for i in range(1, len(queries) + 1):

    query = queries[i-1]

    tfidf_f0 = TfidfVectorizer()
    tfidf_f1 = TfidfVectorizer()
    tfidf_f2 = TfidfVectorizer()
    tfidf_f3 = TfidfVectorizer()

    vectorized_f0 = tfidf_f0.fit_transform(df[df["Query id"] == i]['f0'])
    vectorized_f1 = tfidf_f1.fit_transform(df[df["Query id"] == i]['f1'])
    vectorized_f2 = tfidf_f2.fit_transform(df[df["Query id"] == i]['f2'])
    vectorized_f3 = tfidf_f3.fit_transform(df[df["Query id"] == i]['f3'])

    vectorized_features = [vectorized_f0, vectorized_f1, vectorized_f2, vectorized_f3]

    query_f0 = tfidf_f0.transform([query])
    query_f1 = tfidf_f1.transform([query])
    query_f2 = tfidf_f2.transform([query])
    query_f3 = tfidf_f3.transform([query])

    vectorized_query_per_feature = [query_f0, query_f1, query_f2, query_f3]

    for vectorized_feature, feature_vectorized_query, feature_id in zip(vectorized_features, vectorized_query_per_feature, ["f0","f1","f2","f3"]):
        query_word_ids   = feature_vectorized_query.indices
        norm_query       = normalize_vector(feature_vectorized_query, query_word_ids)
        documents_scores = [get_doc_cos_score(normalize_vector(vectorized_feature[doc_id], query_word_ids), norm_query) for doc_id in range(0,vectorized_features[0].shape[0])]
        
        df.loc[df["Query id"] == i, feature_id] = documents_scores
        
    df['score'] = df.apply(lambda x: x['f0'] + x['f1'] + x['f2'] + x['f3'], axis=1)

a = df.sort_values(by='score', ascending=False)
print(a[a['Query id'] == 2].head(67))
#df.head()

     Query id loinc_num        f0   f1   f2 f3   Y  id  Y2     score
122         2   35192-4       1.0  1.0  0.0  0   3  56   4  2.000000
93          2    1968-7       1.0  1.0  0.0  0   2  27   4  2.000000
94          2    1971-1       1.0  1.0  0.0  0   4  28   4  2.000000
95          2    1975-2       1.0  1.0  0.0  0   1  29   4  2.000000
120         2   33870-7  0.881473  1.0  0.0  0   7  54   3  1.881473
..        ...       ...       ...  ...  ... ..  ..  ..  ..       ...
114         2   26478-8       0.0  0.0  0.0  0  46  48   0  0.000000
113         2   26474-7       0.0  0.0  0.0  0  45  47   0  0.000000
112         2   26464-8       0.0  0.0  0.0  0  40  46   0  0.000000
111         2   23658-8       0.0  0.0  0.0  0  63  45   0  0.000000
109         2    2132-9       0.0  0.0  0.0  0  51  43   1  0.000000

[67 rows x 10 columns]


In [77]:
df["f0"] = pd.to_numeric(df["f0"])
df["f1"] = pd.to_numeric(df["f1"])
df["f2"] = pd.to_numeric(df["f2"])
df["f3"] = pd.to_numeric(df["f3"])
df.dtypes

Query id       int64
loinc_num     object
f0           float64
f1           float64
f2             int64
f3             int64
Y              int64
id             int64
Y2             int64
score        float64
dtype: object

In [78]:
from sklearn.model_selection import train_test_split

x1, x1_te, y1, y1_te =  train_test_split(df[df["Query id"]==1][["f0", "f1", "f2", "f3"]], df[df["Query id"]==1]["Y2"], test_size = .15)
x2, x2_te, y2, y2_te =  train_test_split(df[df["Query id"]==2][["f0", "f1", "f2", "f3"]], df[df["Query id"]==2]["Y2"], test_size = .15)
x3, x3_te, y3, y3_te =  train_test_split(df[df["Query id"]==3][["f0", "f1", "f2", "f3"]], df[df["Query id"]==3]["Y2"], test_size = .15)


In [79]:
x = pd.concat([x1,x2,x3], axis=0)
x_te = pd.concat([x1_te,x2_te,x3_te], axis=0) 
y = pd.concat([y1,y2,y3], axis=0) 
y_te = pd.concat([y1_te,y2_te,y3_te], axis=0) 

In [80]:
x_te

Unnamed: 0,f0,f1,f2,f3
39,0.0,0.0,0,0
24,0.0,0.0,0,0
41,0.0,0.0,0,0
10,0.0,0.0,0,0
48,0.57069,0.0,0,0
22,0.0,0.0,0,0
45,0.57069,0.0,0,0
31,0.0,0.0,0,0
17,0.0,0.0,0,0
2,0.0,0.0,0,0


In [81]:
qid = np.array([1]*len(x1) + [2]*len(x2) + [3]*len(x3))
y   = np.array(y)
x   = sparse.csr_matrix(np.array(x)) 

In [82]:
model = AdaRank(verbose=True)
model.fit(x,y,qid)

[1.         0.96754138 0.49855176]
[0.56041515 0.864522   0.11935176]
[0.18724606 0.29420465 0.19177948]
[0.18724606 0.26595154 0.19177948]
b {'fid': 0, 'score': array([1.        , 0.96754138, 0.49855176])}
a [1.16304901 0.         0.         0.        ]
b {'fid': 0, 'score': array([1.        , 0.96754138, 0.49855176])}


1	1.1630490068017996	0	[1.         0.96754138 0.49855176]	train 0.8220	valid 0.8220
2	1.0109723586649966	0	[1.         0.96754138 0.49855176]	train 0.8220	valid 0.8220


AdaRank(scorer=<metrics.NDCGScorer object at 0x000001E895A34580>, verbose=True)

In [None]:
qid_t = np.array([1]*len(x1_te) + [2]*len(x2_te) + [3]*len(x3_te))
y_te  = np.array(y_te)
x_te   = sparse.csr_matrix(np.array(x_te)) 

In [88]:
pred = model.predict(x_te,2) # sparse.csr_matrix(np.array(x_te))

In [91]:
y_te

array([0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 1, 2, 4, 2, 2, 0, 0, 2, 2, 0, 2,
       1, 2, 0, 0, 1, 0, 1, 0, 0, 1, 0], dtype=int64)

In [90]:
pred

array([0.        , 0.        , 0.        , 0.        , 0.66374014,
       0.        , 0.66374014, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.54923096, 1.16304901, 0.54923096,
       0.54923096, 0.        , 0.        , 0.54923096, 0.54923096,
       0.        , 0.54923096, 0.        , 1.16304901, 1.16304901,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        ])

In [85]:
np.argmax(pred)

13