In [71]:
import pandas as pd
import numpy  as np

from scipy   import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk import *
from adarank import AdaRank

In [72]:
def process(text):
    text = text.lower()
    text = text.replace('ser ' , 'serum')
    text = text.replace('plas ', 'plasma')
    text = text.replace('bld ' , 'blood')
    text = text.replace('fld ' , 'fluid')
    text = text.replace('synv ', 'synovial')
    text = text.replace('plr ' , 'pleural')
    text = text.replace('bpu ' , 'blood product unit')
    text = [word for word in text.split() if word not in corpus.stopwords.words('english')]
    return ' '.join([word for word in text])

In [73]:
queries = ["glucose in blood", "bilirubin in plasma", "white blood cells count"]

df = pd.read_excel("base_data.xlsx")
print(df.shape)
df.head(100)

(201, 8)


Unnamed: 0,Query id,loinc_num,f0,f1,f2,f3,Y,id
0,1,1003-3,Indirect antiglobulin test complement specific...,Indirect antiglobulin test complement specific...,Ser / Plas,ACnc,30,1
1,1,10331-7,Rh Type in Blood,Rh,Bld,Type,17,2
2,1,1250-0,Major crossmatch interpretation,Major crossmatch,Ser / Plas,Imp,33,3
3,1,13317-3,Methicillin resistant Staphylococcus aureus Pr...,Staphylococcus aureus methicillin resistant is...,XXX,ACnc,56,4
4,1,14423-8,Bilirubin total Mass / volume in Synovial fluid,Bilirubin,Synv fld,MCnc,58,5
...,...,...,...,...,...,...,...,...
95,2,1975-2,Bilirubin total Mass / volume in Serum or Plasma,Bilirubin,Ser / Plas,MCnc,1,29
96,2,1988-5,C reactive protein Mass / volume in Serum or P...,C reactive protein,Ser / Plas,MCnc,36,30
97,2,1994-3,Calcium ionized Moles / volume in Blood,Calcium ionized,Bld,SCnc,41,31
98,2,1995-0,Calcium ionized Moles / volume in Serum or Plasma,Calcium ionized,Ser / Plas,SCnc,35,32


In [74]:
df['f0'] = df['f0'].apply(lambda f: process(f))
df['f1'] = df['f1'].apply(lambda f: process(f))
df['f2'] = df['f2'].apply(lambda f: process(f))
df['f3'] = df['f3'].apply(lambda f: process(f))
df.head()

Unnamed: 0,Query id,loinc_num,f0,f1,f2,f3,Y,id
0,1,1003-3,indirect antiglobulin test complement specific...,indirect antiglobulin test complement specific...,serum/ plas,acnc,30,1
1,1,10331-7,rh type blood,rh,bld,type,17,2
2,1,1250-0,major crossmatch interpretation,major crossmatch,serum/ plas,imp,33,3
3,1,13317-3,methicillin resistant staphylococcus aureus pr...,staphylococcus aureus methicillin resistant is...,xxx,acnc,56,4
4,1,14423-8,bilirubin total mass / volume synovial fluid,bilirubin,synovialfld,mcnc,58,5


In [75]:
def get_l_norm(doc, query_word_ids):
    doc_l_norm = 0
    
    for word_id in query_word_ids:
        word_tfidf  = doc[(0,word_id)]
        doc_l_norm += pow(word_tfidf,2)
        
    doc_l_norm = np.sqrt(doc_l_norm)
    return doc_l_norm

def normalize_vector(doc, query_word_ids):
    l_norm = get_l_norm(doc, query_word_ids)

    return [doc[(0,word_id)] / l_norm if l_norm else 0 for word_id in query_word_ids]

def get_doc_cos_score(doc, query):
    return sum([doc[i] * query[i] for i in range(len(doc))]) 

In [76]:
for i in range(1, len(queries) + 1):

    query = queries[i-1]

    tfidf_f0 = TfidfVectorizer()
    tfidf_f1 = TfidfVectorizer()
    tfidf_f2 = TfidfVectorizer()
    tfidf_f3 = TfidfVectorizer()

    vectorized_f0 = tfidf_f0.fit_transform(df[df["Query id"] == i]['f0'])
    vectorized_f1 = tfidf_f1.fit_transform(df[df["Query id"] == i]['f1'])
    vectorized_f2 = tfidf_f2.fit_transform(df[df["Query id"] == i]['f2'])
    vectorized_f3 = tfidf_f3.fit_transform(df[df["Query id"] == i]['f3'])

    vectorized_features = [vectorized_f0, vectorized_f1, vectorized_f2, vectorized_f3]

    query_f0 = tfidf_f0.transform([query])
    query_f1 = tfidf_f1.transform([query])
    query_f2 = tfidf_f2.transform([query])
    query_f3 = tfidf_f3.transform([query])

    vectorized_query_per_feature = [query_f0, query_f1, query_f2, query_f3]

    for vectorized_feature, feature_vectorized_query, feature_id in zip(vectorized_features, vectorized_query_per_feature, ["f0","f1","f2","f3"]):
        query_word_ids   = feature_vectorized_query.indices
        norm_query       = normalize_vector(feature_vectorized_query, query_word_ids)
        documents_scores = [get_doc_cos_score(normalize_vector(vectorized_feature[doc_id], query_word_ids), norm_query) for doc_id in range(0,vectorized_features[0].shape[0])]
        
        df.loc[df["Query id"] == i, feature_id] = documents_scores
        
    df['score'] = df.apply(lambda x: x['f0'] + x['f1'] + x['f2'] + x['f3'], axis=1)

a = df.sort_values(by='score', ascending=False)
print(a[a['Query id'] == 2].head(67))
#df.head()

     Query id loinc_num        f0   f1   f2 f3   Y  id     score
122         2   35192-4       1.0  1.0  0.0  0   3  56  2.000000
93          2    1968-7       1.0  1.0  0.0  0   2  27  2.000000
94          2    1971-1       1.0  1.0  0.0  0   4  28  2.000000
95          2    1975-2       1.0  1.0  0.0  0   1  29  2.000000
120         2   33870-7  0.881473  1.0  0.0  0   7  54  1.881473
..        ...       ...       ...  ...  ... ..  ..  ..       ...
114         2   26478-8       0.0  0.0  0.0  0  46  48  0.000000
113         2   26474-7       0.0  0.0  0.0  0  45  47  0.000000
112         2   26464-8       0.0  0.0  0.0  0  40  46  0.000000
111         2   23658-8       0.0  0.0  0.0  0  63  45  0.000000
109         2    2132-9       0.0  0.0  0.0  0  51  43  0.000000

[67 rows x 9 columns]


In [77]:
df["f0"] = pd.to_numeric(df["f0"])
df["f1"] = pd.to_numeric(df["f1"])
df["f2"] = pd.to_numeric(df["f2"])
df["f3"] = pd.to_numeric(df["f3"])
df.dtypes

Query id       int64
loinc_num     object
f0           float64
f1           float64
f2             int64
f3             int64
Y              int64
id             int64
score        float64
dtype: object

In [78]:
from sklearn.model_selection import train_test_split

x1, x1_te, y1, y1_te =  train_test_split(df[df["Query id"]==1][["f0", "f1", "f2", "f3"]], df[df["Query id"]==1]["Y"], test_size = .15)
x2, x2_te, y2, y2_te =  train_test_split(df[df["Query id"]==2][["f0", "f1", "f2", "f3"]], df[df["Query id"]==2]["Y"], test_size = .15)
x3, x3_te, y3, y3_te =  train_test_split(df[df["Query id"]==3][["f0", "f1", "f2", "f3"]], df[df["Query id"]==3]["Y"], test_size = .15)


In [79]:
x = x1.append(x2).append(x3)
x_te = x1_te.append(x2_te).append(x3_te)
y = y1.append(y2).append(y3)
y_te = y1_te.append(y2_te).append(y3_te)

  x = x1.append(x2).append(x3)
  x = x1.append(x2).append(x3)
  x_te = x1_te.append(x2_te).append(x3_te)
  x_te = x1_te.append(x2_te).append(x3_te)
  y = y1.append(y2).append(y3)
  y = y1.append(y2).append(y3)
  y_te = y1_te.append(y2_te).append(y3_te)
  y_te = y1_te.append(y2_te).append(y3_te)


In [80]:
qid = np.array([1]*len(x1) + [2]*len(x2) + [3]*len(x3))
y   = np.array(y)
x   = sparse.csr_matrix(np.array(x)) 

In [81]:
print(x.shape, len(y), len(qid))

(168, 4) 168 168


In [82]:
model = AdaRank(verbose=True)
model.fit(x,y,qid)

[4.09513064e-13 3.77610221e-09 4.75488626e-05]
[1.10546449e-07 8.20702721e-03 2.65471464e-02]
[0.07757165 0.51691739 0.24096337]
[0.07757165 0.80689409 0.24096337]
b {'fid': 3, 'score': array([0.07757165, 0.80689409, 0.24096337])}
a [0.         0.         0.         0.39439513]
b {'fid': 3, 'score': array([0.07757165, 0.80689409, 0.24096337])}


1	0.3943951297014576	3	[0.07757165 0.80689409 0.24096337]	train 0.3751	valid 0.3751
2	0.2963159960532612	3	[0.07757165 0.80689409 0.24096337]	train 0.3751	valid 0.3751


AdaRank(scorer=<metrics.NDCGScorer object at 0x00000176C63F49D0>, verbose=True)

In [86]:
qid_t = np.array([1]*len(x1_te) + [2]*len(x2_te) + [3]*len(x3_te))
pred = model.predict(x,1) # sparse.csr_matrix(np.array(x_te))

[0.         0.         0.         0.39439513]


In [84]:
pred

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])