In [1]:
import pandas as pd
import numpy  as np

from scipy   import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

from adarank import AdaRank
from metrics import NDCGScorer

In [None]:
def process(text):
    text = text.lower()
    text = text.replace('ser' , 'serum')
    text = text.replace('plas', 'plasma')
    text = text.replace('bld' , 'blood')
    text = text.replace('fld' , 'fluid')
    text = text.replace('synv', 'synovial')
    text = text.replace('plr' , 'pleural')
    text = text.replace('bpu' , 'blood product unit')
    return text

In [2]:
queries = ["glucose in blood", "bilirubin in plasma", "white blood cells count"]

df = pd.read_excel("base_data.xlsx")
df.head()

Unnamed: 0,Query id,loinc_num,f0,f1,f2,f3,Y
0,1,1003-3,Indirect antiglobulin test complement specific...,Indirect antiglobulin test complement specific...,Ser / Plas,ACnc,30
1,1,10331-7,Rh Type in Blood,Rh,Bld,Type,17
2,1,1250-0,Major crossmatch interpretation,Major crossmatch,Ser / Plas,Imp,33
3,1,13317-3,Methicillin resistant Staphylococcus aureus Pr...,Staphylococcus aureus methicillin resistant is...,XXX,ACnc,56
4,1,14423-8,Bilirubin total Mass / volume in Synovial fluid,Bilirubin,Synv fld,MCnc,58


In [3]:
df['f0'] = df['f0'].apply(lambda f: f.lower())
df['f1'] = df['f1'].apply(lambda f: f.lower())
df['f2'] = df['f2'].apply(lambda f: process(f))
df['f3'] = df['f3'].apply(lambda f: f.lower())
df.head()

Unnamed: 0,Query id,loinc_num,f0,f1,f2,f3,Y
0,1,1003-3,indirect antiglobulin test complement specific...,indirect antiglobulin test complement specific...,ser / plas,acnc,30
1,1,10331-7,rh type in blood,rh,bld,type,17
2,1,1250-0,major crossmatch interpretation,major crossmatch,ser / plas,imp,33
3,1,13317-3,methicillin resistant staphylococcus aureus pr...,staphylococcus aureus methicillin resistant is...,xxx,acnc,56
4,1,14423-8,bilirubin total mass / volume in synovial fluid,bilirubin,synv fld,mcnc,58


In [4]:
def get_l_norm(doc, query_word_ids):
    doc_l_norm = 0
    
    for word_id in query_word_ids:
        word_tfidf  = doc[(0,word_id)]
        doc_l_norm += pow(word_tfidf,2)
        
    doc_l_norm = np.sqrt(doc_l_norm)
    return doc_l_norm

def normalize_vector(doc, query_word_ids):
    l_norm = get_l_norm(doc, query_word_ids)

    return [doc[(0,word_id)] / l_norm if l_norm else 0 for word_id in query_word_ids]

def get_doc_cos_score(doc, query):
    return sum([doc[i] * query[i] for i in range(len(doc))]) 

In [8]:
all_scores = [[],[],[],[]]

for i in range(1, len(queries) + 1):

    feature_values = []
    
    query = queries[i-1]

    tfidf_f0 = TfidfVectorizer()
    tfidf_f1 = TfidfVectorizer()
    tfidf_f2 = TfidfVectorizer()
    tfidf_f3 = TfidfVectorizer()

    vectorized_f0 = tfidf_f0.fit_transform(df[df["Query id"] == i]['f0'])
    vectorized_f1 = tfidf_f1.fit_transform(df[df["Query id"] == i]['f1'])
    vectorized_f2 = tfidf_f2.fit_transform(df[df["Query id"] == i]['f2'])
    vectorized_f3 = tfidf_f3.fit_transform(df[df["Query id"] == i]['f3'])

    vectorized_features = [vectorized_f0, vectorized_f1, vectorized_f2, vectorized_f3]

    query_f0 = tfidf_f0.transform([query])
    query_f1 = tfidf_f1.transform([query])
    query_f2 = tfidf_f2.transform([query])
    query_f3 = tfidf_f3.transform([query])

    vectorized_query_per_feature = [query_f0, query_f1, query_f2, query_f3]

    for vectorized_feature, feature_vectorized_query in zip(vectorized_features, vectorized_query_per_feature):
        query_word_ids   = feature_vectorized_query.indices
        norm_query       = normalize_vector(feature_vectorized_query, query_word_ids)
        documents_scores = [get_doc_cos_score(normalize_vector(vectorized_feature[doc_id], query_word_ids), norm_query) for doc_id in range(0,vectorized_features[0].shape[0])]
        print(vectorized_feature.shape, feature_vectorized_query.shape, documents_scores)
        feature_values.append(documents_scores)


(67, 121) (1, 121) [0.303165525340164, 0.6226254607142058, 0.0, 0.303165525340164, 0.303165525340164, 0.6052615718447621, 0.8391941798089404, 0.8391941798089404, 0.8258937916853117, 0.8391941798089405, 0.303165525340164, 0.303165525340164, 0.303165525340164, 0.303165525340164, 0.303165525340164, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.303165525340164, 0.303165525340164, 0.6226254607142058, 0.303165525340164, 0.303165525340164, 0.303165525340164, 0.303165525340164, 0.6226254607142058, 0.303165525340164, 0.303165525340164, 0.303165525340164, 0.303165525340164, 0.6226254607142058, 0.0, 0.6226254607142058, 0.303165525340164, 0.303165525340164, 0.303165525340164, 0.303165525340164, 0.303165525340164, 0.303165525340164, 0.0, 0.6226254607142057, 0.6226254607142057, 0.6226254607142058, 0.6226254607142058, 0.303165525340164, 0.303165525340164, 0.303165525340164, 0.5438318936719319, 0.303165525340164, 0.8391941798089405, 0.303165525340164, 0.303165525340164, 0.303165525340164, 0.30316552534016

In [6]:
all_scores[0]

[]

In [7]:
    df[feature_id] = documents_scores    
    df.head()

NameError: name 'feature_id' is not defined

In [None]:
df

Unnamed: 0,Query id,loinc_num,f0,f1,f2,f3,Y
0,1,1003-3,indirect antiglobulin test complement specific...,indirect antiglobulin test complement specific...,ser / plas,acnc,30
1,1,10331-7,rh type in blood,rh,bld,type,17
2,1,1250-0,major crossmatch interpretation,major crossmatch,ser / plas,imp,33
3,1,13317-3,methicillin resistant staphylococcus aureus pr...,staphylococcus aureus methicillin resistant is...,xxx,acnc,56
4,1,14423-8,bilirubin total mass / volume in synovial fluid,bilirubin,synv fld,mcnc,58
...,...,...,...,...,...,...,...
196,3,883-9,abo group type in blood,abo group,bld,type,44
197,3,890-4,blood group antibody screen presence in serum ...,blood group antibody screen,ser / plas,acnc,6
198,3,925-8,blood product disposition type,blood product disposition,bpu,type,9
199,3,933-2,blood product type,blood product type,bpu,type,27


In [None]:
qid = df["Query id"].to_numpy()
y   = df["y"].to_numpy()
x   = sparse.csr_matrix(df[["f0", "f1", "f2", "f3"]].to_numpy())

KeyError: 'y'

In [None]:
model = AdaRank(max_iter=1, scorer=NDCGScorer(), verbose=True)
model.fit(x[:int(x.shape[0]*0.75)], y[:int(len(y)*0.75)] , qid[:int(len(qid)*0.75)], x[int(x.shape[0]*0.75):int(x.shape[0]*0.85)], y[int(len(y)*0.75):int(len(y)*0.85)] , qid[int(len(qid)*0.75):int(len(qid)*0.85)])

[[0.30316553 0.         0.         0.        ]
 [0.62262546 0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.30316553 0.         0.         0.        ]
 [0.30316553 0.         0.         0.        ]
 [0.60526157 0.         0.         0.        ]
 [0.83919418 0.49218971 0.         0.        ]
 [0.83919418 0.49218971 0.         0.        ]
 [0.82589379 0.49218971 0.         0.        ]
 [0.83919418 0.49218971 0.         0.        ]
 [0.30316553 0.         0.         0.        ]
 [0.30316553 0.         0.         0.        ]
 [0.30316553 0.         0.         0.        ]
 [0.30316553 0.         0.         0.        ]
 [0.30316553 0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         

1	0.6621581501536368	2	[0.57979786]	train 0.5798	valid 0.4314


AdaRank(max_iter=1, scorer=<metrics.NDCGScorer object at 0x000002CC5E8B0E50>,
        verbose=True)

In [None]:
pred= model.predict(x, qid)

In [None]:
pred

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])