In [15]:
import pandas as pd
import numpy  as np

from scipy   import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

from adarank import AdaRank
from metrics import NDCGScorer

In [16]:
def process(text):
    text = text.lower()
    text = text.replace('ser' , 'serum')
    text = text.replace('plas', 'plasma')
    text = text.replace('bld' , 'blood')
    text = text.replace('fld' , 'fluid')
    text = text.replace('synv', 'synovial')
    text = text.replace('plr' , 'pleural')
    text = text.replace('bpu' , 'blood product unit')
    return text

In [17]:
queries = ["glucose in blood", "bilirubin in plasma", "white blood cells count"]

df = pd.read_excel("base_data.xlsx")
df.head()

Unnamed: 0,Query id,loinc_num,f0,f1,f2,f3,Y,id
0,1,1003-3,Indirect antiglobulin test complement specific...,Indirect antiglobulin test complement specific...,Ser / Plas,ACnc,30,1
1,1,10331-7,Rh Type in Blood,Rh,Bld,Type,17,2
2,1,1250-0,Major crossmatch interpretation,Major crossmatch,Ser / Plas,Imp,33,3
3,1,13317-3,Methicillin resistant Staphylococcus aureus Pr...,Staphylococcus aureus methicillin resistant is...,XXX,ACnc,56,4
4,1,14423-8,Bilirubin total Mass / volume in Synovial fluid,Bilirubin,Synv fld,MCnc,58,5


In [18]:
df['f0'] = df['f0'].apply(lambda f: f.lower())
df['f1'] = df['f1'].apply(lambda f: f.lower())
df['f2'] = df['f2'].apply(lambda f: process(f))
df['f3'] = df['f3'].apply(lambda f: f.lower())
df.head()

Unnamed: 0,Query id,loinc_num,f0,f1,f2,f3,Y,id
0,1,1003-3,indirect antiglobulin test complement specific...,indirect antiglobulin test complement specific...,serum / plasma,acnc,30,1
1,1,10331-7,rh type in blood,rh,blood,type,17,2
2,1,1250-0,major crossmatch interpretation,major crossmatch,serum / plasma,imp,33,3
3,1,13317-3,methicillin resistant staphylococcus aureus pr...,staphylococcus aureus methicillin resistant is...,xxx,acnc,56,4
4,1,14423-8,bilirubin total mass / volume in synovial fluid,bilirubin,synovial fluid,mcnc,58,5


In [19]:
def get_l_norm(doc, query_word_ids):
    doc_l_norm = 0
    
    for word_id in query_word_ids:
        word_tfidf  = doc[(0,word_id)]
        doc_l_norm += pow(word_tfidf,2)
        
    doc_l_norm = np.sqrt(doc_l_norm)
    return doc_l_norm

def normalize_vector(doc, query_word_ids):
    l_norm = get_l_norm(doc, query_word_ids)

    return [doc[(0,word_id)] / l_norm if l_norm else 0 for word_id in query_word_ids]

def get_doc_cos_score(doc, query):
    return sum([doc[i] * query[i] for i in range(len(doc))]) 

In [20]:
for i in range(1, len(queries) + 1):

    query = queries[i-1]

    tfidf_f0 = TfidfVectorizer()
    tfidf_f1 = TfidfVectorizer()
    tfidf_f2 = TfidfVectorizer()
    tfidf_f3 = TfidfVectorizer()

    vectorized_f0 = tfidf_f0.fit_transform(df[df["Query id"] == i]['f0'])
    vectorized_f1 = tfidf_f1.fit_transform(df[df["Query id"] == i]['f1'])
    vectorized_f2 = tfidf_f2.fit_transform(df[df["Query id"] == i]['f2'])
    vectorized_f3 = tfidf_f3.fit_transform(df[df["Query id"] == i]['f3'])

    vectorized_features = [vectorized_f0, vectorized_f1, vectorized_f2, vectorized_f3]

    query_f0 = tfidf_f0.transform([query])
    query_f1 = tfidf_f1.transform([query])
    query_f2 = tfidf_f2.transform([query])
    query_f3 = tfidf_f3.transform([query])

    vectorized_query_per_feature = [query_f0, query_f1, query_f2, query_f3]

    for vectorized_feature, feature_vectorized_query, feature_id in zip(vectorized_features, vectorized_query_per_feature,["f0","f1","f2","f3"]):
        query_word_ids   = feature_vectorized_query.indices
        norm_query       = normalize_vector(feature_vectorized_query, query_word_ids)
        documents_scores = [get_doc_cos_score(normalize_vector(vectorized_feature[doc_id], query_word_ids), norm_query) for doc_id in range(0,vectorized_features[0].shape[0])]
        
        df.loc[df["Query id"] == i, feature_id] = documents_scores
        
df.head()

Unnamed: 0,Query id,loinc_num,f0,f1,f2,f3,Y,id
0,1,1003-3,0.303166,0.0,0.0,0,30,1
1,1,10331-7,0.622625,0.0,1.0,0,17,2
2,1,1250-0,0.0,0.0,0.0,0,33,3
3,1,13317-3,0.303166,0.0,0.0,0,56,4
4,1,14423-8,0.303166,0.0,0.0,0,58,5


In [21]:
df["f0"] = pd.to_numeric(df["f0"])
df["f1"] = pd.to_numeric(df["f1"])
df["f2"] = pd.to_numeric(df["f2"])
df["f3"] = pd.to_numeric(df["f3"])
df.dtypes

Query id       int64
loinc_num     object
f0           float64
f1           float64
f2           float64
f3             int64
Y              int64
id             int64
dtype: object

In [22]:
from sklearn.model_selection import train_test_split

x1, x1_te, y1, y1_te =  train_test_split(df[df["Query id"]==1][["f0", "f1", "f2", "f3"]], df[df["Query id"]==1]["Y"], test_size = .15)
x2, x2_te, y2, y2_te =  train_test_split(df[df["Query id"]==2][["f0", "f1", "f2", "f3"]], df[df["Query id"]==2]["Y"], test_size = .15)
x3, x3_te, y3, y3_te =  train_test_split(df[df["Query id"]==3][["f0", "f1", "f2", "f3"]], df[df["Query id"]==3]["Y"], test_size = .15)


In [23]:
x = x1.append(x2).append(x3)
x_te = x1_te.append(x2_te).append(x3_te)
y = y1.append(y2).append(y3)
y1te = y1_te.append(y2_te).append(y3_te)

  x = x1.append(x2).append(x3)
  x = x1.append(x2).append(x3)
  x_te = x1_te.append(x2_te).append(x3_te)
  x_te = x1_te.append(x2_te).append(x3_te)
  y = y1.append(y2).append(y3)
  y = y1.append(y2).append(y3)
  y1te = y1_te.append(y2_te).append(y3_te)
  y1te = y1_te.append(y2_te).append(y3_te)


In [25]:
qid = np.array([1]*(len(x)//3) + [2]*(len(x)//3) + [3]*(len(x)//3))
y   = np.array(x["f0"])
x   = sparse.csr_matrix(np.array(x)) #  sparse.csr_matrix()

In [13]:
model = AdaRank()
model.fit(x,y,qid,x,y,qid)

f <class 'numpy.ndarray'>
0.303165525340164
0.303165525340164
init
y_true [0.30316553 0.         0.30316553 0.62262546 0.         0.30316553
 0.62262546 0.30316553 0.83919418 0.30316553 0.62262546 0.62262546
 0.30316553 0.30316553 0.30316553 0.60526157 0.30316553 0.30316553
 0.30316553 0.30316553 0.         0.54383189 0.62262546 0.
 0.         0.         0.54383189 0.62262546 0.30316553 0.30316553
 0.30316553 0.30316553 0.54383189 0.83919418 0.30316553 0.30316553
 0.62262546 0.30316553 0.54383189 0.83919418 0.30316553 0.30316553
 0.         0.62262546 0.30316553 0.30316553 0.30316553 0.
 1.         0.30316553 0.83919418 0.62262546 0.30316553 0.
 0.         0.        ]
order [48 33 39 50  8 27 36 22 11 43  6  3 10 51 15 26 21 38 32 49 46 37 41 44
 35 34 45 52 40 31  0 29  2  5  7  9 12 13 30 16 14 18 19 28 17 42 20 23
 24 47 25 54  4  1 53 55]
y_true [1.         0.83919418 0.83919418 0.83919418 0.83919418 0.62262546
 0.62262546 0.62262546 0.62262546 0.62262546]
gain [1.         0.789050

  h['alpha'] = 0.5 * (math.log(np.dot(weights, 1 + h['score']) /


AdaRank(scorer=<metrics.NDCGScorer object at 0x00000289D3650B80>)

In [14]:
qid = np.array([1]*(len(x)//3) + [2]*(len(x)//3) + [3]*(len(x)//3))
pred= model.predict(x[int(x.shape[0]*0.85):], qid[int(x.shape[0]*0.85):])

TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]

In [45]:
pred

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.])