In [None]:
import pandas as pd
import numpy  as np

from scipy   import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

import re
import nltk
from nltk import *
nltk.download('stopwords')

from adarank import AdaRank

In [None]:
# Abbreviations that are expanded to their full term when they appear as a whole token.
_ABBREVIATIONS = {
    'ser' : 'serum',
    'plas': 'plasma',
    'bld' : 'blood',
    'fld' : 'fluid',
    'synv': 'synovial',
    'plr' : 'pleural',
    'bpu' : 'blood product unit',
}

# Punctuation marks that are padded with spaces so they become separate tokens.
_PUNCTUATION_SPACING = str.maketrans({mark: ' ' + mark + ' ' for mark in '/.,-^'})

# Filled on first use so the NLTK corpus is read once instead of once per word.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(nltk.corpus.stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def process(text, stop_words=None):
    """Normalise one free-text cell into a space-separated string of tokens.

    Steps: lower-case, split the punctuation marks ``/ . , - ^`` into their own
    tokens, expand the known abbreviations (``ser`` -> ``serum`` ...), and drop
    stop words.

    Parameters
    ----------
    text : object
        Cell value to clean. Missing values (as detected by ``pd.isna``) are
        mapped to the placeholder ``' - '``. Other values are converted with
        ``str`` before processing.
    stop_words : iterable of str, optional
        Words to remove. Defaults to NLTK's English stop-word list.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces, or ``' - '`` for missing input.
    """
    if pd.isna(text):
        return ' - '

    stop = _english_stopwords() if stop_words is None else frozenset(stop_words)

    text = str(text).lower().translate(_PUNCTUATION_SPACING)

    # str.split() splits on any run of Unicode whitespace, so no separate
    # whitespace normalisation is needed before tokenising.
    words = []
    for token in text.split():
        # Expand on whole tokens only: a substring replace of 'ser ' would also
        # rewrite the end of longer words (e.g. 'laser ' -> 'laserum ').
        words.extend(_ABBREVIATIONS.get(token, token).split())

    return ' '.join(word for word in words if word not in stop)

In [None]:
# Free-text queries; the scoring loop further down pairs the query at
# position i-1 with the rows whose "Query id" equals i.
queries = [
    "glucose in blood",
    "bilirubin in plasma",
    "white blood cells count",
]

# Keep only the query id, the four text feature columns and the target Y.
df = pd.read_excel("base_data.xlsx").loc[:, ['Query id','f0','f1','f2','f3','Y']]
df.head()

In [None]:
# Run every text feature column through the `process` cleaning function.
for text_column in ('f0', 'f1', 'f2', 'f3'):
    df[text_column] = df[text_column].apply(process)
df.head()

In [None]:
def get_l_norm(doc, query_word_ids):
    """Return the Euclidean norm of row 0 of `doc`, restricted to the given columns.

    `doc` is indexed as ``doc[(0, word_id)]`` for each id in `query_word_ids`;
    the result is the square root of the sum of those squared entries.
    An empty id sequence gives 0.0.
    """
    squared_sum = sum(pow(doc[(0, term_id)], 2) for term_id in query_word_ids)
    return np.sqrt(squared_sum)

def normalize_vector(doc, query_word_ids):
    """Return row 0 of `doc`, restricted to `query_word_ids`, scaled to unit length.

    The scale factor is `get_l_norm(doc, query_word_ids)`. When that norm is
    zero the result is a list of zeros with one entry per id.
    """
    norm = get_l_norm(doc, query_word_ids)

    # A zero norm means there is nothing to scale: emit zeros without touching doc.
    if not norm:
        return [0 for _ in query_word_ids]

    return [doc[(0, term_id)] / norm for term_id in query_word_ids]

def get_doc_cos_score(doc, query):
    """Return the dot product of `doc` and `query`, taken over the positions of `doc`.

    Both arguments are indexed by position; `query` must have at least
    ``len(doc)`` entries. An empty `doc` gives 0.
    """
    score = 0
    for position in range(len(doc)):
        score += doc[position] * query[position]
    return score

In [None]:
# Replace each text feature column with a per-document similarity score against
# the query the document belongs to.
#
# NOTE: this cell overwrites the text in f0..f3 with numbers, so it cannot be
# re-run on its own; re-run the loading and cleaning cells first.
for i, query in enumerate(queries, start=1):

    # Rows of the current query; "Query id" values are 1-based positions in `queries`.
    query_rows = df["Query id"] == i

    for feature_id in ["f0", "f1", "f2", "f3"]:
        # A separate vocabulary is fitted for every (query, feature) pair.
        tfidf = TfidfVectorizer()
        vectorized_feature       = tfidf.fit_transform(df.loc[query_rows, feature_id])
        feature_vectorized_query = tfidf.transform([query])

        # Only the vocabulary entries present in the query take part in the score.
        query_word_ids = feature_vectorized_query.indices
        norm_query     = normalize_vector(feature_vectorized_query, query_word_ids)

        # Iterate over the rows of the matrix actually being scored.
        documents_scores = [
            get_doc_cos_score(normalize_vector(vectorized_feature[doc_id], query_word_ids), norm_query)
            for doc_id in range(vectorized_feature.shape[0])
        ]

        df.loc[query_rows, feature_id] = documents_scores

df.head()

In [None]:
# Convert the score columns to a numeric dtype.
for score_column in ("f0", "f1", "f2", "f3"):
    df[score_column] = pd.to_numeric(df[score_column])

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test partition, and every metric computed from it,
# is the same on each run of the notebook.
SPLIT_SEED = 42


def _split_query(query_id, test_size=.15, random_state=SPLIT_SEED):
    """Split the rows of one query into train and test parts.

    Returns ``(x_train, x_test, y_train, y_test)`` where x holds the
    "Query id" and f0..f3 columns and y holds the "Y" column.
    """
    rows = df[df["Query id"] == query_id]
    return train_test_split(
        rows[["Query id", "f0", "f1", "f2", "f3"]],
        rows["Y"],
        test_size=test_size,
        random_state=random_state,
    )


# Each query is split separately so every query appears in both parts.
x1, x1_te, y1, y1_te = _split_query(1)
x2, x2_te, y2, y2_te = _split_query(2)
x3, x3_te, y3, y3_te = _split_query(3)


In [None]:
# Stack the per-query splits row-wise, always in query order 1, 2, 3.
x    = pd.concat((x1, x2, x3))
x_te = pd.concat((x1_te, x2_te, x3_te))
y    = pd.concat((y1, y2, y3))
y_te = pd.concat((y1_te, y2_te, y3_te))

In [None]:
# One query id per training row, in the same order the splits were concatenated.
qid = np.array([
    query_id
    for query_id, part in enumerate((x1, x2, x3), start=1)
    for _ in range(len(part))
])
y = np.array(y)

# Only the four score columns are used as features; "Query id" is left out.
x_sparse = sparse.csr_matrix(x.loc[:, ["f0", "f1", "f2", "f3"]].to_numpy())

In [None]:
# Fit AdaRank on the training feature matrix, the Y targets and the
# query id of each training row.
model = AdaRank(verbose=True)
model.fit(x_sparse,y,qid)

In [None]:
# One query id per test row, in the same order the test splits were concatenated.
qid_t = np.array([
    query_id
    for query_id, part in enumerate((x1_te, x2_te, x3_te), start=1)
    for _ in range(len(part))
])
y_te = np.array(y_te)

# Only the four score columns are used as features; "Query id" is left out.
x_te_sparse = sparse.csr_matrix(x_te.loc[:, ["f0", "f1", "f2", "f3"]].to_numpy())

In [None]:
# Predict on the held-out rows and print the model's evaluation per query.
# NOTE(review): the message assumes evaluate() returns one NDCG value per
# query -- confirm against the adarank module.
pred = model.predict(x_te_sparse, None)
print("The NDCG for each of the queries is respectively:",model.evaluate(y_te, pred, qid_t))

In [None]:
# Mean absolute error between the true Y value and the prediction, reported
# per true rank.
# NOTE(review): `pred` comes straight from model.predict(); confirm it is on
# the same scale as Y before interpreting these numbers.
mae_general = 0
n_general   = 0

# One bucket per rank value. At least the ranks 0-4 are always reported; the
# list grows when the test labels contain a larger rank, so indexing by the
# label cannot run past the end.
n_ranks    = max(5, int(max(y_te)) + 1) if len(y_te) else 5
mae_values = [0] * n_ranks
n_values   = [0] * n_ranks

for real_val, pred_val in zip(y_te, pred):
    mae_values[real_val] += abs(real_val - pred_val)
    n_values[real_val]   += 1

for i, (mae, n) in enumerate(zip(mae_values, n_values)):
    print("The MAE for queries with rank",i,"is:")
    if n == 0:
        print("No test queries with that rank")
    else:
        print(mae / n)
        mae_general += mae / n
        n_general   += 1

# The overall figure is the unweighted mean of the per-rank MAEs (every
# non-empty rank counts equally), not the mean over all test rows.
print("The overall MAE is:")
if n_general == 0:
    print("No test queries")
else:
    print(mae_general/n_general)

In [None]:
# Mean squared error between the true Y value and the prediction, reported
# per true rank.
# NOTE(review): `pred` comes straight from model.predict(); confirm it is on
# the same scale as Y before interpreting these numbers.
mse_general = 0
n_general   = 0

# One bucket per rank value. At least the ranks 0-4 are always reported; the
# list grows when the test labels contain a larger rank, so indexing by the
# label cannot run past the end.
n_ranks    = max(5, int(max(y_te)) + 1) if len(y_te) else 5
mse_values = [0] * n_ranks
n_values   = [0] * n_ranks

for real_val, pred_val in zip(y_te, pred):
    mse_values[real_val] += pow(real_val - pred_val, 2)
    n_values[real_val]   += 1

for i, (mse, n) in enumerate(zip(mse_values, n_values)):
    print("The MSE for queries with rank",i,"is:")
    if n == 0:
        print("No test queries with that rank")
    else:
        print(mse / n)
        mse_general += mse / n
        n_general   += 1

# The overall figure is the unweighted mean of the per-rank MSEs (every
# non-empty rank counts equally), not the mean over all test rows.
print("The overall MSE is:")
if n_general == 0:
    print("No test queries")
else:
    print(mse_general/n_general)

In [None]:
# Put the true and predicted values next to the test features for inspection.
x_te = x_te.assign(Y_real=y_te, Y_pred=pred)

In [None]:
# Top 20 test rows of query 1, highest predicted value first.
x_te.loc[x_te["Query id"].eq(1)].sort_values("Y_pred", ascending=False).head(20)

In [None]:
# Top 20 test rows of query 2, highest predicted value first.
x_te.loc[x_te["Query id"].eq(2)].sort_values("Y_pred", ascending=False).head(20)

In [None]:
# Top 20 test rows of query 3, highest predicted value first.
x_te.loc[x_te["Query id"].eq(3)].sort_values("Y_pred", ascending=False).head(20)