In [1]:
import math
import random
import numpy as np
from rank_bm25 import BM25Okapi 
from collections import Counter

import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Download stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>
[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/vscode/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [19]:
def preprocess(text, stop_words, stemmer):
    """Text preprocessing pipeline"""
    tokens = nltk.word_tokenize(text.lower())
    tokens = [subword for word in tokens for subword in (word.split("/") if "/" in word else [word])]
    tokens = [t for t in tokens if t not in stop_words and t.isalnum()]
    if stemmer:
        return [stemmer.stem(t) for t in tokens]
    return tokens

def dcg(relevance_scores):
    """Compute Discounted Cumulative Gain (DCG)"""
    return np.sum([rel.relevant / np.log2(idx + 2) for idx, rel in enumerate(relevance_scores.itertuples(), start=1)])

def ndcg(df, queries):
    """Compute Normalized Discounted Cumulative Gain (NDCG)"""

    scores = []
    for query in queries:
        retrieved = df[df["query"] == query]
        
        dcg_value = dcg(retrieved)
        ideal_relevance_scores = retrieved.sort_values(by=['relevant'], ascending=False)  # Ideal ranking
        idcg_value = dcg(ideal_relevance_scores)
    
        scores.append(dcg_value / idcg_value if idcg_value > 0 else 0.0)

    return scores

def bm25(df, queries, stop_words, stemmer):
    """Rank-BM25"""

    scores = []
    for query in queries:
        retrieved = df[df["query"] == query]
        
        tokenized_corpus = [preprocess(doc, stop_words, stemmer) for doc in retrieved['text']]
        bm25 = BM25Okapi(tokenized_corpus)
    
        scores.append(np.mean(bm25.get_scores(preprocess(query, stop_words, stemmer))))

    return scores

def calculate_precision(df, queries, first_n):
    """Calculate precision given a number of 'x' elements"""
    scores = []
    for query in queries:
        retrieved = df[df["query"] == query]

        precision_values = []
        # Five relevant categories
        for i in range(5):
            return_docs = retrieved.head(first_n)
            in_query = return_docs[return_docs["relevant"] == i]
            precision_values.append(len(in_query)/len(retrieved))
            
        scores.append(np.mean(precision_values) if precision_values else 0.0)

    return scores

def calculate_recall(df, queries, first_n):
    """Calculate precision given a number of 'x' elements"""
    scores = []
    for query in queries:
        retrieved = df[df["query"] == query]

        recall_values = []
        # Five relevant categories
        for i in range(5):
            return_docs = retrieved.head(first_n)
            in_query = return_docs[return_docs["relevant"] == i]
            recall_values.append(len(in_query)/len(df))
            
        scores.append(np.mean(recall_values) if recall_values else 0.0)

    return scores
    
def calculate_map(df, queries):
    """Calculate Mean Average Precision (MAP) is a common metric used in information retrieval to evaluate ranking systems"""
    ap_scores = []
    
    for query in queries:
        retrieved = df[df["query"] == query]
    
        precisions = []
        relevant_found = 0

        ideal_relevance_scores = retrieved.sort_values(by=['relevant'], ascending=False)  # Ideal ranking
        # Ranked is given by order of apperance
        for idx, doc in enumerate(ideal_relevance_scores.itertuples(), start=1):
            if doc.relevant:
                relevant_found += 1
                precisions.append(relevant_found / (idx + 1))
        
        ap_scores.append(np.mean(precisions) if precisions else 0.0)
    
    return ap_scores

In [4]:
file_name ="ioinc_dataset-v3.xlsx" # path to file + file name
sheet = "Sheet 1" # sheet name or sheet number or list of sheet numbers and names

df = pd.read_excel(file_name, sheet_name=sheet, header=0)
print(df.head())

  loinc_num                                   long_common_name  \
0    1988-5  C reactive protein [Mass/volume] in Serum or P...   
1    1959-6                Bicarbonate [Moles/volume] in Blood   
2   10331-7                                 Rh [Type] in Blood   
3   18998-5     Trimethoprim+Sulfamethoxazole [Susceptibility]   
4    1975-2   Bilirubin.total [Mass/volume] in Serum or Plasma   

                       component    system property             query  
0             C reactive protein  Ser/Plas     MCnc  GLUCOSE IN BLOOD  
1                    Bicarbonate       Bld     SCnc  GLUCOSE IN BLOOD  
2                             Rh       Bld     Type  GLUCOSE IN BLOOD  
3  Trimethoprim+Sulfamethoxazole   Isolate     Susc  GLUCOSE IN BLOOD  
4                      Bilirubin  Ser/Plas     MCnc  GLUCOSE IN BLOOD  


In [None]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
total_docs = len(df)

df['text'] = df.apply(lambda row: f"{row['long_common_name']}\n"
                                           f"{row['component']}, {row['system']}, {row['property']}\n---", axis=1)

random.seed(1221)
# Assigned a relevance between 0 (not relevant) and 5 (very relevant)
df["relevant"] = df.apply(lambda row: round((random.random() * 100) / 20), axis=1)
queries = df["query"].unique()
query_size = len(queries)
initialization_weight = 1/query_size
weights_matrix = np.full(query_size, initialization_weight, dtype=float)

print(calculate_map(df, queries))
print(ndcg(df, queries))
print(bm25(df, queries, stop_words, stemmer))
print(calculate_precision(df, queries, 10))
print(calculate_recall(df, queries, 10))

In [6]:
def calculate_h_score(weak_ranker_result, weight):
    return np.sum(weak_ranker_result)*weight_matrix

h1 = calculate_map(df, queries)
h2 = ndcg(df, queries))
h3 = bm25(df, queries, stop_words, stemmer)
h4 = calculate_precision(df, queries, 10)
h5 = calculate_recall(df, queries, 10)

# List of weak rankers
weak_rankers = [h1, h2, h3, h4, h5]

# Number of iterations (number of rankers to combine)
T = len(weak_rankers)

# Initialize the final ranking model
final_ranking_model = np.zeros(query_size)

# AdaRank Algorithm
for t in range(T):
    # Select the best weak ranker at this step
    weak_ranker = weak_rankers[t]
    
    # Compute weighted performance
    weighted_performance = calculate_h_score(weak_ranker, weights_matrix[])

    # Compute alpha_t (importance of this weak ranker)
    alpha_t = 0.5 * np.log(
        (np.sum(weights_matrix * (1 + weak_ranker))) /
        (np.sum(weights_matrix * (1 - weak_ranker)))
    )

    # Update the ranking model
    final_ranking_model += alpha_t * weak_ranker

    # Update weight distribution for next iteration
    weights_matrix = np.exp(-weak_ranker)  # Compute new weights
    weights_matrix /= np.sum(weights_matrix)  # Normalize to sum to 1
    

0    0
1    2
2    1
3    3
4    3
5    3
6    3
7    2
8    1
9    5
Name: relevant, dtype: int64