In [23]:
import math
import random
import numpy as np
from collections import Counter

import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Download stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/vscode/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [110]:
def preprocess(text, stop_words, stemmer):
    """Text preprocessing pipeline"""
    tokens = nltk.word_tokenize(text.lower())
    tokens = [subword for word in tokens for subword in (word.split("/") if "/" in word else [word])]
    tokens = [t for t in tokens if t not in stop_words and t.isalnum()]
    if stemmer:
        return [stemmer.stem(t) for t in tokens]
    return tokens

def dcg(relevance_scores):
    """Compute Discounted Cumulative Gain (DCG)"""
    return np.sum([rel.relevant / np.log2(idx + 2) for idx, rel in enumerate(relevance_scores.itertuples(), start=1)])

def ndcg(df, queries):
    """Compute Normalized Discounted Cumulative Gain (NDCG)"""

    scores = []
    for query in queries:
        retrieved = df[df["query"] == query]
        
        dcg_value = dcg(retrieved)
        ideal_relevance_scores = retrieved.sort_values(by=['relevant'], ascending=False)  # Ideal ranking
        idcg_value = dcg(ideal_relevance_scores)
    
        scores.append(dcg_value / idcg_value if idcg_value > 0 else 0.0)

    return np.mean(scores) if scores else 0.0
   
def calculate_map(df, queries):
    """Calculate Mean Average Precision (MAP) is a common metric used in information retrieval to evaluate ranking systems"""
    ap_scores = []
    
    for query in queries:
        retrieved = df[df["query"] == query]
    
        precisions = []
        relevant_found = 0

        # Ranked is given by order of apperance
        for idx, doc in enumerate(retrieved.itertuples(), start=1):
            if doc.relevant:
                relevant_found += 1
                precisions.append(relevant_found / (idx + 1))
        
        ap_scores.append(np.mean(precisions) if precisions else 0.0)
    
    return np.mean(ap_scores) if ap_scores else 0.0

In [4]:
file_name ="ioinc_dataset-v3.xlsx" # path to file + file name
sheet = "Sheet 1" # sheet name or sheet number or list of sheet numbers and names

df = pd.read_excel(file_name, sheet_name=sheet, header=0)
print(df.head())

  loinc_num                                   long_common_name  \
0    1988-5  C reactive protein [Mass/volume] in Serum or P...   
1    1959-6                Bicarbonate [Moles/volume] in Blood   
2   10331-7                                 Rh [Type] in Blood   
3   18998-5     Trimethoprim+Sulfamethoxazole [Susceptibility]   
4    1975-2   Bilirubin.total [Mass/volume] in Serum or Plasma   

                       component    system property             query  
0             C reactive protein  Ser/Plas     MCnc  GLUCOSE IN BLOOD  
1                    Bicarbonate       Bld     SCnc  GLUCOSE IN BLOOD  
2                             Rh       Bld     Type  GLUCOSE IN BLOOD  
3  Trimethoprim+Sulfamethoxazole   Isolate     Susc  GLUCOSE IN BLOOD  
4                      Bilirubin  Ser/Plas     MCnc  GLUCOSE IN BLOOD  


In [111]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
total_docs = len(df)

df['text'] = df.apply(lambda row: f"{row['long_common_name']}\n"
                                           f"{row['component']}, {row['system']}, {row['property']}\n---", axis=1)

random.seed(1221)
# Assigned a relevance between 0 (weight 80%) and 1 (weight 20%)
df["relevant"] = df.apply(lambda row: random.random() < 0.20, axis=1)
queries = ["GLUCOSE IN BLOOD", "BILIRUBIN IN PLASMA", "WHITE BLOOD CELLS COUNT"]

print(calculate_map(df, queries))
print(ndcg(df, queries))

0.184889999749017
0.5990864192572847
