In [1]:
import math
import random
import numpy as np
from rank_bm25 import BM25Okapi 
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Download stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/vscode/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
def preprocess(text, stop_words, stemmer):
    """Text preprocessing pipeline"""
    tokens = nltk.word_tokenize(text.lower())
    tokens = [subword for word in tokens for subword in (word.split("/") if "/" in word else [word])]
    tokens = [t for t in tokens if t not in stop_words and t.isalnum()]
    if stemmer:
        return [stemmer.stem(t) for t in tokens]
    return tokens

def dcg(relevance_scores):
    """Compute Discounted Cumulative Gain (DCG)"""
    return np.sum([rel.relevant / np.log2(idx + 2) for idx, rel in enumerate(relevance_scores.itertuples(), start=1)])

def ndcg(df, queries):
    """Compute Normalized Discounted Cumulative Gain (NDCG)"""

    scores = []
    for query in queries:
        retrieved = df[df["query"] == query]
        
        dcg_value = dcg(retrieved)
        ideal_relevance_scores = retrieved.sort_values(by=['relevant'], ascending=False)  # Ideal ranking
        idcg_value = dcg(ideal_relevance_scores)

        score = dcg_value / idcg_value if idcg_value > 0 else 0.0
        df.loc[df["query"] == query, "ndcg"] = score
        scores.append(score)
        
    return scores

def bm25(df, queries, stop_words, stemmer):
    """Rank-BM25"""

    scores = []
    for query in queries:
        retrieved = df[df["query"] == query]
        
        tokenized_corpus = [preprocess(doc, stop_words, stemmer) for doc in retrieved['text']]
        bm25 = BM25Okapi(tokenized_corpus)

        score = np.mean(bm25.get_scores(preprocess(query, stop_words, stemmer)))
        df.loc[df["query"] == query, "bm25"] = score
        scores.append(score)

    return scores

def calculate_precision(df, queries, first_n):
    """Calculate precision given a number of 'x' elements"""
    scores = []
    for query in queries:
        retrieved = df[df["query"] == query]

        precision_values = []
        # Five relevant categories
        for i in range(5):
            return_docs = retrieved.head(first_n)
            in_query = return_docs[return_docs["relevant"] == i]
            precision_values.append(len(in_query)/len(retrieved))

        score = np.mean(precision_values) if precision_values else 0.0
        df.loc[df["query"] == query, "precision"] = score
        scores.append(score)

    return scores

def calculate_recall(df, queries, first_n):
    """Calculate precision given a number of 'x' elements"""
    scores = []
    for query in queries:
        retrieved = df[df["query"] == query]

        recall_values = []
        # Five relevant categories
        for i in range(5):
            return_docs = retrieved.head(first_n)
            in_query = return_docs[return_docs["relevant"] == i]
            recall_values.append(len(in_query)/len(df))

        score = np.mean(recall_values) if recall_values else 0.0
        df.loc[df["query"] == query, "recall"] = score
        scores.append(score)

    return scores
    
def calculate_map(df, queries):
    """Calculate Mean Average Precision (MAP) is a common metric used in information retrieval to evaluate ranking systems"""
    ap_scores = []
    
    for query in queries:
        retrieved = df[df["query"] == query]
    
        precisions = []
        relevant_found = 0

        ideal_relevance_scores = retrieved.sort_values(by=['relevant'], ascending=False)  # Ideal ranking
        # Ranked is given by order of apperance
        for idx, doc in enumerate(ideal_relevance_scores.itertuples(), start=1):
            if doc.relevant:
                relevant_found += 1
                precisions.append(relevant_found / (idx + 1))

        ap_score = np.mean(precisions) if precisions else 0.0
        df.loc[df["query"] == query, "map"] = ap_score
        ap_scores.append(ap_score)
    
    return ap_scores

In [4]:
file_name ="ioinc_dataset-v3.xlsx" # path to file + file name
sheet = "Sheet 1" # sheet name or sheet number or list of sheet numbers and names

df = pd.read_excel(file_name, sheet_name=sheet, header=0)
print(df.head())

  loinc_num                                   long_common_name  \
0    1988-5  C reactive protein [Mass/volume] in Serum or P...   
1    1959-6                Bicarbonate [Moles/volume] in Blood   
2   10331-7                                 Rh [Type] in Blood   
3   18998-5     Trimethoprim+Sulfamethoxazole [Susceptibility]   
4    1975-2   Bilirubin.total [Mass/volume] in Serum or Plasma   

                       component    system property             query  
0             C reactive protein  Ser/Plas     MCnc  GLUCOSE IN BLOOD  
1                    Bicarbonate       Bld     SCnc  GLUCOSE IN BLOOD  
2                             Rh       Bld     Type  GLUCOSE IN BLOOD  
3  Trimethoprim+Sulfamethoxazole   Isolate     Susc  GLUCOSE IN BLOOD  
4                      Bilirubin  Ser/Plas     MCnc  GLUCOSE IN BLOOD  


In [5]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
total_docs = len(df)

df['text'] = df.apply(lambda row: f"{row['long_common_name']}\n"
                                           f"{row['component']}, {row['system']}, {row['property']}\n---", axis=1)
# random.seed(1221)
# Assigned a relevance between 0 (not relevant) and 5 (very relevant)
# df["relevant"] = df.apply(lambda row: round((random.random() * 100) / 20), axis=1)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["text"])
query_vectors = vectorizer.transform(df["query"])

# Compute cosine similarity
similarities = [cosine_similarity(query_vectors[i], X[i]).flatten()[0] for i in range(len(df))]

# Add similarity scores to DataFrame
df["similarities"] = similarities

# Assign relevance based on TfIDF cosine similarity (0 to 5)
df["relevant"] = pd.cut(df["similarities"], bins=6, labels=[0, 1, 2, 3, 4, 5], include_lowest=True)

queries = df["query"].unique()
query_size = len(queries)
initialization_weight = 1/query_size
weights_matrix = np.full(query_size, initialization_weight, dtype=float)

In [6]:
def compute_features(df, query, stop_words, stemmer):
    h1 = calculate_map(df, queries)
    h2 = ndcg(df, queries)
    h3 = bm25(df, queries, stop_words, stemmer)
    h4 = calculate_precision(df, queries, 10)
    h5 = calculate_recall(df, queries, 10)

    # Stack feature vectors per document
    return [h1,h2,h3,h4,h5]

In [7]:
# List of weak rankers
weak_rankers = compute_features(df, queries, stop_words, stemmer)

# Number of iterations (number of rankers to combine)
T = len(weak_rankers)

# Initialize the final ranking model
final_ranking_model = np.zeros(query_size)

def train_model(T, weak_rankers, weights_matrix, final_ranking_model, df, epoch):
    # AdaRank Algorithm
    for t in range(T):
        weak_ranker = np.array(weak_rankers[t])

        # Compute alpha_t (importance of this weak ranker)
        numerator = np.sum(weights_matrix * (1 + weak_ranker))
        denominator = np.sum(weights_matrix * (1 - weak_ranker))
    
        if denominator == 0:  # Avoid division by zero
            alpha_t = 0  # If denominator is zero, weak ranker is irrelevant
        else:
            alpha_t = 0.5 * np.log(numerator / denominator)
    
        # Update the ranking model
        final_ranking_model += alpha_t * weak_ranker  # Correct way to combine weak rankers
    
        # Update weight distribution for the next iteration
        weight_update = np.exp(-weak_ranker)  # Compute exponentiation for E-based weight update
        weights_matrix = weight_update / np.sum(weight_update)  # Normalize to ensure sum=1

    for idx, feature in enumerate(["MAP", "NDCG", "BM25", "Precision", "Recall"]):
        df[f"{feature}_Weight_Epoch_{epoch}"] = final_ranking_model[idx]
    
    return weights_matrix

# Train for 10 epochs
for i in range(10):
    weights_matrix = train_model(T, weak_rankers, weights_matrix, final_ranking_model, df, i)

In [8]:
# Final ranking function
def final_ranker(query_index):
    return final_ranking_model[query_index]

# Example usage
print("Final ranking weight scores:", final_ranking_model)

Final ranking weight scores: [11.10398833  9.53828903  1.93831303 16.60134507 11.93239951]


In [12]:
# Export results
df.to_excel("output.xlsx") 

In [9]:
def rank_new_query(features, trained_model):
    """Rates how similar a given set of documents to a query"""
    scores = np.dot(features, trained_model)  # Rank docs using AdaRank weights
    return scores

In [10]:
# Example implementation 

documents = [
    "Machine learning is a method of data analysis that automates analytical model building.",
    "Artificial intelligence is intelligence demonstrated by machines, as opposed to human intelligence.",
    "The capital of France is Paris.",
    "Deep learning is a subset of machine learning that uses neural networks."
    "blah"
]
query = "AI and deep learning are part of machine learning."

df_test = pd.DataFrame({"text": documents})
df_test["query"] = query

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_test["text"])
query_vectors = vectorizer.transform(df_test["query"])

# Compute cosine similarity
similarities = [cosine_similarity(query_vectors[i], X[i]).flatten()[0] for i in range(len(df_test))]

# Add similarity scores to DataFrame
df_test["similarities"] = similarities

# Assign relevance based on TfIDF cosine similarity (0 to 5)
df_test["relevant"] = pd.cut(df_test["similarities"], bins=6, labels=[0, 1, 2, 3, 4, 5], include_lowest=True)

queries = df_test["query"].unique()

rankers = compute_features(df_test, queries, stop_words, stemmer)

rank_new_query(np.array(rankers).T, final_ranking_model)

array([19.24788252])

Unnamed: 0,loinc_num,long_common_name,component,system,property,query,text,similarities,relevant,map,...,MAP_Weight_Epoch_8,NDCG_Weight_Epoch_8,BM25_Weight_Epoch_8,Precision_Weight_Epoch_8,Recall_Weight_Epoch_8,MAP_Weight_Epoch_9,NDCG_Weight_Epoch_9,BM25_Weight_Epoch_9,Precision_Weight_Epoch_9,Recall_Weight_Epoch_9
0,1988-5,C reactive protein [Mass/volume] in Serum or P...,C reactive protein,Ser/Plas,MCnc,GLUCOSE IN BLOOD,C reactive protein [Mass/volume] in Serum or P...,0.02254,0,0.754592,...,9.993587,8.584457,1.744482,14.941208,10.739157,11.103988,9.538289,1.938313,16.601345,11.9324
