In [73]:
import os
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the Excel file.
# The location can be overridden with the LOINC_DATASET_PATH environment
# variable so the notebook can run on another machine; when the variable is
# not set, the original local path is used.
file_path = os.environ.get(
    "LOINC_DATASET_PATH",
    "C:/Users/bilk7/OneDrive/Bureau/Cours/M1/Information Retrieval/Project3/Lab-MLRankingAssignment/loinc_dataset-v2.xlsx",
)
# Skip the first 2 rows of the sheet
df_original = pd.read_excel(file_path, skiprows=2)
# Work on a copy: df is cleaned below, df_original keeps the raw values and
# receives the relevance scores at the end of the notebook
df = df_original.copy()
query = "GLUCOSE IN BLOOD"

print(df.head())

  loinc_num                                   long_common_name  \
0    1988-5  C reactive protein [Mass/volume] in Serum or P...   
1    1959-6                Bicarbonate [Moles/volume] in Blood   
2   10331-7                                 Rh [Type] in Blood   
3   18998-5     Trimethoprim+Sulfamethoxazole [Susceptibility]   
4    1975-2   Bilirubin.total [Mass/volume] in Serum or Plasma   

                       component    system property  
0             C reactive protein  Ser/Plas     MCnc  
1                    Bicarbonate       Bld     SCnc  
2                             Rh       Bld     Type  
3  Trimethoprim+Sulfamethoxazole   Isolate     Susc  
4                      Bilirubin  Ser/Plas     MCnc  


In [74]:
df = df.rename(columns={"long_common_name": "name"})


def _measurement_type(name):
    """Return the text inside the first [...] group of name, or "" if there is none.

    Non-string values (e.g. NaN for a missing name) and names with an opening
    bracket but no closing one yield "" instead of raising.
    """
    found = re.search(r"\[(.*?)\]", name) if isinstance(name, str) else None
    return found.group(1) if found else ""


# Extract measurement type from brackets [] to create a new column
df["measurement_type"] = df["name"].apply(_measurement_type)

# Remove every [...] group from the name; non-string values are left unchanged
df["name"] = df["name"].apply(lambda x: re.sub(r"\[.*?\]", "", x).strip() if isinstance(x, str) else x)

# Display first few rows
print(df.head())

  loinc_num                                    name  \
0    1988-5  C reactive protein  in Serum or Plasma   
1    1959-6                   Bicarbonate  in Blood   
2   10331-7                            Rh  in Blood   
3   18998-5           Trimethoprim+Sulfamethoxazole   
4    1975-2     Bilirubin.total  in Serum or Plasma   

                       component    system property measurement_type  
0             C reactive protein  Ser/Plas     MCnc      Mass/volume  
1                    Bicarbonate       Bld     SCnc     Moles/volume  
2                             Rh       Bld     Type             Type  
3  Trimethoprim+Sulfamethoxazole   Isolate     Susc   Susceptibility  
4                      Bilirubin  Ser/Plas     MCnc      Mass/volume  


In [75]:
# Abbreviation mapping dictionary.
# Keys must be lowercase and free of punctuation: replace_abbreviations() looks
# up tokens produced by clean_text(), which lowercases the text and replaces
# punctuation with spaces before splitting it into words.
# NOTE(review): several expansions (e.g. 'fld', 'acnc', 'ccnc', 'bpu') look
# doubtful against the LOINC abbreviation tables — verify before relying on them.
abbreviation_mapping = {
    # NOTE(review): a lone 'c' also rewrites names such as "C reactive protein"
    # into "component reactive protein" — confirm this expansion is intended.
    'c': 'component',
    'mcnc': 'mass concentration',
    'bld': 'blood',
    'scnc': 'substance concentration',
    'susc': 'susceptibility',
    'acnc': 'amount concentration',
    'plas': 'plasma',
    'ccnc': 'cell concentration',
    'ncnc': 'number concentration',
    'xxx': 'unknown',  # lowercase so it matches the lowercased tokens
    'bpu': 'body part or unit',  # no leading '^': punctuation is removed before lookup
    'fld': 'field',
    'abo': 'abo blood group',
    'ser': 'serum',
    'mscnc': 'mass substance concentration'
}

# Fetch the NLTK corpora used by the cleaning step
for resource in ("stopwords", "wordnet"):
    nltk.download(resource)

# Shared preprocessing helpers: the lemmatizer and the English stopword set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bilk7\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bilk7\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [76]:
# Standardization and Cleaning
def clean_text(text):
    """Normalize a text cell for matching.

    The text is lowercased, punctuation is replaced with spaces, English
    stopwords are dropped and each remaining token is lemmatized. The tokens
    are returned joined by single spaces. Non-string input returns "".
    """
    if not isinstance(text, str):
        return ""
    # Every character that is neither a word character nor whitespace becomes a space
    normalized = re.sub(r'[^\w\s]', ' ', text.lower())
    kept = (token for token in normalized.split() if token not in stop_words)
    return " ".join(lemmatizer.lemmatize(token) for token in kept)

# Function to replace abbreviations
def replace_abbreviations(text):
    """Expand known abbreviations in a whitespace-separated string.

    Each token found in abbreviation_mapping is replaced by its expansion and
    all other tokens are kept. Non-string input is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    expanded = []
    for token in text.split():
        # Fall back to the token itself when it has no mapping
        expanded.append(abbreviation_mapping.get(token, token))
    return " ".join(expanded)

# Clean every object-dtype column except the LOINC identifier, then expand
# abbreviations in the cleaned text
for col in (c for c in df.select_dtypes(include=["object"]).columns if c != "loinc_num"):
    df[col] = df[col].apply(clean_text).apply(replace_abbreviations)

print(df.head(10))

  loinc_num                                      name  \
0    1988-5   component reactive protein serum plasma   
1    1959-6                         bicarbonate blood   
2   10331-7                                  rh blood   
3   18998-5             trimethoprim sulfamethoxazole   
4    1975-2              bilirubin total serum plasma   
5     890-4  blood group antibody screen serum plasma   
6   20565-8                carbon dioxide total blood   
7   18906-8                             ciprofloxacin   
8    2143-6                     cortisol serum plasma   
9    2075-0                     chloride serum plasma   

                       component        system                 property  \
0     component reactive protein  serum plasma       mass concentration   
1                    bicarbonate         blood  substance concentration   
2                             rh         blood                     type   
3  trimethoprim sulfamethoxazole       isolate           susceptibility 

In [77]:
# Per-column multipliers used by the scoring functions; a column that is not
# listed here is weighted 1.0 by their column_weights.get(col, 1.0) lookups
column_weights = dict(
    name=2.0,
    component=1.8,
    # NOTE(review): this column is renamed to "name" earlier in the notebook,
    # so this entry is probably never looked up — confirm
    long_common_name=1.5,
    system=1.2,
    property=1.2,
    measurement_type=1.0,
    loinc_num=0,  # the identifier column contributes nothing to the score
)

# Load the pre-trained sentence-embedding model once per kernel session.
# The globals() guard skips the load when this cell is run again.
if 'embedding_model' not in globals():
    # Try the biomedical model first, then the general-purpose fallback
    for model_name in ('pritamdeka/BioBERT-MNLI', 'all-MiniLM-L6-v2'):
        try:
            embedding_model = SentenceTransformer(model_name)
            break
        except Exception as e:
            # Report the failure instead of hiding it; catching Exception
            # (not a bare except) lets KeyboardInterrupt stop the cell
            print(f"Could not load embedding model '{model_name}': {e}")
    else:
        # Neither model could be loaded: the scoring functions check for this
        # and skip the embedding part of the score
        embedding_model = None
        print("No embedding model available; embedding scores will be skipped.")

In [78]:
# Function to calculate relevance score using pre-trained embeddings
def calculate_score(query, row, debug=False):
    """Compute the relevance score of one row for the given query.

    The result is the sum of calculate_traditional_score() (word overlap) and
    calculate_embedding_score() (embedding similarity).

    Args:
        query: query string; it is lowercased and split on whitespace here.
        row: the row to score, passed through to both scoring helpers.
        debug: when true, print the collected debug dictionary.

    Returns:
        The combined score.
    """
    # Best weight recorded per matched word; filled in by the traditional scorer
    matched_words = {}
    debug_info = {"query": query, "matched_words": {}, "embedding_scores": []}

    tokens = set(query.lower().split())
    query_embedding = get_query_embedding(query)

    lexical_part = calculate_traditional_score(tokens, row, matched_words, debug_info)
    embedding_part = calculate_embedding_score(query_embedding, row, debug_info)
    score = lexical_part + embedding_part

    debug_info.update(matched_words=matched_words, final_score=score)
    if debug:
        print(debug_info)

    return score

def get_query_embedding(query):
    """Encode the lowercased query with the global embedding_model.

    Returns None when embedding_model is falsy (e.g. None because no model
    could be loaded) or when encoding raises; in the latter case the error is
    printed.
    """
    if not embedding_model:
        return None
    try:
        embedding = embedding_model.encode(query.lower())
    except Exception as e:
        print(f"Embedding encoding error: {e}")
        return None
    return embedding

def calculate_traditional_score(query_words, row, matched_words, debug_info):
    """Score the exact word overlap between the query and the row's text columns.

    Args:
        query_words: set of query tokens to look for.
        row: the row being scored; each object-dtype column of df that is
            present in the row and not NaN is lowercased and split into words.
        matched_words: dict updated in place. It maps each matched query word
            to the highest weight among the columns it was found in, so a word
            found in several columns is not counted twice.
        debug_info: accepted for signature parity with
            calculate_embedding_score(); not modified here.

    Returns:
        The sum of the weights stored in matched_words (0 when nothing matched).
    """
    for col in df.select_dtypes(include=["object"]).columns:
        if col in row and pd.notna(row[col]):
            cell_words = set(str(row[col]).lower().split())
            weight = column_weights.get(col, 1.0)

            for word in query_words & cell_words:
                if word not in matched_words or weight > matched_words[word]:
                    matched_words[word] = weight  # Keep the highest weight

    # One contribution per matched word, at its best column weight.
    # NOTE(review): an earlier comment mentioned "30% of score"; no such
    # scaling is applied here — confirm the intended blend with the
    # embedding score.
    return sum(matched_words.values())

def calculate_embedding_score(query_embedding, row, debug_info):
    """Score the row by embedding similarity between the query and each text column.

    For every object-dtype column of df that is present in the row and not NaN,
    the lowercased cell text is encoded and compared with query_embedding using
    cosine similarity. The similarity is rescaled as ((similarity + 1) / 2) * 5
    and multiplied by the column weight (1.0 when the column has no entry in
    column_weights).

    Args:
        query_embedding: encoded query, or None when it could not be computed.
        row: the row being scored.
        debug_info: dict whose "embedding_scores" list receives one entry per
            scored column.

    Returns:
        The sum of the per-column scores; 0 when embedding_model is falsy or
        query_embedding is None.
    """
    score = 0
    if not embedding_model or query_embedding is None:
        return score

    for col in df.select_dtypes(include=["object"]).columns:
        if col not in row or not pd.notna(row[col]):
            continue
        cell_text = str(row[col]).lower()
        weight = column_weights.get(col, 1.0)
        try:
            cell_embedding = embedding_model.encode(cell_text)
            similarity = cosine_similarity([query_embedding], [cell_embedding])[0][0]
            embedding_score = ((similarity + 1) / 2) * 5 * weight
            score += embedding_score
            debug_info["embedding_scores"].append(
                {"column": col, "similarity": similarity, "score": embedding_score}
            )
        except Exception as e:
            # A failure on one column is reported and the remaining columns are still scored
            print(f"Embedding similarity error: {e}")
    return score

# Compute relevance scores on df; debug output is printed for rows whose index
# label is below 5 (the first 5 rows with the default index)
df["relevance_score"] = df.apply(lambda row: calculate_score(query, row, debug=row.name < 5), axis=1)

# Merge scores into df_original based on loinc_num.
# Any relevance_score column left by a previous run of this cell is dropped
# first, so re-running does not create relevance_score_x / relevance_score_y.
df_original = df_original.drop(columns=["relevance_score"], errors="ignore").merge(
    df[["loinc_num", "relevance_score"]], on="loinc_num", how="left"
)

# Save results under a file name derived from the query
# ("GLUCOSE IN BLOOD" -> "glucose_in_blood_relevance_scores.csv")
output_path = query.lower().replace(" ", "_") + "_relevance_scores.csv"
df_original.to_csv(output_path, index=False)

# Report the name of the file that was actually written
print("CSV with relevance scores has been saved as '" + output_path + "'.")
print(df_original.head())

{'query': 'GLUCOSE IN BLOOD', 'matched_words': {}, 'embedding_scores': [{'column': 'loinc_num', 'similarity': np.float32(0.082764074), 'score': np.float32(0.0)}, {'column': 'name', 'similarity': np.float32(0.1637841), 'score': np.float32(5.8189206)}, {'column': 'component', 'similarity': np.float32(-0.0010643341), 'score': np.float32(4.49521)}, {'column': 'system', 'similarity': np.float32(0.42186362), 'score': np.float32(4.2655907)}, {'column': 'property', 'similarity': np.float32(0.16001825), 'score': np.float32(3.4800546)}, {'column': 'measurement_type', 'similarity': np.float32(0.14504647), 'score': np.float32(2.862616)}], 'final_score': np.float32(20.922392)}
{'query': 'GLUCOSE IN BLOOD', 'matched_words': {'blood': 2.0}, 'embedding_scores': [{'column': 'loinc_num', 'similarity': np.float32(0.10215171), 'score': np.float32(0.0)}, {'column': 'name', 'similarity': np.float32(0.35837346), 'score': np.float32(6.7918673)}, {'column': 'component', 'similarity': np.float32(0.14276451), 's