In [115]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import os


query_mapping = {
    "glucose in blood": {
        "component": "glucose",
        "system": "blood"
    },
    "bilirubin in plasma": {
        "component": "bilirubin",
        "system": "plasma"
    },
    "white blood cells count": {
        "component": "leukocytes",
        "system": "blood"
    },
    "calcium in serum": {
        "component": "calcium",
        "system": "serum"
    }
}

file_path = "./LOINC_Dataset"

In [116]:
abbreviation_mapping = {
    'c': 'component',
    'mcnc': 'mass concentration',
    'bld': 'blood',
    'scnc': 'substance concentration',
    'susc': 'susceptibility',
    'acnc': 'amount concentration',
    'plas': 'plasma',
    'ccnc': 'cell concentration',
    'ncnc': 'number concentration',
    'XXX': 'unknown',
    '^bpu': 'body part or unit',
    'fld': 'field',
    'abo': 'abo blood group',
    'ser': 'serum',
    'mscnc': 'mass substance concentration'
}

nltk.download("stopwords")
nltk.download("wordnet")

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joseantonioruizheredia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/joseantonioruizheredia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [117]:
def clean_text(text):
    if isinstance(text, str):
        text = text.lower() 
        text = re.sub(r'[^\w\s]', ' ', text)  
        words = text.split()  
        words = [word for word in words if word not in stop_words]  
        words = [lemmatizer.lemmatize(word) for word in words] 
        return " ".join(words)
    return ""


def replace_abbreviations(text):
    if isinstance(text, str):
        words = text.split()
        words = [abbreviation_mapping.get(word, word) for word in words]  
        return " ".join(words)
    return text


In [118]:
column_weights = {
    'name': 1.5,
    'component': 6.0,
    'long_common_name': 1.0,
    'system': 3.0,
    'property': 1.0,
    'measurement_type': 1.0,
    'loinc_num': 0,
    'status': 0.5,
    'example_units': 1.0
}

global embedding_model
if 'embedding_model' not in globals():
    try:
        embedding_model = SentenceTransformer('pritamdeka/BioBERT-MNLI')
    except:
        try:
            embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        except:
            embedding_model = None

In [119]:
def calculate_score(query, query_df, row, debug=False):
    query_embedding = get_query_embedding(query)
    debug_info = {"query": query, "embedding_score": [], "traditional_score": []}
    
    traditional_score = calculate_traditional_score(query, row, debug_info)
    embedding_score = calculate_embedding_score(query_embedding, query_df, row, debug_info)
    score = traditional_score + embedding_score
    
    debug_info["final_score"] = score
    
    if debug:
        print(debug_info)
   
    return score


def get_query_embedding(query):
    if embedding_model:
        try:
            return embedding_model.encode(query.lower())
        except Exception as e:
            print(f"Embedding encoding error: {e}")
    return None

def calculate_traditional_score(query, row, debug_info):
    score = 0
    
    query_component = query_mapping[query]["component"].lower()
    query_system = query_mapping[query]["system"].lower()


    component = row.get("COMPONENT", "").lower() 
    system = row.get("SYSTEM", "").lower()  
    
    if query_component == component:
        score += column_weights.get("component", 1.0)  * column_weights.get("component", 1.0) 
    elif query_component in component:
        score += (column_weights.get("component", 1.0) * 0.5) * column_weights.get("component", 1.0)   
    
    if query_system == system:
        score += column_weights.get("system", 1.0)  * column_weights.get("system", 1.0)  
    elif query_system in system:
        score += (column_weights.get("system", 1.0) * 0.5) * column_weights.get("system", 1.0) 

    debug_info["traditional_score"].append({"score": score})
        
    return score

def calculate_embedding_score(query_embedding, query_df, row, debug_info):
    score = 0
    if embedding_model and query_embedding is not None:
        for col in query_df.select_dtypes(include=["object"]).columns:
            if col in row and pd.notna(row[col]):
                cell_text = str(row[col]).lower()
                weight = column_weights.get(col, 1.0)
                try:
                    cell_embedding = embedding_model.encode(cell_text)
                    similarity = cosine_similarity([query_embedding], [cell_embedding])[0][0]
                    embedding_score = ((similarity + 1) / 2) * 5 * weight
                    score += embedding_score
                except Exception as e:
                    print(f"Embedding similarity error: {e}")
        debug_info["embedding_score"].append({"score": score})
    return score


In [None]:
def process_folder(csv_folder):
    csv_files = [f for f in os.listdir(csv_folder) if f.endswith(".csv")]
    output_filename = "results_enhanced.csv" 

    all_results = [] 

    for query_name, _ in query_mapping.items():
        print(f"Processing query: {query_name}")

        for csv_file in csv_files:
            file_path = os.path.join(csv_folder, csv_file)
            print(f"Reading file: {file_path}")

            try:
                dataset = pd.read_csv(file_path)
            except Exception as e:
                print(f"Skipping {csv_file} due to read error: {e}")
                continue  
            
            dataset.columns = dataset.columns.str.strip().str.upper()
            dataset.rename(columns={"LONG_COMMON_NAME": "NAME"}, inplace=True)
            dataset["MEASUREMENT_TYPE"] = dataset["NAME"].apply(
                lambda x: re.findall(r"\[(.*?)\]", x)[0] if isinstance(x, str) and "[" in x else ""
            )
            
            for col in dataset.select_dtypes(include=["object"]).columns:
                if col != "LOINC_NUM":  
                    dataset[col] = dataset[col].apply(clean_text)
                    dataset[col] = dataset[col].apply(replace_abbreviations)
            
            for _, row in dataset.iterrows():
                score = calculate_score(query_name, dataset, row, debug=(row.name < 5))
                all_results.append([
                    query_name, row.iloc[0], row.iloc[14], row.iloc[1], 
                    row.iloc[4], row.iloc[2], row.iloc[19], row.iloc[9], 
                    row.iloc[13], score
                ])

    if all_results:
        results_df = pd.DataFrame(all_results, columns=[
            "Query", "LOINC Code", "Name", "Component", "System", 
            "Property", "Measurement", "Status", "Units", "Score"
        ])

        min_score, max_score = results_df["Score"].min(), results_df["Score"].max()
        results_df["Normalized_Score"] = results_df["Score"].apply(
            lambda s: (s - min_score) / (max_score - min_score) if max_score != min_score else 1.0
        )
        
        results_df.drop(columns=["Score"], inplace=True)
        results_df.drop_duplicates(subset="LOINC Code", keep="first", inplace=True)

        results_df.to_csv(output_filename, mode='a', index=False, header=not os.path.exists(output_filename))
        print(f"Results appended to {output_filename}")

process_folder(file_path)


Processing query: glucose in blood
Reading file: ./LOINC_Dataset/bilirubin_in_plasma.csv
{'query': 'glucose in blood', 'embedding_score': [{'score': np.float32(44.857346)}], 'traditional_score': [{'score': 0}], 'final_score': np.float32(44.857346)}
{'query': 'glucose in blood', 'embedding_score': [{'score': np.float32(43.959023)}], 'traditional_score': [{'score': 0}], 'final_score': np.float32(43.959023)}
{'query': 'glucose in blood', 'embedding_score': [{'score': np.float32(44.26867)}], 'traditional_score': [{'score': 0}], 'final_score': np.float32(44.26867)}
{'query': 'glucose in blood', 'embedding_score': [{'score': np.float32(45.496178)}], 'traditional_score': [{'score': 0}], 'final_score': np.float32(45.496178)}
{'query': 'glucose in blood', 'embedding_score': [{'score': np.float32(45.245346)}], 'traditional_score': [{'score': 0}], 'final_score': np.float32(45.245346)}
Reading file: ./LOINC_Dataset/glucose_in_blood.csv
{'query': 'glucose in blood', 'embedding_score': [{'score': np

KeyboardInterrupt: 