In [215]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import os


# Query Mapping Dictionary
The `query_mapping` dictionary links medical queries to their respective components and systems:

- **"glucose in blood"** → Component: `glucose`, System: `blood`
- **"bilirubin in plasma"** → Component: `bilirubin`, System: `plasma`
- **"white blood cells count"** → Component: `leukocytes`, System: `blood`
- **"calcium in serum"** → Component: `calcium`, System: `serum`

In [216]:
query_mapping = {
    "glucose in blood": {
        "component": "glucose",
        "system": "blood"
    },
    "bilirubin in plasma": {
        "component": "bilirubin",
        "system": "plasma"
    },
    "white blood cells count": {
        "component": "leukocytes",
        "system": "blood"
    },
    "calcium in serum": {
        "component": "calcium",
        "system": "serum"
    }, 
    "cells in urine": {
        "component": "cells",
        "system": "urine"
    }
}

file_path = "./LOINC_Dataset"

# Abbreviation Mapping, Stop Words, and Lemmatization

This script performs **text preprocessing** by:
- Expanding **abbreviations** into full terms using the dictionary `abbreviation_mapping`.
- Removing **common stop words**.
- Applying **lemmatization** to reduce words to their base forms.

In [217]:
abbreviation_mapping = {
    'c': 'component',
    'mcnc': 'mass concentration',
    'bld': 'blood',
    'scnc': 'substance concentration',
    'susc': 'susceptibility',
    'acnc': 'amount concentration',
    'plas': 'plasma',
    'ccnc': 'cell concentration',
    'ncnc': 'number concentration',
    'XXX': 'unknown',
    'bpu': 'blood product unit',
    'fld': 'fluid',
    'abo': 'abo blood group',
    'ser': 'serum',
    'mscnc': 'mass substance concentration',
    'bbl': 'blood product unit',
    'rbc': 'red blood cells', 
    'blda': 'blood group a',
    'bldv': 'blood group v', 
    'tiss': 'tissue',
    'bldco': 'coord blood',
    'csf': 'cerebrospinal fluid'
}

nltk.download("stopwords")
nltk.download("wordnet")

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joseantonioruizheredia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/joseantonioruizheredia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Text Preprocessing in a DataFrame

This script performs text cleaning and abbreviation replacement on text columns in a DataFrame (`df`). It helps standardize text data for further analysis by removing noise and ensuring consistency.

## Functions

### 1. clean_text
Cleans a given text string by:
1. Converting it to **lowercase**.
2. Removing **punctuation** (replacing non-alphanumeric characters with spaces).
3. **Tokenizing** (splitting the text into words).
4. Removing **stop words** (common words that don't contribute much meaning).
5. Applying **lemmatization** (reducing words to their base form).

If the input is not a string, it returns an empty string.

### 2. replace_abbreviations
Replaces known abbreviations in a given text using the `abbreviation_mapping` dictionary.
- Splits the text into words.
- Replaces each word if it exists in the abbreviation dictionary.
- Returns the modified text.


In [218]:
def clean_text(text):
    if isinstance(text, str):
        text = text.lower() 
        text = re.sub(r'[^\w\s]', ' ', text)  
        words = text.split()  
        words = [word for word in words if word not in stop_words]  
        words = [lemmatizer.lemmatize(word) for word in words] 
        return " ".join(words)
    return ""


def replace_abbreviations(text):
    if isinstance(text, str):
        words = text.split()
        words = [abbreviation_mapping.get(word, word) for word in words]  
        return " ".join(words)
    return text


# Column Weights and Embedding Model Initialization

This script defines **column weights** for a scoring system and initializes an **embedding model** for text similarity calculations.

## Column Weights

The `column_weights` dictionary assigns importance to different columns when calculating scores:

- **Higher weights** (e.g., `name`, `component`) indicate greater importance in the scoring process.
- `loinc_num` has a weight of **0** because it is likely an identifier and does not contribute to similarity calculations.

## Embedding Model Initialization

The script attempts to load an embedding model for text similarity using **SentenceTransformer**. 


In [219]:
column_weights = {
    'name': 1.5,
    'component': 6.0,
    'long_common_name': 1.0,
    'system': 3.0,
    'property': 1.0,
    'measurement_type': 1.0,
    'loinc_num': 0,
    'status': 0.5,
    'example_units': 1.0
}

global embedding_model
if 'embedding_model' not in globals():
    try:
        embedding_model = SentenceTransformer('pritamdeka/BioBERT-MNLI')
    except:
        try:
            embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        except:
            embedding_model = None

# Relevance Score Calculation
This script calculates a **relevance score** for each row in a dataset by comparing a query to the dataset's text fields using both **traditional keyword matching** and **semantic similarity via embeddings**.


### 1. calculate_score
This function calculates the relevance score for a given row by:
1. Splitting the query into words and storing them in a set.
2. Initializing an empty dictionary to track matched words.
3. Computing the **traditional** relevance score.
4. Computing the **embedding-based** relevance score.
5. Combining both scores and optionally printing debug information.


### 2. get_query_embedding
Encodes the query into an **embedding vector** using a pre-trained embedding model.
- If the embedding model is available, it encodes the query.
- If an error occurs, it prints an error message and returns `None`.

### 3. calculate_traditional_score
This function calculates a **keyword matching score** based on:
- The presence of query words in the main columns `component` and `system`
- A predefined weight assigned to each column.

### 4. calculate_embedding_score
This function computes the **semantic similarity score** between the query embedding and the row's text fields:
- Encodes the text field into an embedding.
- Uses **cosine similarity** to measure similarity between the query and field.
- Converts the similarity score (ranging from -1 to 1) into a normalized range.
- Applies column weights to adjust the score.
- Adds the result to the final score.



In [220]:
def calculate_score(query, query_df, row, debug=False):
    query_embedding = get_query_embedding(query)
    debug_info = {"query": query, "embedding_score": [], "traditional_score": []}
    
    traditional_score = calculate_traditional_score(query, row, debug_info)
    embedding_score = calculate_embedding_score(query_embedding, query_df, row, debug_info)
    score = traditional_score + embedding_score
    
    debug_info["final_score"] = score
    
    if debug:
        print(debug_info)
   
    return score


def get_query_embedding(query):
    if embedding_model:
        try:
            return embedding_model.encode(query.lower())
        except Exception as e:
            print(f"Embedding encoding error: {e}")
    return None

def calculate_traditional_score(query, row, debug_info):
    score = 0
    
    query_component = query_mapping[query]["component"].lower()
    query_system = query_mapping[query]["system"].lower()


    component = row.get("COMPONENT", "").lower() 
    system = row.get("SYSTEM", "").lower()  
    
    if query_component == component:
        score += column_weights.get("component", 1.0)  * column_weights.get("component", 1.0) 
    elif query_component in component:
        score += (column_weights.get("component", 1.0) * 0.5) * column_weights.get("component", 1.0)   
    
    if query_system == system:
        score += column_weights.get("system", 1.0)  * column_weights.get("system", 1.0)  
    elif query_system in system:
        score += (column_weights.get("system", 1.0) * 0.5) * column_weights.get("system", 1.0) 

    debug_info["traditional_score"].append({"score": score})
        
    return score

def calculate_embedding_score(query_embedding, query_df, row, debug_info):
    score = 0
    if embedding_model and query_embedding is not None:
        for col in query_df.select_dtypes(include=["object"]).columns:
            if col in row and pd.notna(row[col]):
                cell_text = str(row[col]).lower()
                weight = column_weights.get(col, 1.0)
                try:
                    cell_embedding = embedding_model.encode(cell_text)
                    similarity = cosine_similarity([query_embedding], [cell_embedding])[0][0]
                    embedding_score = ((similarity + 1) / 2) * 5 * weight
                    score += embedding_score
                except Exception as e:
                    print(f"Embedding similarity error: {e}")
        debug_info["embedding_score"].append({"score": score})
    return score


# Preprocessing of the files
The `process_folder` function processes multiple CSV files within a specified folder by cleaning and transforming the data, calculating scores based on predefined queries, and saving the enhanced dataset into a new CSV file. Here's a more detailed summary of its steps:

### 1. Data Preprocessing
   - For each query, it processes the CSV files individually.
   - After successfully reading the file, it standardizes the column names by stripping spaces and converting them to uppercase. Additionally, it renames the `"LONG_COMMON_NAME"` column to `"NAME"` for consistency.
   - A new column, `MEASUREMENT_TYPE`, is added by extracting information from the `NAME` column using regular expressions.
   - The function applies text cleaning to object-type columns (except `"LOINC_NUM"`), including removing unwanted characters and replacing abbreviations for better consistency and readability.

### 2. Score Calculation
   - For each row in the dataset, the function calls `calculate_score` to compute a score based on the query and the row’s attributes. These scores, along with other relevant row data (like LOINC code, component, system, etc.), are appended to a results list.

### 3. Normalization and Deduplication
   - Once all the data is processed, the results are converted into a DataFrame.
   - The scores are normalized to a 0-1 scale. 
   - Duplicate rows grouped by query are removed based on the LOINC code.

### 4. Results
   - The processed DataFrame is saved to the output CSV file. If the file already exists, the results are appended rather than overwriting the existing data.


In [None]:
def process_folder(csv_folder):
    csv_files = [f for f in os.listdir(csv_folder) if f.endswith(".csv")]
    output_filename = "./Dataset_with_scores/dataset_scores_enhanced.csv" 

    all_results = [] 

    for query_name, _ in query_mapping.items():
        print(f"Processing query: {query_name}")

        for csv_file in csv_files:
            file_path = os.path.join(csv_folder, csv_file)
            print(f"Reading file: {file_path}")

            try:
                dataset = pd.read_csv(file_path)
            except Exception as e:
                print(f"Skipping {csv_file} due to read error: {e}")
                continue  
            
            dataset.columns = dataset.columns.str.strip().str.upper()
            dataset.rename(columns={"LONG_COMMON_NAME": "NAME"}, inplace=True)
            dataset["MEASUREMENT_TYPE"] = dataset["NAME"].apply(
                lambda x: re.findall(r"\[(.*?)\]", x)[0] if isinstance(x, str) and "[" in x else ""
            )
            
            for col in dataset.select_dtypes(include=["object"]).columns:
                if col != "LOINC_NUM":  
                    dataset[col] = dataset[col].apply(clean_text)
                    dataset[col] = dataset[col].apply(replace_abbreviations)
            
            for _, row in dataset.iterrows():
                score = calculate_score(query_name, dataset, row, debug=(row.name < 3))
                all_results.append([
                    query_name, row.iloc[0], row.iloc[14], row.iloc[1], 
                    row.iloc[4], row.iloc[2], row.iloc[19], score
                ])

    if all_results:
        results_df = pd.DataFrame(all_results, columns=[
            "Query", "LOINC Code", "Name", "Component", "System", 
            "Property", "Measurement", "Score"
        ])

        min_score, max_score = results_df["Score"].min(), results_df["Score"].max()
        results_df["Normalized_Score"] = results_df["Score"].apply(
            lambda s: (s - min_score) / (max_score - min_score) if max_score != min_score else 1.0
        )
        
        results_df.drop(columns=["Score"], inplace=True)
        
        results_df = results_df.groupby("Query", group_keys=False).apply(
            lambda group: group.drop_duplicates(subset="LOINC Code", keep="first")
        )

        results_df.to_csv(output_filename, mode='a', index=False, header=not os.path.exists(output_filename))
        print(f"Results saved to {output_filename}")

process_folder(file_path)

Processing query: glucose in blood
Reading file: ./LOINC_Dataset/bilirubin_in_plasma.csv
{'query': 'glucose in blood', 'embedding_score': [{'score': np.float32(44.857346)}], 'traditional_score': [{'score': 0}], 'final_score': np.float32(44.857346)}
{'query': 'glucose in blood', 'embedding_score': [{'score': np.float32(43.959023)}], 'traditional_score': [{'score': 0}], 'final_score': np.float32(43.959023)}
{'query': 'glucose in blood', 'embedding_score': [{'score': np.float32(44.26867)}], 'traditional_score': [{'score': 0}], 'final_score': np.float32(44.26867)}
Reading file: ./LOINC_Dataset/cells_in_urine.csv
{'query': 'glucose in blood', 'embedding_score': [{'score': np.float32(42.057995)}], 'traditional_score': [{'score': 0}], 'final_score': np.float32(42.057995)}
{'query': 'glucose in blood', 'embedding_score': [{'score': np.float32(41.484337)}], 'traditional_score': [{'score': 0}], 'final_score': np.float32(41.484337)}
{'query': 'glucose in blood', 'embedding_score': [{'score': np.f