In [67]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Preview the first rows of the working DataFrame.
# NOTE(review): `df` is not created in any cell visible here — confirm it is
# loaded earlier so this cell survives Restart Kernel -> Run All.
print(df.head())

  loinc_num                                     name  \
0    1988-5  component reactive protein serum plasma   
1    1959-6                        bicarbonate blood   
2   10331-7                                 rh blood   
3   18998-5            trimethoprim sulfamethoxazole   
4    1975-2             bilirubin total serum plasma   

                       component        system                 property  \
0     component reactive protein  serum plasma       mass concentration   
1                    bicarbonate         blood  substance concentration   
2                             rh         blood                     type   
3  trimethoprim sulfamethoxazole       isolate           susceptibility   
4                      bilirubin  serum plasma       mass concentration   

  measurement_type  
0                   
1                   
2                   
3                   
4                   


# Data Cleaning: Renaming and Extracting Measurement Type

This script performs two data cleaning operations:

1. **Renaming Columns**  
2. **Extracting and Removing Measurement Types from Names**


In [68]:
df.rename(columns={"long_common_name": "name"}, inplace=True)


def _first_bracketed(value):
    """Return the text inside the first ``[...]`` pair of ``value``.

    Returns "" when ``value`` is not a string (e.g. NaN) or contains no
    complete ``[...]`` pair, instead of raising TypeError / IndexError.
    """
    if not isinstance(value, str):
        return ""
    match = re.search(r"\[(.*?)\]", value)
    return match.group(1) if match else ""


# Pull the bracketed text out of each name into its own column.
_extracted_types = df["name"].apply(_first_bracketed)
if "measurement_type" in df.columns:
    # Re-running this cell after the brackets were already stripped from
    # "name" would otherwise overwrite every previously extracted value with "".
    _extracted_types = _extracted_types.where(
        _extracted_types != "", df["measurement_type"].fillna("")
    )
df["measurement_type"] = _extracted_types

# Remove the bracketed part from the name itself; non-string values are left untouched.
df["name"] = df["name"].apply(lambda x: re.sub(r"\[.*?\]", "", x).strip() if isinstance(x, str) else x)

print(df.head())

  loinc_num                                     name  \
0    1988-5  component reactive protein serum plasma   
1    1959-6                        bicarbonate blood   
2   10331-7                                 rh blood   
3   18998-5            trimethoprim sulfamethoxazole   
4    1975-2             bilirubin total serum plasma   

                       component        system                 property  \
0     component reactive protein  serum plasma       mass concentration   
1                    bicarbonate         blood  substance concentration   
2                             rh         blood                     type   
3  trimethoprim sulfamethoxazole       isolate           susceptibility   
4                      bilirubin  serum plasma       mass concentration   

  measurement_type  
0                   
1                   
2                   
3                   
4                   


# Abbreviation Mapping, Stop Words, and Lemmatization

This script performs **text preprocessing** by:
- Expanding **abbreviations** into full terms using the dictionary `abbreviation_mapping`.
- Removing **common stop words**.
- Applying **lemmatization** to reduce words to their base forms.

In [69]:
# Abbreviation -> expansion table used by replace_abbreviations().
# Lookups are exact, whole-token matches. The cleaning loop runs clean_text()
# first, which lowercases the text and replaces punctuation with spaces, so a
# token can only match a key that is lowercase and punctuation-free.
# NOTE(review): some expansions look questionable for LOINC data (e.g. 'c' also
# rewrites the "C" of "C reactive protein", and 'fld' may be meant as "fluid")
# — confirm against the LOINC abbreviation list before relying on them.
abbreviation_mapping = {
    'c': 'component',
    'mcnc': 'mass concentration',
    'bld': 'blood',
    'scnc': 'substance concentration',
    'susc': 'susceptibility',
    'acnc': 'amount concentration',
    'plas': 'plasma',
    'ccnc': 'cell concentration',
    'ncnc': 'number concentration',
    'XXX': 'unknown',
    'xxx': 'unknown',  # form actually seen after clean_text() lowercases the token
    '^bpu': 'body part or unit',
    'bpu': 'body part or unit',  # form actually seen after clean_text() strips the '^'
    'fld': 'field',
    'abo': 'abo blood group',
    'ser': 'serum',
    'mscnc': 'mass substance concentration'
}

# Make sure the NLTK resources needed for stop-word removal and lemmatization are available.
for _nltk_resource in ("stopwords", "wordnet"):
    nltk.download(_nltk_resource)

# Shared helpers used by clean_text(): English stop-word set and a WordNet lemmatizer.
stop_words = {*stopwords.words("english")}
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joseantonioruizheredia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/joseantonioruizheredia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Text Preprocessing in a DataFrame

This script performs text cleaning and abbreviation replacement on text columns in a DataFrame (`df`). It helps standardize text data for further analysis by removing noise and ensuring consistency.

## Functions

### 1. clean_text
Cleans a given text string by:
1. Converting it to **lowercase**.
2. Removing **punctuation** (replacing every character that is not a letter, digit, underscore, or whitespace with a space).
3. **Tokenizing** (splitting the text into words).
4. Removing **stop words** (common words that don't contribute much meaning).
5. Applying **lemmatization** (reducing words to their base form).

If the input is not a string, it returns an empty string.

### 2. replace_abbreviations
Replaces known abbreviations in a given text using the `abbreviation_mapping` dictionary.
- Splits the text into words.
- Replaces each word if it exists in the abbreviation dictionary.
- Returns the modified text.


In [70]:
def clean_text(text):
    """Normalize a text value for matching.

    The text is lowercased, every character that is neither a word character
    nor whitespace is replaced by a space, the result is split on whitespace,
    tokens found in ``stop_words`` are dropped, and the remaining tokens are
    lemmatized with ``lemmatizer`` and re-joined with single spaces.

    Returns "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    tokens = re.sub(r'[^\w\s]', ' ', text.lower()).split()
    kept = (token for token in tokens if token not in stop_words)
    return " ".join(lemmatizer.lemmatize(token) for token in kept)


def replace_abbreviations(text):
    """Expand known abbreviations in ``text``.

    The text is split on whitespace and every token that is a key of
    ``abbreviation_mapping`` is replaced by its expansion; other tokens are
    kept as they are. Tokens are re-joined with single spaces.

    Non-string input is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    expanded = []
    for token in text.split():
        expanded.append(abbreviation_mapping.get(token, token))
    return " ".join(expanded)


# Normalize every object-typed column except the LOINC identifier:
# clean the text first, then expand abbreviations on the cleaned tokens.
text_columns = [name for name in df.select_dtypes(include=["object"]).columns if name != "loinc_num"]
for col in text_columns:
    df[col] = df[col].apply(clean_text).apply(replace_abbreviations)

print(df.head(10))

  loinc_num                                      name  \
0    1988-5   component reactive protein serum plasma   
1    1959-6                         bicarbonate blood   
2   10331-7                                  rh blood   
3   18998-5             trimethoprim sulfamethoxazole   
4    1975-2              bilirubin total serum plasma   
5     890-4  blood group antibody screen serum plasma   
6   20565-8                carbon dioxide total blood   
7   18906-8                             ciprofloxacin   
8    2143-6                     cortisol serum plasma   
9    2075-0                     chloride serum plasma   

                       component        system                 property  \
0     component reactive protein  serum plasma       mass concentration   
1                    bicarbonate         blood  substance concentration   
2                             rh         blood                     type   
3  trimethoprim sulfamethoxazole       isolate           susceptibility 

# Column Weights and Embedding Model Initialization

This script defines **column weights** for a scoring system and initializes an **embedding model** for text similarity calculations.

## Column Weights

The `column_weights` dictionary assigns importance to different columns when calculating scores:

- **Higher weights** (e.g., `name`, `component`) indicate greater importance in the scoring process.
- `loinc_num` has a weight of **0** because it is an identifier and should not contribute to similarity calculations.

## Embedding Model Initialization

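The script attempts to load an embedding model for text similarity using **SentenceTransformer**: it first tries `pritamdeka/BioBERT-MNLI`, falls back to `all-MiniLM-L6-v2`, and sets `embedding_model` to `None` if neither can be loaded.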


In [71]:
# Per-column multipliers used by the scoring functions; columns that are not
# listed fall back to 1.0 at lookup time. A weight of 0 removes a column's
# contribution. 'long_common_name' is kept although that column is renamed
# to 'name' earlier in the notebook.
column_weights = dict(
    name=1.5,
    component=4.0,
    long_common_name=1.0,
    system=3.0,
    property=1.0,
    measurement_type=1.0,
    loinc_num=0,
)

# Load the sentence-embedding model once per kernel session.
# Candidates are tried in order; the first one that loads wins. If none can be
# loaded, embedding_model is set to None and the scoring functions skip the
# embedding-based part. A previous failed attempt (None) is retried when this
# cell is re-run.
if globals().get('embedding_model') is None:
    _loaded_model = None
    for _model_name in ('pritamdeka/BioBERT-MNLI', 'all-MiniLM-L6-v2'):
        try:
            _loaded_model = SentenceTransformer(_model_name)
            break
        except Exception as e:
            # Report why this candidate failed instead of silently swallowing it;
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            print(f"Could not load embedding model '{_model_name}': {e}")
    if _loaded_model is None:
        print("No embedding model could be loaded; embedding-based scores will be skipped.")
    embedding_model = _loaded_model

# Relevance Score Calculation
This script calculates a **relevance score** for each row in a dataset by comparing a query to the dataset's text fields using both **traditional keyword matching** and **semantic similarity via embeddings**.


### 1. calculate_score
This function calculates the relevance score for a given row by:
1. Splitting the query into words and storing them in a set.
2. Initializing an empty dictionary to track matched words.
3. Computing the **traditional** relevance score.
4. Computing the **embedding-based** relevance score.
5. Combining both scores and optionally printing debug information.


### 2. get_query_embedding
Encodes the query into an **embedding vector** using a pre-trained embedding model.
- If the embedding model is available, it encodes the query.
- If an error occurs, it prints an error message and returns `None`.

### 3. calculate_traditional_score
This function calculates a **keyword matching score** based on:
- The presence of query words in text fields of the row.
- A predefined weight assigned to each column.
- Only the highest weight for each word is considered.

### 4. calculate_embedding_score
This function computes the **semantic similarity score** between the query embedding and the row's text fields:
- Encodes the text field into an embedding.
- Uses **cosine similarity** to measure similarity between the query and field.
- Converts the similarity score (ranging from -1 to 1) into a normalized range (0 to 5).
- Applies column weights to adjust the score.
- Adds the result to the final score.

## Execution Process
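1. For each sheet of the Excel workbook, `preprocess` uses the lower-cased sheet name as the query and calculates the **relevance score** of every row with `calculate_score`.
2. For rows whose index is below 5, debugging is enabled to print detailed matching information.
3. The scores are collected into `results_df` (query, LOINC code, name, score) and min-max normalized into `Normalized_Score`.
4. The final results are saved to `dataset_with_scores.csv` and a confirmation message is printed.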



In [None]:
def calculate_score(query, row, debug=False):
    """Return the combined relevance score of ``row`` for ``query``.

    The score is the sum of the keyword-based score from
    ``calculate_traditional_score`` and the embedding-based score from
    ``calculate_embedding_score``.

    Args:
        query: Query string; it is lowercased and split on whitespace for
            keyword matching and passed as-is to ``get_query_embedding``.
        row: Row to score (forwarded unchanged to both scoring helpers).
        debug: When truthy, print a dict with the query, the matched words,
            the per-column embedding scores and the final score.

    Returns:
        The sum of the two partial scores.
    """
    tokens = set(query.lower().split())
    word_weights = {}

    query_vector = get_query_embedding(query)
    trace = {"query": query, "matched_words": {}, "embedding_scores": []}

    keyword_part = calculate_traditional_score(tokens, row, word_weights, trace)
    semantic_part = calculate_embedding_score(query_vector, row, trace)
    total = keyword_part + semantic_part

    trace["matched_words"] = word_weights
    trace["final_score"] = total

    if debug:
        print(trace)

    return total

def get_query_embedding(query):
    """Encode the lowercased ``query`` with the global ``embedding_model``.

    Returns:
        Whatever ``embedding_model.encode`` returns, or ``None`` when no model
        is available or encoding raises (the error is printed in that case).
    """
    if not embedding_model:
        return None

    try:
        vector = embedding_model.encode(query.lower())
    except Exception as e:
        print(f"Embedding encoding error: {e}")
        return None
    return vector

def calculate_traditional_score(query_words, row, matched_words, debug_info):
    """Compute the keyword-matching score of ``row`` for a set of query words.

    For every object-typed column of the global ``df`` that is present and
    non-null in ``row``, the cell is lowercased and split on whitespace. Each
    query word found in the cell is recorded in ``matched_words`` together with
    the column's weight (``column_weights``, default 1.0); when a word matches
    in several columns only the highest weight is kept.

    Args:
        query_words: Set of lowercase query tokens.
        row: Mapping/Series from column name to cell value.
        matched_words: Dict updated in place with ``word -> best weight``.
        debug_info: Unused here; accepted for a uniform helper signature.

    Returns:
        The sum of the best weights of all words in ``matched_words``
        (0 when nothing matched). Previously the function always returned 0
        because the weights were recorded but never added up.
    """
    for col in df.select_dtypes(include=["object"]).columns:
        if col in row and pd.notna(row[col]):
            cell_words = set(str(row[col]).lower().split())
            weight = column_weights.get(col, 1.0)

            for word in query_words & cell_words:
                # Keep only the highest column weight seen for each word.
                if word not in matched_words or weight > matched_words[word]:
                    matched_words[word] = weight

    # Each matched word contributes its best weight exactly once.
    return sum(matched_words.values())

def calculate_embedding_score(query_embedding, row, debug_info):
    """Compute the embedding-similarity score of ``row`` for a query embedding.

    For every object-typed column of the global ``df`` that is present and
    non-null in ``row``, the lowercased cell text is encoded with
    ``embedding_model`` and compared to ``query_embedding`` by cosine
    similarity. The similarity is rescaled with ``((sim + 1) / 2) * 5`` and
    multiplied by the column's weight (``column_weights``, default 1.0).
    One entry per scored column is appended to ``debug_info["embedding_scores"]``.

    Returns:
        The sum of the per-column scores; 0 when no model is available or
        ``query_embedding`` is None. Errors for a single column are printed
        and that column is skipped.
    """
    total = 0
    if not embedding_model or query_embedding is None:
        return total

    for col in df.select_dtypes(include=["object"]).columns:
        if col not in row or not pd.notna(row[col]):
            continue

        text = str(row[col]).lower()
        col_weight = column_weights.get(col, 1.0)
        try:
            text_vector = embedding_model.encode(text)
            similarity = cosine_similarity([query_embedding], [text_vector])[0][0]
            col_score = ((similarity + 1) / 2) * 5 * col_weight
            total += col_score
            debug_info["embedding_scores"].append(
                {"column": col, "similarity": similarity, "score": col_score}
            )
        except Exception as e:
            print(f"Embedding similarity error: {e}")
    return total


In [None]:
def preprocess(excel_file, output_csv="dataset_with_scores.csv"):
    """
    Score every row of every sheet in an Excel workbook and save the result.

    Each sheet's lower-cased name is used as the query. Sheets are parsed with
    the header on the third row (``header=2``); sheets with fewer than four
    columns are skipped. Every row is scored with ``calculate_score`` (debug
    output is enabled for rows whose index label is below 5). The scores are
    min-max normalized over all sheets and the table is written to CSV.

    Args:
        excel_file: Path (or file-like object) of the workbook to read.
        output_csv: Path of the CSV file to write.

    Returns:
        DataFrame with the columns ``Query``, ``LOINC Code``, ``Name``,
        ``Score`` and ``Normalized_Score``.
    """
    results = []

    # The context manager closes the workbook even if parsing or scoring fails.
    with pd.ExcelFile(excel_file) as xl:
        for sheet_name in xl.sheet_names:
            query_df = xl.parse(sheet_name, header=2)
            query = sheet_name.lower()

            query_df.columns = query_df.columns.str.strip()  # Remove extra spaces from column names

            if len(query_df.columns) < 4:
                print(f"Skipping sheet '{query}': Missing required columns")
                continue

            for _, row in query_df.iterrows():
                score = calculate_score(query, row, debug=row.name < 5)
                # The first two columns are taken as the LOINC code and the name.
                results.append([query, row.iloc[0], row.iloc[1], score])

    results_df = pd.DataFrame(results, columns=["Query", "LOINC Code", "Name", "Score"])

    min_score = results_df["Score"].min()
    max_score = results_df["Score"].max()

    if max_score != min_score:
        results_df["Normalized_Score"] = (results_df["Score"] - min_score) / (max_score - min_score)
    else:
        # All scores are identical: avoid dividing by zero.
        results_df["Normalized_Score"] = 1.0

    results_df.to_csv(output_csv, index=False)
    print(f"CSV with relevance scores has been saved as '{output_csv}'.")

    # Callers assign the result of preprocess(); previously this was always None.
    return results_df



# Run the scoring pipeline on the workbook that sits next to the notebook.
file_path = "./loinc_dataset-v2.xlsx"
results_df = preprocess(excel_file=file_path)

{'query': 'glucose in blood', 'matched_words': {}, 'embedding_scores': [{'column': 'loinc_num', 'similarity': np.float32(0.082764), 'score': np.float32(0.0)}, {'column': 'component', 'similarity': np.float32(0.0109481625), 'score': np.float32(10.109482)}, {'column': 'system', 'similarity': np.float32(0.026590478), 'score': np.float32(7.6994286)}, {'column': 'property', 'similarity': np.float32(-0.10000585), 'score': np.float32(2.2499852)}], 'final_score': np.float32(20.058895)}
{'query': 'glucose in blood', 'matched_words': {}, 'embedding_scores': [{'column': 'loinc_num', 'similarity': np.float32(0.1021516), 'score': np.float32(0.0)}, {'column': 'component', 'similarity': np.float32(0.14276442), 'score': np.float32(11.427645)}, {'column': 'system', 'similarity': np.float32(-0.00083919987), 'score': np.float32(7.4937067)}, {'column': 'property', 'similarity': np.float32(-0.043209463), 'score': np.float32(2.3919764)}], 'final_score': np.float32(21.313328)}
{'query': 'glucose in blood', '