In [19]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load the Excel file
file_path = "C:/Users/bilk7/OneDrive/Bureau/Cours/M1/Information Retrieval/Project3/Lab-MLRankingAssignment/loinc_dataset-v2.xlsx"
# Skip first 2 rows 
df = pd.read_excel(file_path, skiprows=2)

print(df.head())

  loinc_num                                   long_common_name  \
0    1988-5  C reactive protein [Mass/volume] in Serum or P...   
1    1959-6                Bicarbonate [Moles/volume] in Blood   
2   10331-7                                 Rh [Type] in Blood   
3   18998-5     Trimethoprim+Sulfamethoxazole [Susceptibility]   
4    1975-2   Bilirubin.total [Mass/volume] in Serum or Plasma   

                       component    system property  
0             C reactive protein  Ser/Plas     MCnc  
1                    Bicarbonate       Bld     SCnc  
2                             Rh       Bld     Type  
3  Trimethoprim+Sulfamethoxazole   Isolate     Susc  
4                      Bilirubin  Ser/Plas     MCnc  


In [20]:
# Select only relevant columns
df = df[["long_common_name", "component","system","property"]]

df.rename(columns={"long_common_name": "name"}, inplace=True)

# Extract measurement type from brackets [] to create a new column
df["measurement_type"] = df["name"].apply(lambda x: re.findall(r"\[(.*?)\]", x)[0] if "[" in x else "")

# Remove measurement type from name
df["name"] = df["name"].apply(lambda x: re.sub(r"\[.*?\]", "", x).strip() if isinstance(x, str) else x)

# Display first few rows
print(df.head())

                                     name                      component  \
0  C reactive protein  in Serum or Plasma             C reactive protein   
1                   Bicarbonate  in Blood                    Bicarbonate   
2                            Rh  in Blood                             Rh   
3           Trimethoprim+Sulfamethoxazole  Trimethoprim+Sulfamethoxazole   
4     Bilirubin.total  in Serum or Plasma                      Bilirubin   

     system property measurement_type  
0  Ser/Plas     MCnc      Mass/volume  
1       Bld     SCnc     Moles/volume  
2       Bld     Type             Type  
3   Isolate     Susc   Susceptibility  
4  Ser/Plas     MCnc      Mass/volume  


In [21]:
# Abbreviation mapping dictionary
abbreviation_mapping = {
    'c': 'component',
    'mcnc': 'mass concentration',
    'bld': 'blood',
    'scnc': 'substance concentration',
    'susc': 'susceptibility',
    'acnc': 'amount concentration',
    'plas': 'plasma',
    'ccnc': 'cell concentration',
    'ncnc': 'number concentration',
    'XXX': 'unknown',
    '^bpu': 'body part or unit',
    'fld': 'field',
    'abo': 'abo blood group',
    'ser': 'serum',
    'mscnc': 'mass substance concentration'
}

# Download necessary resources
nltk.download("stopwords")
nltk.download("wordnet")

# Initialize tools
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bilk7\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bilk7\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [22]:
# Standardization and Cleaning
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()  # Lowercase
        text = re.sub(r'[^\w\s]', ' ', text)  # Replace any punctuation with space
        words = text.split()  # Tokenize
        words = [word for word in words if word not in stop_words]  # Remove stopwords
        words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatization
        return " ".join(words)
    return ""

# Function to replace abbreviations
def replace_abbreviations(text):
    if isinstance(text, str):
        words = text.split()
        words = [abbreviation_mapping.get(word, word) for word in words]  # Replace if in dictionary
        return " ".join(words)
    return text

# Apply cleaning to text columns
for col in df.select_dtypes(include=["object"]).columns:
    df[col] = df[col].apply(clean_text)
    df[col] = df[col].apply(replace_abbreviations)

print(df.head(10))

                                       name                      component  \
0   component reactive protein serum plasma     component reactive protein   
1                         bicarbonate blood                    bicarbonate   
2                                  rh blood                             rh   
3             trimethoprim sulfamethoxazole  trimethoprim sulfamethoxazole   
4              bilirubin total serum plasma                      bilirubin   
5  blood group antibody screen serum plasma    blood group antibody screen   
6                carbon dioxide total blood                 carbon dioxide   
7                             ciprofloxacin                  ciprofloxacin   
8                     cortisol serum plasma                       cortisol   
9                     chloride serum plasma                       chloride   

         system                 property measurement_type  
0  serum plasma       mass concentration      mass volume  
1         blood  subs