# 🧬 02 - Ontology Mapping of Intervention Names

In [2]:
import pandas as pd
from pathlib import Path
from fuzzywuzzy import process

# Locate the cleaned trial data relative to the directory the notebook is run from.
BASE_DIR = Path.cwd()
DATA_PATH = BASE_DIR.joinpath("data", "clinical_trials_cleaned.csv")

# Stop early with an explicit message if the input file is not where we expect it.
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Data file not found at {DATA_PATH}")

# Load the table and preview the first rows as the cell output.
df = pd.read_csv(DATA_PATH)
df.head()



Unnamed: 0,study_id,intervention_name,condition,sponsor,start_date,end_date,phase,status,completeness_score,study_duration_days
0,NCT100000,Aspirin,Covid-19,Sanofi,2022-09-20,2025-04-07,Phase 2,Withdrawn,1.0,930.0
1,NCT100001,Trastuzumab,Alzheimer'S Disease,Pfizer,2023-07-01,2024-09-05,,Terminated,0.875,432.0
2,NCT100002,Donepezil,Hypertension,Pfizer,2021-08-17,2025-05-24,Phase 1,Completed,1.0,1376.0
3,NCT100003,Remdesivir,Breast Cancer,Sanofi,2022-04-15,2025-05-20,Phase 1,Terminated,1.0,1131.0
4,NCT100004,Aspirin,Breast Cancer,Pfizer,2023-09-10,2024-08-28,Phase 4,Completed,1.0,353.0


## 📚 Define Standard Vocabulary

In [3]:
# Hard-coded stand-in vocabulary for this notebook; in a real pipeline this list
# would be loaded from a standard source such as DrugBank.
vocabulary = [
    'Metformin',
    'Aspirin',
    'Trastuzumab',
    'Donepezil',
    'Salbutamol',
    'Remdesivir',
]

## 🤖 Fuzzy Matching Function

In [5]:
def fuzzy_map(term, vocab_list, threshold=90):
    """Map a free-text term to its closest entry in a vocabulary list.

    Parameters
    ----------
    term : str or missing value
        Raw name to map. Missing values (None/NaN) and blank strings are
        reported as ``'missing'``. Other non-string values are converted
        with ``str`` before matching.
    vocab_list : list of str
        Candidate terms passed to ``process.extractOne``.
    threshold : int, default 90
        Minimum score for a match to be labelled ``'mapped'``.

    Returns
    -------
    pandas.Series
        Three positional values ``[match, score, status]`` where ``status`` is
        ``'missing'``, ``'mapped'`` or ``'low_score'``.
    """
    if pd.isna(term):
        return pd.Series([None, 0, 'missing'])

    # Coerce only after the missing check so NaN is never turned into the text "nan";
    # this also keeps .strip() from failing on numeric or other non-string values.
    if not isinstance(term, str):
        term = str(term)
    if term.strip() == '':
        return pd.Series([None, 0, 'missing'])

    best = process.extractOne(term, vocab_list)
    if best is None:
        # extractOne returns None when it has no candidate to offer (e.g. an empty
        # vocabulary); report that as an unmapped row instead of failing to unpack.
        return pd.Series([None, 0, 'low_score'])

    match, score = best
    status = 'mapped' if score >= threshold else 'low_score'
    return pd.Series([match, score, status])

## 🔄 Apply Mapping to Intervention Names

In [6]:
# Map every intervention name onto the standard vocabulary.
# Missing values are passed through unchanged: wrapping NaN in str() would produce the
# literal text "nan", which slips past fuzzy_map's missing-value check and gets
# fuzzy-matched against the vocabulary instead of being flagged as 'missing'.
df[['mapped_term', 'match_score', 'mapping_status']] = df['intervention_name'].apply(
    lambda name: fuzzy_map(name if pd.isna(name) else str(name), vocabulary)
)

# Preview the original names next to their mapping results.
df[['intervention_name', 'mapped_term', 'match_score', 'mapping_status']].head(10)

Unnamed: 0,intervention_name,mapped_term,match_score,mapping_status
0,Aspirin,Aspirin,100,mapped
1,Trastuzumab,Trastuzumab,100,mapped
2,Donepezil,Donepezil,100,mapped
3,Remdesivir,Remdesivir,100,mapped
4,Aspirin,Aspirin,100,mapped
5,Donepezil,Donepezil,100,mapped
6,Salbutamol,Salbutamol,100,mapped
7,Metformin,Metformin,100,mapped
8,Remdesivir,Remdesivir,100,mapped
9,Salbutamol,Salbutamol,100,mapped


## 💾 Save Mapping Results

In [8]:
# Write the mapped table next to the input data file.
output_path = DATA_PATH.parent / "mapping_results.csv"
df.to_csv(output_path, index=False)

# Report the path that was actually written; the previous message named an
# "outputs/" directory that this cell never writes to.
print(f"Mapping results saved to {output_path}")

Mapping results saved to outputs/mapping_results.csv
