In [153]:
# Libs
import pandas as pd
import random
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch

In [154]:
# General
# Seed every random number generator the notebook imports (stdlib random,
# NumPy and PyTorch) so that a Restart & Run All gives repeatable results.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Loading Data

In [155]:
# Load the pre-filtered "DW" table from its feather file and preview the
# first rows (DW presumably stands for disability weights — TODO confirm).
df_dw = pd.read_feather("../Data/DW/df_dw_filtered.feather")
print(df_dw.head())

                                         Sequela Name  \
0   HIV/AIDS -  Drug-susceptible Tuberculosis with...   
12                     Symptomatic HIV without anemia   
13                                AIDS without anemia   
14                           Early HIV without anemia   
24  HIV/AIDS with antiretroviral treatment without...   

                                    Health State Name  \
0                          Tuberculosis, HIV infected   
12                   HIV cases, symptomatic, pre-AIDS   
13            AIDS cases, not receiving ARV treatment   
14  Generic uncomplicated disease: anxiety about d...   
24            HIV/AIDS cases, receiving ARV treatment   

                         Health State Lay Description      Mean     Lower  \
0   has a persistent cough and fever, shortness of...  0.408459  0.273579   
12  has weight loss, fatigue, and frequent infecti...  0.274479  0.184359   
13  has severe weight loss, weakness, fatigue, cou...  0.581590  0.405544   
14  ha

In [156]:
# Load the cleaned "dta" table from its feather file and preview the first rows.
df_dta = pd.read_feather('../Data/Cleaned/df_dta_cleaned.feather')
print(df_dta.head())

   ID_dta        fDISEASE fDIMENSION fDOMAIN          fCRITERION fINDICATOR  \
0       1  Crohns Disease   Patients  Health  Impact on autonomy   Mobility   
1       2  Crohns Disease   Patients  Health  Impact on autonomy   Mobility   
2       3  Crohns Disease   Patients  Health  Impact on autonomy   Mobility   
3       4  Crohns Disease   Patients  Health  Impact on autonomy  Self-care   
4       5  Crohns Disease   Patients  Health  Impact on autonomy  Self-care   

  fDISEASE_SUBTYPE fDISEASE_PROXY fCOUNTRY_PROXY    YEAR  ... VALUE_char  SD  \
0              NaN            NaN            NaN  2023.0  ...       None NaN   
1              NaN            NaN            NaN  2023.0  ...       None NaN   
2              NaN            NaN            NaN  2023.0  ...       None NaN   
3              NaN            NaN            NaN  2023.0  ...       None NaN   
4              NaN            NaN            NaN  2023.0  ...       None NaN   

  LOWER CI UPPER CI       SOURCE KEY_POINT  

# Transforming Data

In [157]:
# Shorten the long psychological-symptoms label so that both symptom-burden
# indicators follow the same "Experienced burden of ... symptoms" wording.
# This overwrites df_dta['fINDICATOR'] in place.
# NOTE(review): the recorded run emitted a warning pointing at this
# statement; the message text is not visible here — confirm and address it.
df_dta['fINDICATOR'] = df_dta['fINDICATOR'].replace(
    'Experienced burden of psychological symptoms related to the health issue',
    'Experienced burden of psychological symptoms'
)

# Keep only the rows for the two symptom-burden indicators.
# .copy() makes this an independent frame (same convention as df_dw_filtered),
# so later edits to it cannot trigger chained-assignment warnings.
df_dta_disabilities = df_dta[
    df_dta['fINDICATOR'].isin([
        'Experienced burden of physical symptoms',
        'Experienced burden of psychological symptoms'
    ])
].copy()

  df_dta['fINDICATOR'] = df_dta['fINDICATOR'].replace(


In [193]:
# Number of rows in the symptom-burden subset.
print(df_dta_disabilities['CATEGORIES'].shape[0])

50


In [158]:
# Show the category label of every row in the symptom-burden subset.
print(df_dta_disabilities.loc[:, 'CATEGORIES'])

21                                             Anal pain
22                     Blood loss during bowel movements
23                                          Constipation
24                             Continuous abdominal pain
25                                                Cramps
26                                             Diarrhoea
27                               Difficult wound healing
28                    Dry, painful or itchy skin or rash
29                         Elevated temperature or fever
30                                 Fatigue or exhaustion
31                                              Headache
32                            Joint pain or inflammation
33                                        Lack of energy
34                                      Loss of appetite
35                                    Nausea or vomiting
36         Pain in the mouth due to, for example, ulcers
37                                 Urgency or stool urge
38                           Vi

# Disability Matching

In [159]:
# Sentence-embedding model used by the similarity-matching cells in this
# notebook; 'all-mpnet-base-v2' is the model name passed to SentenceTransformer.
model = SentenceTransformer('all-mpnet-base-v2')

## Testing

In [160]:
# Smoke test: score one short query against a hand-written list of labels.
input_text = "fever"

# Your disability labels (simplified example, expand with your full list)
disability_labels = ["Elevated temperature or fever"]

# Embed the query and the candidate labels as tensors.
input_embedding = model.encode(input_text, convert_to_tensor=True)
label_embeddings = model.encode(disability_labels, convert_to_tensor=True)

# First (and only) row of the similarity matrix: query vs. each label.
cosine_scores = util.cos_sim(input_embedding, label_embeddings)[0]

# Pair each label with its score, then order from most to least similar.
results = list(zip(disability_labels, cosine_scores.tolist()))
results.sort(key=lambda pair: pair[1], reverse=True)

print("Top matches:")
for label, score in results[:5]:
    print(f"{label}: {score:.3f}")

Top matches:
Elevated temperature or fever: 0.660


In [161]:
# Embed every full lay description once, up front.
texts = df_dw['Health State Lay Description'].tolist()
text_embeddings = model.encode(texts, convert_to_tensor=True)
categories = df_dta_disabilities['CATEGORIES'].unique()

# For each distinct category, report the single most similar description.
for category in categories:
    category_embedding = model.encode(category, convert_to_tensor=True)

    # Similarity of this category against all descriptions (first row of the matrix).
    cosine_scores = util.cos_sim(category_embedding, text_embeddings)[0]
    best_idx = int(cosine_scores.argmax())

    print(f"\nCategory: {category}")
    print(f"Best Match: {texts[best_idx]}")
    print(f"Similarity Score: {cosine_scores[best_idx].item():.4f}")



Category: Anal pain
Best Match: has cramping pain and a bloated feeling in the belly.
Similarity Score: 0.3867

Category: Blood loss during bowel movements
Best Match: has diarrhea three or more times a day, with painful cramps in the belly and feeling thirsty
Similarity Score: 0.4177

Category: Constipation
Best Match: cannot control urinating.
Similarity Score: 0.5527

Category: Continuous abdominal pain
Best Match: has cramping pain and a bloated feeling in the belly.
Similarity Score: 0.5483

Category: Cramps
Best Match: has cramping pains in the legs after walking a medium distance. the pain goes away after a short rest.
Similarity Score: 0.3840

Category: Diarrhoea
Best Match: vomits blood and feels nauseous.
Similarity Score: 0.6610

Category: Difficult wound healing
Best Match: has swollen legs with hard and thick skin, which causes difficulty in moving around.
Similarity Score: 0.3464

Category: Dry, painful or itchy skin or rash
Best Match: has a blistering skin rash that ca

In [162]:
# Flatten the per-row symptom lists into one list, and remember for each
# flattened symptom the positional index of the row it came from.
split_symptoms_all = [
    symptom
    for symptoms_list in df_dw['Split Symptoms']
    for symptom in symptoms_list
]
original_indices = [
    idx
    for idx, symptoms_list in enumerate(df_dw['Split Symptoms'])
    for _ in symptoms_list
]

# Encode all split symptoms once
split_symptom_embeddings = model.encode(split_symptoms_all, convert_to_tensor=True)

texts = df_dw['Health State Lay Description'].tolist()
categories = df_dta_disabilities['CATEGORIES'].unique()

# For each distinct category, report the closest individual symptom and the
# full description that symptom was split from.
for category in categories:
    category_embedding = model.encode(category, convert_to_tensor=True)

    # Similarity of this category against every flattened symptom.
    cosine_scores = util.cos_sim(category_embedding, split_symptom_embeddings)[0]
    best_idx = int(cosine_scores.argmax())
    similarity = cosine_scores[best_idx].item()

    matched_symptom = split_symptoms_all[best_idx]
    original_idx = original_indices[best_idx]
    original_text = texts[original_idx]

    print(f"\n🔹 Category: {category}")
    print(f"   Best Matching Symptom: {matched_symptom}")
    print(f"   From Original Description: {original_text}")
    print(f"   Similarity Score: {similarity:.4f}")



🔹 Category: Anal pain
   Best Matching Symptom: pain
   From Original Description: has a high fever and pain, and feels very weak, which causes great difficulty with daily activities.
   Similarity Score: 0.6221

🔹 Category: Blood loss during bowel movements
   Best Matching Symptom: loss of urine or bowel control
   From Original Description: has sudden seizures one or more times each month, with violent muscle contractions and stiffness, loss of consciousness, and loss of urine or bowel control. between seizures the person has memory loss and difficulty concentrating.
   Similarity Score: 0.6445

🔹 Category: Constipation
   Best Matching Symptom: bowel control
   From Original Description: is paralyzed from the waist down, cannot feel or move the legs and has difficulties with urine and bowel control. the person uses a wheelchair to move around.
   Similarity Score: 0.6101

🔹 Category: Continuous abdominal pain
   Best Matching Symptom: has cramping abdominal pain
   From Original D

## Map all disabilities to rows in the GBD data

### Based on split symptoms

In [194]:
# Minimum cosine similarity (compared with >=) for a category and a split
# symptom to count as a match in the mapping cell.
# NOTE(review): the name is misspelled ("TRESHOLD"); it is kept as-is because
# the mapping cell references this exact name.
SIMILARITY_TRESHOLD = 0.7

In [195]:
# For every row of df_dw, collect the categories whose embedding reaches
# SIMILARITY_TRESHOLD (cosine similarity) with at least one of that row's
# split symptoms. Results are written back to df_dw as two list columns.
matched_ids_per_row = []
matched_cat_per_row = []

categories = df_dta_disabilities['CATEGORIES'].tolist()
category_ids = df_dta_disabilities['ID_dta'].tolist()
# ID -> category name lookup, built here so the cell does not depend on a
# variable left over from an earlier kernel session.
# assumes ID_dta is unique per row — TODO confirm
id_to_category = dict(zip(category_ids, categories))
category_embeddings = model.encode(categories, convert_to_tensor=True)

for symptoms_list in df_dw['Split Symptoms']:
    if len(symptoms_list) == 0:
        # Append to BOTH lists so they stay aligned with df_dw's rows;
        # otherwise the column assignments below fail on a length mismatch.
        matched_ids_per_row.append([])
        matched_cat_per_row.append([])
        continue

    symptom_embeddings = model.encode(symptoms_list, convert_to_tensor=True)
    # Rows index categories, columns index this row's symptoms.
    cosine_scores = util.cos_sim(category_embeddings, symptom_embeddings)
    # True for each category that reaches the threshold with any symptom.
    category_hit = (cosine_scores >= SIMILARITY_TRESHOLD).any(dim=1).tolist()

    # De-duplicate while keeping the order of df_dta_disabilities.
    matched_ids = list(dict.fromkeys(
        cat_id for cat_id, hit in zip(category_ids, category_hit) if hit
    ))

    matched_ids_per_row.append(matched_ids)
    matched_cat_per_row.append([id_to_category[cat_id] for cat_id in matched_ids])

df_dw['Matching Category IDs'] = matched_ids_per_row
df_dw['Matching Categories'] = matched_cat_per_row

In [196]:
# Transform the outcome data
# Keep only the rows that matched at least one category; copy so the result
# is independent of df_dw.
df_dw_filtered = df_dw.loc[df_dw['Matching Category IDs'].map(len).gt(0)].copy()

len(df_dw_filtered)

36

In [197]:
# Print the first matched rows without truncating long text columns.
# option_context restores the previous display.max_colwidth automatically,
# even if the print raises, instead of resetting it to a hard-coded value.
with pd.option_context('display.max_colwidth', None):
    print(df_dw_filtered[['Health State Lay Description', 'Mean', 'Matching Categories']].head())

                                                                                                  Health State Lay Description  \
0            has a persistent cough and fever, shortness of breath, night sweats, weakness and fatigue and severe weight loss.   
12                                                                          has weight loss, fatigue, and frequent infections.   
13             has severe weight loss, weakness, fatigue, cough and fever, and frequent infections, skin rashes and diarrhea.    
74                        has a high fever and pain, and feels very weak, which causes great difficulty with daily activities.   
91  has diarrhea three or more times a day with severe belly cramps. the person is very thirsty and feels nauseous and tired.    

        Mean  \
0   0.408459   
12  0.274479   
13  0.581590   
74  0.133412   
91  0.247096   

                                                                                                       Matching Categories

# Testing Mapping Accuracy

In [198]:
# Spot check: pick one matched row at random (by position) and show which
# categories its matched IDs correspond to.
i = random.randint(0, len(df_dw_filtered) - 1)
row = df_dw_filtered.iloc[i]
description, matched_ids = row['Health State Lay Description'], row['Matching Category IDs']

# Get category names for matched IDs
matched_categories = (
    df_dta_disabilities
    .loc[lambda frame: frame['ID_dta'].isin(matched_ids), 'CATEGORIES']
    .tolist()
)

print(f"\nRow {i}:")
print(f"Description: {description}")
print(f"Matched Categories: {matched_categories}")


Row 17:
Description: has chest pain that occurs with minimal physical activity, such as walking only a short distance. after a brief rest, the pain goes away. the person avoids most physical activities because of the pain.
Matched Categories: ['Chest pain']


# Map to fDISEASE

In [200]:
# Build one record per (disease, health-state row) pair in which the row
# matched at least one category ID belonging to that disease.
rows = []
# Map each disease to the set of ID_dta values recorded for it.
# NOTE(review): the recorded run emitted a warning pointing at this groupby
# line; the message text is not visible here — confirm and address it.
disease_to_ids = df_dta_disabilities.groupby('fDISEASE')['ID_dta'].apply(set).to_dict()

for disease, disease_ids in disease_to_ids.items():
    for idx, row in df_dw_filtered.iterrows():
        matched_ids = row['Matching Category IDs']
        if not matched_ids:
            continue

        # Restrict the row's matches to IDs that belong to this disease.
        filtered_ids = [id_ for id_ in matched_ids if id_ in disease_ids]

        if filtered_ids:
            # 'Matching Category IDs' used to appear twice in this literal;
            # the duplicate key was redundant and has been removed.
            rows.append({
                'Disease': disease,
                'Health State Lay Description': row['Health State Lay Description'],
                'Matching Category IDs': filtered_ids,
                'Mean' : row['Mean'],
                'Lower' : row['Lower'],
                'Upper' : row['Upper']
            })

df_disease_matches = pd.DataFrame(rows)

  disease_to_ids = df_dta_disabilities.groupby('fDISEASE')['ID_dta'].apply(set).to_dict()


In [201]:
# Only keep the lowest scores

# One row per (record, category ID) pair.
df_exploded = df_disease_matches.explode('Matching Category IDs')

# Sorting by Mean ascending and keeping the first duplicate retains, for each
# (Disease, category ID) pair, the row with the smallest Mean.
df_deduplicated = df_exploded.sort_values(by='Mean').drop_duplicates(
    subset=['Disease', 'Matching Category IDs'],
    keep='first',
)

# Collapse back to one row per (Disease, description), gathering the
# surviving category IDs into a de-duplicated list.
df_final = (
    df_deduplicated
    .groupby(['Disease', 'Health State Lay Description'])
    .agg({'Mean': 'min', 'Matching Category IDs': lambda ids: list(set(ids))})
    .reset_index()
)

In [202]:
# Write the final per-disease matches to an Excel file (index column omitted)
# and print the table.
df_final.to_excel('../Data/DW/df_disease_matches.xlsx', index=False)
print(df_final)

               Disease                       Health State Lay Description  \
0       Crohns Disease  has a high fever and pain, and feels very weak...   
1       Crohns Disease  has a swollen belly and swollen legs. the pers...   
2       Crohns Disease  has daily headaches, felt as dull pain and oft...   
3       Crohns Disease                     has low energy and feels cold.   
4       Crohns Disease  has mild mood swings, irritability and some di...   
5       Crohns Disease  has severe weight loss, weakness, fatigue, cou...   
6       Crohns Disease  has weight loss, fatigue, and frequent infecti...   
7       Crohns Disease  is completely blind, which causes great diffic...   
8       Crohns Disease  is hyperactive and has difficulty concentratin...   
9       Crohns Disease  is tired and has itching, cramps, headache, jo...   
10      Crohns Disease     is very tired and irritable and has diarrhea.    
11      Crohns Disease  tires easily, has nausea, reduced appetite and...   

In [203]:
# Number of rows in the final table.
len(df_final)

22

# Calculate Score per disease