In [None]:
import pandas as pd
import re 
import numpy as np

# Function to extract numeric dose values from strings like "10 MG"
def extract_numeric_dose(dose):
    if isinstance(dose, str):
        match = re.search(r"[\d]+(?:\.\d+)?", dose)  
        return float(match.group()) if match else None
    return dose 

# Map full medication names back to their generic form (CLOZAPINE, RISPERIDONE, OLANZAPINE)
def get_medication_generic_name(med_name):
    for med in medications:
        if med in med_name:
            return med
    return None

# Function to check if any vital keyword is in "FLO_DISPLAY_NM"
def contains_vital(vital_name, target_list):
    return any(target in vital_name for target in target_list)

# Function to extract numeric values from vitals data
def extract_numeric_value(value):
    if isinstance(value, str):
        match = re.search(r"[\d]+(?:\.\d+)?", value)  # Extract number including decimals
        return float(match.group()) if match else None
    return value  # If already a number, return as is

# Map full FLO_DISPLAY_NM names back to generic names (HEIGHT, WEIGHT, BMI)
def get_vital_name(display_name):
    for vital in vitals:
        if vital in display_name:
            return vital
    return None


file_name = "data/S001_u7.5.2024_DI_from_3.2024_Merged_Demo_Geo_by_Tithi.xlsx"

columns_to_use = [
    'ID', 
    'GENDER', 
    'demo_age', 
    'PRIMARY_RACE', 
    'LANGUAGE', 
    'PRIMARY_ETHNICITY', 
    'D_Insur_at_pull', 
    'RPL_THEME1'
]

df = pd.read_excel(file_name, usecols=columns_to_use)


df['PRIMARY_RACE'] = df['PRIMARY_RACE'].replace(
    ["Declined / Not Available", "Choose not to Answer", "Unknown", ''],
    'Unknown'
)

df['LANGUAGE'] = df['LANGUAGE'].fillna('Unknown')
df['LANGUAGE'] = df['LANGUAGE'].replace(r'^\s*$', 'Unknown', regex=True)

df['PRIMARY_ETHNICITY'] = df['PRIMARY_ETHNICITY'].replace(
    ["Patient Refused", "Patient chooses not to answer", "Unknown/Not Specified",""],
    'Unknown'
)

df['D_Insur_at_pull'] = df['D_Insur_at_pull'].fillna('Unknown')
df['D_Insur_at_pull'] = df['D_Insur_at_pull'].replace(r'^\s*$', 'Unknown', regex=True)



categorical_columns = ['GENDER', 'PRIMARY_RACE', 'LANGUAGE', 'PRIMARY_ETHNICITY', 'D_Insur_at_pull']
df_encoded = pd.get_dummies(df, columns=categorical_columns)

df_encoded["RPL_THEME1"] = pd.to_numeric(df_encoded["RPL_THEME1"], errors="coerce")  # Convert non-numeric to NaN
df_encoded.loc[(df_encoded["RPL_THEME1"] < 0) | (df_encoded["RPL_THEME1"] > 1), "RPL_THEME1"] = np.nan  # Replace invalid values


# Load the Excel spreadsheet
data_path = "data/anonymized_H-43413 Qualifying Encounters.xlsx"
df_icd10 = pd.read_excel(data_path)

icd10_codes = ['F28', 'F20.5', 'F20.2', 'F21', 'F23', 'F24', 'F20.81', 'F25.8', 'F20.1']

df_icd10_binary = df_icd10.pivot_table(index='ID', columns='ICD-10', aggfunc='size', fill_value=0)
df_icd10_binary = df_icd10_binary.reindex(columns=icd10_codes, fill_value=0)
df_icd10_binary.reset_index(inplace=True)
df_encoded = df_encoded.merge(df_icd10_binary, on="ID", how="left")
df_encoded.fillna(0, inplace=True)
df_encoded[icd10_codes] = (df_encoded[icd10_codes] > 0).astype(int)


# Load the Excel file
file_path = "data/anonymized_H-43413 Amb Med Data.xlsx"
df_med = pd.read_excel(file_path)

medications = ["CLOZAPINE", "RISPERIDONE", "OLANZAPINE"]
df_med["MEDICATION NAME"] = df_med["MEDICATION NAME"].str.upper()
def contains_medication(med_name, target_list):
    return any(target in med_name for target in target_list)
df_filtered = df_med[df_med["MEDICATION NAME"].apply(lambda x: contains_medication(x, medications))].copy()


df_filtered["DOSE"] = df_filtered["DOSE"].apply(extract_numeric_dose)
df_filtered = df_filtered.dropna(subset=["DOSE"])
df_filtered["DOSE"] = df_filtered["DOSE"].astype(float)


df_filtered["GENERIC_MED_NAME"] = df_filtered["MEDICATION NAME"].apply(get_medication_generic_name)

df_stats = df_filtered.groupby(["ID", "GENERIC_MED_NAME"])["DOSE"].agg(["min", "max", "mean"]).reset_index()

# Pivot table to create separate columns for each medication and statistic
df_pivot = df_stats.pivot(index="ID", columns="GENERIC_MED_NAME", values=["min", "max", "mean"])

df_pivot.columns = [f"{stat}_{med}" for stat, med in df_pivot.columns]
df_pivot.reset_index(inplace=True)

for med in medications:
    for stat in ["min", "max", "mean"]:
        col_name = f"{stat}_{med}"
        if col_name not in df_pivot.columns:
            df_pivot[col_name] = 0  

df_encoded = df_encoded.merge(df_pivot, on="ID", how="left")

# Replace NaN values in the 9 medication-related columns with 0
for med in medications:
    for stat in ["min", "max", "mean"]:
        col_name = f"{stat}_{med}"
        df_encoded[col_name] = df_encoded[col_name].fillna(0)

# Load the vitals CSV file
vitals_file_path = "data/anonymized_H43413_vitals.csv"
df_vitals = pd.read_csv(vitals_file_path)
df_vitals["FLO_DISPLAY_NM"] = df_vitals["FLO_DISPLAY_NM"].str.upper()
vitals = ["HEIGHT", "WEIGHT", "BMI", "PULSE"]

df_filtered = df_vitals[df_vitals["FLO_DISPLAY_NM"].apply(lambda x: contains_vital(x, vitals))].copy()
df_filtered["ENTERED_VALUE"] = df_filtered["ENTERED_VALUE"].apply(extract_numeric_value)
df_filtered = df_filtered.dropna(subset=["ENTERED_VALUE"])
df_filtered["ENTERED_VALUE"] = df_filtered["ENTERED_VALUE"].astype(float)
df_filtered["VITAL_NAME"] = df_filtered["FLO_DISPLAY_NM"].apply(get_vital_name)

df_stats = df_filtered.groupby(["ID", "VITAL_NAME"])["ENTERED_VALUE"].agg(["min", "max", "mean"]).reset_index()

df_pivot = df_stats.pivot(index="ID", columns="VITAL_NAME", values=["min", "max", "mean"])
df_pivot.columns = [f"{stat}_{vital}" for stat, vital in df_pivot.columns]
df_pivot.reset_index(inplace=True)

# Ensure all vital-related variables exist in df_encoded, even if missing in df_pivot
for vital in vitals:
    for stat in ["min", "max", "mean"]:
        col_name = f"{stat}_{vital}"
        if col_name not in df_pivot.columns:
            df_pivot[col_name] = 0  # Fill missing columns with 0

# Merge with df_encoded
df_encoded = df_encoded.merge(df_pivot, on="ID", how="left")

# Replace NaN values in the new columns with 0
for vital in vitals:
    for stat in ["min", "max", "mean"]:
        col_name = f"{stat}_{vital}"
        df_encoded[col_name] = df_encoded[col_name].fillna(0)


# Load the anonymized blood pressure data
bp_df = pd.read_csv("data/anonymized_H43413_bp.csv")

bp_df['ID'] = bp_df['ID'].astype(str)
df_encoded['ID'] = df_encoded['ID'].astype(str)

# Compute blood pressure statistics for each patient
bp_stats = bp_df.groupby('ID').agg({
    'SYSTOLIC_BP': ['min', 'max', 'mean'],
    'DIASTOLIC_BP': ['min', 'max', 'mean']
}).reset_index()

bp_stats.columns = ['ID', 'SYSTOLIC_BP_min', 'SYSTOLIC_BP_max', 'SYSTOLIC_BP_mean',
                    'DIASTOLIC_BP_min', 'DIASTOLIC_BP_max', 'DIASTOLIC_BP_mean']

df_encoded = df_encoded.merge(bp_stats, on='ID', how='left')


# lab test data
lab_data_path = "data/anonymized_h43413_labs.csv"
lab_data = pd.read_csv(lab_data_path)

lab_tests = ["CREATININE", "BMC_GLUCOSE", "BMC_ALT(SGPT)", "BMC_AST(SGOT)"]

filtered_labs = lab_data[lab_data["RESULT_TEST_NM"].isin(lab_tests)]
filtered_labs["RESULT_VALUE_NUM"] = pd.to_numeric(filtered_labs["RESULT_VALUE_NUM"], errors="coerce")

patient_ids = df_encoded["ID"].unique()

filtered_labs = filtered_labs[filtered_labs["ID"].isin(patient_ids)]

lab_summary = (
    filtered_labs.groupby(["ID", "RESULT_TEST_NM"])["RESULT_VALUE_NUM"]
    .agg(["min", "max", "mean"])
    .reset_index()
)

lab_summary = lab_summary.pivot(index="ID", columns="RESULT_TEST_NM", values=["min", "max", "mean"])

lab_summary.columns = [f"{stat}_{test}" for stat, test in lab_summary.columns]
lab_summary.reset_index(inplace=True)

df_encoded = df_encoded.merge(lab_summary, on="ID", how="left")


# Problem list
problem_list_df = pd.read_excel("data/anonymized_H-43413 Data Add On.xlsx", sheet_name="Problem List")

problem_list_df['ID'] = problem_list_df['ID'].astype(str)
df_encoded['ID'] = df_encoded['ID'].astype(str)

# List of ICD-10 codes to check
icd10_codes = [
    "F17.200", "F20.9", "F39", "F25.9", "F43.10", "F29", "F32.A", "F41.1", "F41.9", "F11.20", "F31.9", "F22", 
    "F25.0", "F20.0", "F33.1", "F19.10", "F10.20", "F10.10", "F43.20", "F14.10", "F32.9", "F17.210", "F14.20", 
    "F99", "F25.1", "F12.10", "F19.90", "F90.9", "F33.9", "F11.21", "F33.3", "F10.21", "F11.90", "F12.20", 
    "F43.21", "F19.20", "F20.3", "F10.11", "F41.8", "F41.0", "F33.2", "F20.89", "F34.1"
]

df_encoded[icd10_codes] = 0

for index, row in problem_list_df.iterrows():
    patient_id = row['ID']
    icd_list = str(row['CURRENT ICD-10 LIST']).split(', ')
    
    if patient_id in df_encoded['ID'].values:
        for icd_code in icd10_codes:
            if icd_code in icd_list:
                df_encoded.loc[df_encoded['ID'] == patient_id, icd_code] = 1

def update_primary_race(df):
    # Process values for race and ethinicity
    df.loc[df['PRIMARY_RACE_Chinese'] == 1, 'PRIMARY_RACE_Asian'] = 1
    
    df.loc[df['PRIMARY_RACE_Native Hawaiian / Other Pacific Islander'] == 1, 'PRIMARY_RACE_Native Hawaiian / Pacific Islander'] = 1

    df.loc[df['PRIMARY_RACE_Other Pacific Islander'] == 1, 'PRIMARY_RACE_Native Hawaiian / Pacific Islander'] = 1
    
    df.loc[df['PRIMARY_RACE_American Indian or Alaskan Native'] == 1, 'PRIMARY_RACE_American Indian / Native American'] = 1
    
    df.loc[df['PRIMARY_ETHNICITY_Middle Eastern'] == 1, 'PRIMARY_RACE_Middle Eastern'] = 1
    
    df.loc[df[['PRIMARY_ETHNICITY_Russian', 'PRIMARY_ETHNICITY_American', 'PRIMARY_ETHNICITY_European']].sum(axis=1) > 0, 'PRIMARY_RACE_White'] = 1
    
    df.loc[df['PRIMARY_ETHNICITY_Asian Indian'] == 1, 'PRIMARY_RACE_Asian Indian'] = 1
    
    df.loc[df[['PRIMARY_ETHNICITY_African', 'PRIMARY_ETHNICITY_African American', 'PRIMARY_ETHNICITY_Cape Verdean', 'PRIMARY_ETHNICITY_Caribbean Islander', 'PRIMARY_ETHNICITY_Haitian', 'PRIMARY_ETHNICITY_Middle Eastern or North African']].sum(axis=1) > 0, 'PRIMARY_RACE_Black / African American'] = 1
    
    df.loc[df[['PRIMARY_ETHNICITY_Cambodian', 'PRIMARY_ETHNICITY_Chinese', 'PRIMARY_ETHNICITY_Filipino', 'PRIMARY_ETHNICITY_Japanese', 'PRIMARY_ETHNICITY_Korean', 'PRIMARY_ETHNICITY_Laotian', 'PRIMARY_ETHNICITY_Vietnamese']].sum(axis=1) > 0, 'PRIMARY_RACE_Asian'] = 1
    
    race_columns = [col for col in df.columns if col.startswith('PRIMARY_RACE_') and col != 'PRIMARY_RACE_Unknown']
    df['PRIMARY_RACE_Unknown'] = (df[race_columns].sum(axis=1) == 0).astype(int)
    
    ethnicity_columns = [col for col in df.columns if col.startswith('PRIMARY_ETHNICITY_') and col != 'PRIMARY_ETHNICITY_Unknown']
    df['PRIMARY_ETHNICITY_Unknown'] = (df[ethnicity_columns].sum(axis=1) == 0).astype(int)
    
    # Drop the specified columns
    df.drop(columns=['PRIMARY_RACE_Chinese', 'PRIMARY_RACE_Other Pacific Islander', 'PRIMARY_RACE_Native Hawaiian / Other Pacific Islander', 'PRIMARY_RACE_American Indian or Alaskan Native'], inplace=True)
    
    return df

df_encoded = update_primary_race(df_encoded)
print(df_encoded.head())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_labs["RESULT_VALUE_NUM"] = pd.to_numeric(filtered_labs["RESULT_VALUE_NUM"], errors="coerce")


      ID  demo_age  RPL_THEME1  GENDER_F  GENDER_M  GENDER_U  \
0  10000        49        0.49      True     False     False   
1  10001        67        0.75      True     False     False   
2  10002        80        0.79      True     False     False   
3  10004        83        0.55      True     False     False   
4  10005        82        0.66     False      True     False   

  PRIMARY_RACE_American Indian / Native American PRIMARY_RACE_Asian  \
0                                          False              False   
1                                          False              False   
2                                          False              False   
3                                          False              False   
4                                          False              False   

  PRIMARY_RACE_Asian Indian PRIMARY_RACE_Black / African American  ...  \
0                     False                                     1  ...   
1                     False             

  df.loc[df['PRIMARY_RACE_Chinese'] == 1, 'PRIMARY_RACE_Asian'] = 1
  df.loc[df['PRIMARY_RACE_Native Hawaiian / Other Pacific Islander'] == 1, 'PRIMARY_RACE_Native Hawaiian / Pacific Islander'] = 1
  df.loc[df['PRIMARY_RACE_American Indian or Alaskan Native'] == 1, 'PRIMARY_RACE_American Indian / Native American'] = 1
  df.loc[df['PRIMARY_ETHNICITY_Middle Eastern'] == 1, 'PRIMARY_RACE_Middle Eastern'] = 1
  df.loc[df[['PRIMARY_ETHNICITY_Russian', 'PRIMARY_ETHNICITY_American', 'PRIMARY_ETHNICITY_European']].sum(axis=1) > 0, 'PRIMARY_RACE_White'] = 1
  df.loc[df['PRIMARY_ETHNICITY_Asian Indian'] == 1, 'PRIMARY_RACE_Asian Indian'] = 1
  df.loc[df[['PRIMARY_ETHNICITY_African', 'PRIMARY_ETHNICITY_African American', 'PRIMARY_ETHNICITY_Cape Verdean', 'PRIMARY_ETHNICITY_Caribbean Islander', 'PRIMARY_ETHNICITY_Haitian', 'PRIMARY_ETHNICITY_Middle Eastern or North African']].sum(axis=1) > 0, 'PRIMARY_RACE_Black / African American'] = 1


In [None]:
import pandas as pd

# Function to get valid IDs from a spreadsheet
def get_valid_ids(file_path):
    try:
        
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path, usecols=['ID'])  # Load only ID column
        elif file_path.endswith('.xlsx'):
            df = pd.read_excel(file_path, usecols=['ID'])  # Load only ID column
        else:
            print(f"Unsupported file format: {file_path}")
            
            return set()
        # Load spreadsheet (CSV or Excel)
        if file_path ==  "data/anonymized_H-43413 Data Add On.xlsx":
            df = pd.read_excel("data/anonymized_H-43413 Data Add On.xlsx", sheet_name="Problem List", usecols=['ID'])
        

        return set(df['ID'].dropna().astype(str))  # Convert to string and remove NaNs
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return set()
    
# File paths
file_paths = [
    "data/anonymized_H-43413 Amb Med Data.xlsx",
    "data/anonymized_H43413_vitals.csv",
    "data/anonymized_H-43413 Qualifying Encounters.xlsx",
    "data/anonymized_h43413_labs.csv",
    "data/anonymized_H-43413 Data Add On.xlsx",
    "data/anonymized_H43413_bp.csv"
]

df_encoded_ids = set(df_encoded['ID'].astype(str)) 

valid_ids = df_encoded_ids 
for file in file_paths:
    valid_ids &= get_valid_ids(file) 
df_encoded = df_encoded[df_encoded['ID'].astype(str).isin(valid_ids)]

df_encoded = df_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))


  df_encoded = df_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))


In [3]:
df_encoded.to_pickle("df_encoded.pkl")