In [1]:
import pandas as pd
import os

data_dir = r'D:\Learning (H)\研究生\WUSTL\Semester 3\Practicum in Data Analytics & Statistics ESE 527\Project\Data'

# Load the source tables.
# ICD9_CODE is forced to text: the codes in top_diagnosis_codes below are strings,
# and with type inference numeric-looking codes can be parsed as numbers (losing
# leading zeros), in which case they would never match in isin().
icu_df = pd.read_csv(os.path.join(data_dir, 'ICUSTAYS.csv'), usecols=['SUBJECT_ID'])
diagnosis_df = pd.read_csv(
    os.path.join(data_dir, 'DIAGNOSES_ICD.csv'),
    usecols=['SUBJECT_ID', 'ICD9_CODE'],
    dtype={'ICD9_CODE': str},
)
icd_diagnoses_df = pd.read_csv(
    os.path.join(data_dir, 'D_ICD_DIAGNOSES.csv'),
    usecols=['ICD9_CODE', 'LONG_TITLE'],
    dtype={'ICD9_CODE': str},
)

# Keep only the diagnosis rows whose code is one of the ten codes of interest.
top_diagnosis_codes = ['4019', '4280', '42731', '41401', '5849', '25000', '2724', '51881', '5990', '53081']
filtered_df = diagnosis_df[diagnosis_df['ICD9_CODE'].isin(top_diagnosis_codes)]

# Build one row per SUBJECT_ID with one indicator column per code.
patient_diagnosis_df = (
    filtered_df
    .assign(is_diagnosed=1)  # Constant marker; its default aggregation (mean) stays 1 for repeated codes
    .pivot_table(index='SUBJECT_ID', columns='ICD9_CODE', values='is_diagnosed', fill_value=0)
    .reindex(columns=top_diagnosis_codes, fill_value=0)  # Ensure all ten codes are present, in list order
    .reset_index()
)

# Replace the column index with a plain list of labels; this also drops the
# 'ICD9_CODE' name that pivot_table leaves on the columns axis.
patient_diagnosis_df.columns = ['SUBJECT_ID'] + top_diagnosis_codes

# Human-readable label for each code, matched by position with top_diagnosis_codes.
# NOTE(review): these labels do not look like the usual ICD-9 titles for the codes
# above (for example 4019 is normally an essential-hypertension code, yet it is
# labelled 'Acute Leukemia' here). The strings are left unchanged because later
# cells select columns by these exact names -- verify against
# icd_diagnoses_df['LONG_TITLE'] before relying on them.
top_diagnosis_names_df = pd.DataFrame({
    'ICD9_CODE': top_diagnosis_codes,
    'LONG_TITLE': [
        'Acute Leukemia',
        'Lymphoma',
        'Chronic Leukemia',
        'Lung Cancer',
        'Coronary Disease',
        'Heart Failure',
        'Hypertension',
        'Kidney Disease',
        'Heart Attack',
        'Type 2 Diabetes',
    ],
})

# Rename the code columns to their labels.
rename_mapping = dict(zip(top_diagnosis_names_df['ICD9_CODE'], top_diagnosis_names_df['LONG_TITLE']))
patient_diagnosis_df = patient_diagnosis_df.rename(columns=rename_mapping)


In [4]:
# Load the per-patient feature table.
data = pd.read_csv(
    r'D:\Learning (H)\研究生\WUSTL\Semester 3\Practicum in Data Analytics & Statistics ESE 527\Project\1114\all_patients_data_with_features.csv'
)

# Left join on SUBJECT_ID: every feature row is kept, and rows whose SUBJECT_ID is
# absent from patient_diagnosis_df get NaN in the diagnosis columns.
data = data.merge(right=patient_diagnosis_df, how='left', on='SUBJECT_ID')

# Preview only: a zero-filled copy is displayed; `data` itself still holds the NaNs.
data.fillna(0)

Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,GENDER,AGE,LOS_HOSPITAL,LOS_ICU,HOSPITAL_EXPIRE_FLAG,mean_CVP,mean_DiasBP,...,Acute Leukemia,Lymphoma,Chronic Leukemia,Lung Cancer,Coronary Disease,Heart Failure,Hypertension,Kidney Disease,Heart Attack,Type 2 Diabetes
0,22.0,165315.0,204798.0,0.0,65.0,1.144444,1.143750,0.0,7.859880,73.666667,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,23.0,152223.0,227807.0,1.0,71.0,5.496528,1.264074,0.0,9.100000,58.461538,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,25.0,129635.0,203487.0,1.0,59.0,3.534028,3.546574,0.0,10.700000,54.283019,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,26.0,197661.0,244882.0,1.0,72.0,6.988889,2.140683,0.0,11.454444,42.136364,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28.0,162569.0,225559.0,1.0,74.0,5.364583,1.122407,0.0,12.000000,45.500000,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30372,98768.0,127022.0,213468.0,0.0,85.0,1.940972,2.151123,1.0,12.703908,69.733333,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
30373,98769.0,141860.0,233981.0,0.0,81.0,10.794444,7.788854,0.0,4.333333,67.083333,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
30374,98794.0,190603.0,262848.0,1.0,78.0,4.145833,2.288796,0.0,9.200000,57.140000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30375,98797.0,105447.0,244147.0,1.0,88.0,0.662500,1.238171,1.0,8.707757,82.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
# Write the merged table to disk without the row index.
# NOTE(review): a later cell writes to this same path again after casting the
# diagnosis columns to int, so this file is overwritten on a full run.
data.to_csv(
    path_or_buf=r'D:\Learning (H)\研究生\WUSTL\Semester 3\Practicum in Data Analytics & Statistics ESE 527\Project\1114\full_data2.0.csv',
    index=False,
)

In [8]:
# Diagnosis indicator columns that should be stored as integers.
columns_to_convert = [
    "Acute Leukemia", "Lymphoma", "Chronic Leukemia", "Lung Cancer",
    "Coronary Disease", "Heart Failure", "Hypertension",
    "Kidney Disease", "Heart Attack", "Type 2 Diabetes"
]

# Fill NaNs with 0 and cast to int, restricted to the columns listed above.
# This is done on the frame itself (not via `data[column].fillna(..., inplace=True)`):
# an in-place call on a selected column is chained assignment, which pandas warns
# about and which does not update `data` when Copy-on-Write is enabled -- the
# following astype(int) would then fail on the remaining NaNs.
data = (
    data
    .fillna(dict.fromkeys(columns_to_convert, 0))
    .astype(dict.fromkeys(columns_to_convert, int))
)

# Persist the final table without the row index.
data.to_csv(r'D:\Learning (H)\研究生\WUSTL\Semester 3\Practicum in Data Analytics & Statistics ESE 527\Project\1114\full_data2.0.csv', index=False)