In [2]:
import numpy as np
import pandas as pd

# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

In [37]:
# Load the two cleaned input files from the Input/ folder, one frame per file.
df_angela, df_may = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Input/cleaned_data_AJ_v2.csv",
        "Input/cleaned_columns_MK_v2.csv",
    )
)

### Merge the cleaned datasets on their shared `Unnamed: 0` column

In [64]:
# Join the two cleaned frames column-wise on the shared 'Unnamed: 0' key.
# The join is an inner join (the pandas default), so only keys present in
# both frames are kept.
# NOTE(review): key uniqueness is not checked here; duplicate keys on either
# side would multiply rows - confirm the inputs are one row per key.
df_final = df_angela.merge(df_may, how='inner', on='Unnamed: 0')

In [65]:
## Final cleaning

# Convert each age bracket to the midpoint of its range: strip the "[" and ")"
# delimiters, then look the remaining "low-high" label up in the mapping
# ('0-10' -> 5, '10-20' -> 15, ..., '90-100' -> 95).
# Labels that are not in the mapping come out as NaN.
age_range_mapping = {f'{low}-{low + 10}': low + 5 for low in range(0, 100, 10)}
df_final['age'] = (
    df_final['age']
    .str.replace(r'\[|\)', '', regex=True)
    .map(age_range_mapping)
)

# Row filters followed by missing-value standardization, as one pipeline:
#   1. drop rows where discharge is death/hospice (cannot be readmitted):
#      discharge_disposition_id in 11, 13, 14, 19, 20, 21
#   2. drop rows where gender is 'Unknown/Invalid'
#   3. replace the "?" and "None" placeholder strings with None
df_final = (
    df_final
    .loc[lambda frame: ~frame['discharge_disposition_id'].isin([11, 13, 14, 19, 20, 21])]
    .loc[lambda frame: frame['gender'] != 'Unknown/Invalid']
    .replace(["?", "None"], [None, None])
)

### Drop old columns

In [68]:
# Groups of columns to remove from the final dataset, one list per theme.

# No medication columns are dropped: the list is deliberately empty.
# (An earlier, now disabled, version listed metformin, repaglinide, nateglinide,
# chlorpropamide, glimepiride, glipizide, glyburide, tolbutamide, pioglitazone,
# rosiglitazone, acarbose, miglitol, tolazamide, insulin, glyburide-metformin
# and glipizide-metformin here.)
med_columns = []

admission_columns = [
    'admission_type_id',
    'admission_source_id',
]
discharge_columns = [
    'discharge_disposition_id',
]
diagnosis_columns = [
    'diag_1',
    'diag_2',
    'diag_3',
]
test_columns = [
    'max_glu_serum',
]
change_columns = [
    'change',
    'med_change_agg',
]

In [69]:
# Flatten the per-theme lists into one list (same order as before) and remove
# those columns; drop() raises KeyError if any listed column is absent.
columns_to_drop = [
    *med_columns,
    *discharge_columns,
    *admission_columns,
    *diagnosis_columns,
    *test_columns,
    *change_columns,
]
df_final = df_final.drop(columns=columns_to_drop)

In [70]:
# Write the cleaned dataset to the working directory.
# index=True is the pandas default and is spelled out here on purpose.
# NOTE(review): this writes the unnamed row index as a leading column next to
# the existing 'Unnamed: 0' column - confirm downstream readers expect both,
# otherwise pass index=False.
df_final.to_csv("Final cleaned data.csv", index=True)