# Merge and Data Preparation
This notebook merges the different dimensions into one cohort dataframe and applies data preparation methods.

The user can choose between replacing all NaN values with zeros or applying simple imputation.
    

In [None]:
##### REQUIRES THE DATAFRAME FOLDER TO BE NAMED 'Cohort', WHICH INCLUDES ALL PRECOMPUTED DATAFRAMES #####
import fiber
from fiber.cohort import Cohort
from fiber.condition import Patient, MRNs
from fiber.condition import Diagnosis
from fiber.condition import Measurement, Encounter, Drug, TobaccoUse,LabValue
from fiber.storage import yaml as fiberyaml
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
import os
import matplotlib.pyplot as plt
from functools import reduce
import json
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import category_encoders as ce


In [None]:
# Baseline characteristics of the two case cohorts, each indexed by MRN
Case_EF_ICD = (
    pq.read_table('Cohort/Phenotyping/ALL_Matches_1yr_HF_EF_ICD_Notes_Cohort.parquet')
    .to_pandas()
    .set_index('MRN')
)
Case_ICD = (
    pq.read_table('Cohort/Phenotyping/ALL_Matches_1yr_HF_ICD_Notes_Cohort.parquet')
    .to_pandas()
    .set_index('MRN')
)

# Stack both cohorts and make the MRN index labels strings
Case_all = pd.concat([Case_EF_ICD, Case_ICD], ignore_index=False, sort=False)
Case_all.index = Case_all.index.map(str)

# Every dataframe that should be merged onto the cohort, as (parquet path, curation method).
# The curation method is the string later handed to dataCuration by merge_dataframes;
# '' leaves the dataframe untouched.
merge_sources = [
    ('Cohort/Feature_Extraction/Unsupervised_ALL_HF/Drug_after_onset_HF_ALL_mmm_0_4', ''),
    ('Cohort/Feature_Extraction/Unsupervised_ALL_HF/Diagnosis_after_onset_HF_ALL_mmm_0_4_cleaned', ''),
    # currently disabled:
    # ('Cohort/Feature_Extraction/Unsupervised_ALL_HF/Procedure_after_onset_HF_ALL_mmm_0_8_cleaned', ''),
    ('Cohort/Feature_Extraction/Unsupervised_ALL_HF/VitalSign_after_onset_HF_ALL_mmm_0_6_cleaned', 'Imputation'),
    ('Cohort/Feature_Extraction/Unsupervised_ALL_HF/LabValue_after_onset_HF_ALL_mmm_0_8_cleaned', 'Imputation'),
]
df_forMerge = [
    (pq.read_table(source_path).to_pandas(), curation_method)
    for source_path, curation_method in merge_sources
]


In [None]:
# Load the diagnosis feature table with the 0_2 suffix and display it for inspection
diagnosis_0_2_table = pq.read_table(
    'Cohort/Feature_Extraction/Unsupervised_ALL_HF/Diagnosis_after_onset_HF_ALL_mmm_0_2'
)
test = diagnosis_0_2_table.to_pandas()
test

In [None]:
# Print the dataframe part of every (dataframe, method) entry queued for merging
for merge_entry in df_forMerge:
    print(merge_entry[0])
    # merge_entry[0].shape would show only the dimensions

In [None]:
# Columns that merge_dataframes removes from each feature dataframe before merging it
col_for_dropping = [
    'age_in_days',
    'date_of_birth',
    'month_of_birth',
    'gender',
    'religion',
    'race',
    'patient_ethnic_group',
    'deceased_indicator',
    'mother_account_number',
    'address_zip',
    'marital_status_code',
]

In [None]:
# Columns that merge_dataframes selects from the first feature dataframe as patient
# information; 'medical_record_number' is the key used for merging.
col_patient_information = [
    'age_in_days',
    'date_of_birth',
    'month_of_birth',
    'gender',
    'religion',
    'race',
    'patient_ethnic_group',
    'deceased_indicator',
    'mother_account_number',
    'address_zip',
    'marital_status_code',
    'medical_record_number',
]

In [None]:
def dataCuration(df,method):
    """Handle the missing values of one feature dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature dataframe. For ``method='Imputation'`` it must contain a
        ``medical_record_number`` column.
    method : str
        ``'NaNToZero'``  - replace every NaN with 0.
        ``'Imputation'`` - fill the NaNs of the numeric columns with scikit-learn's
        ``SimpleImputer()`` in its default configuration (column mean, per the
        scikit-learn documentation).
        Any other value (e.g. ``''``) returns ``df`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The curated dataframe. With ``'Imputation'`` the row index of ``df`` is
        preserved, non-numeric columns are passed through untouched, imputed columns
        are float, and ``medical_record_number`` is the last column.

    Raises
    ------
    KeyError
        If ``method='Imputation'`` and ``df`` has no ``medical_record_number`` column.
    """
    if method=='NaNToZero':
        #fill the nans with 0
        return df.fillna(0)

    if method!='Imputation':
        return df

    #set the mrns aside so they are not imputed; they are added again below
    mrn=df['medical_record_number']
    features=df.drop('medical_record_number',axis=1)

    #select numeric columns by dtype kind; the np.float / np.int aliases used for this
    #before were removed in NumPy 1.24
    numerical_cols=features.select_dtypes(include=[np.number]).columns
    #a column without a single observed value has no mean to impute (SimpleImputer
    #would drop it and change the output shape), so such columns are left as they are
    imputable_cols=[c for c in numerical_cols if features[c].notna().any()]

    if imputable_cols:
        imputed=SimpleImputer().fit_transform(features[imputable_cols])
        #write the values back in place: keeps the original row index and column
        #labels, so the mrns re-attached below stay on their own rows
        features[imputable_cols]=imputed

    features['medical_record_number']=mrn
    return features

In [None]:
def merge_dataframes(df_master, df_list, col_for_dropping,col_patient_information, final_name):
    """Merge all feature dataframes onto the cohort and save the result as parquet.

    Parameters
    ----------
    df_master : pandas.DataFrame
        Cohort dataframe; ``'MRN'`` is used as the left merge key.
    df_list : list of (pandas.DataFrame, str)
        Feature dataframes with the curation method that is passed to ``dataCuration``
        for each of them. The first dataframe must contain all columns of
        ``col_patient_information``; every dataframe must contain
        ``medical_record_number``.
    col_for_dropping : list of str
        Columns removed from each feature dataframe before it is merged.
    col_patient_information : list of str
        Columns taken from the first feature dataframe and merged onto ``df_master``
        as patient information.
    final_name : str
        File name (without extension) of the parquet files written to
        ``Cohort/Feature_Extraction/``.

    Returns
    -------
    pandas.DataFrame
        The merged dataframe, indexed by ``medical_record_number``.

    Side effects
    ------------
    Writes ``<final_name>.parquet`` and ``Sample_<final_name>.parquet`` (first 1000
    rows) to ``Cohort/Feature_Extraction/``.
    """
    #patient information comes from the first dataframe of the list
    df_patient_information=df_list[0][0][col_patient_information]
    df_master=df_master.merge(df_patient_information, left_on='MRN',right_on='medical_record_number',how='inner')
    df_master=df_master.set_index('medical_record_number')

    for x,(df_features,method) in enumerate(df_list):
        print(x)
        #errors='ignore' removes every listed column that is present; the bare
        #try/except used here before skipped the whole drop as soon as one column
        #was missing and also hid unrelated errors
        df_merge=df_features.drop(columns=col_for_dropping,errors='ignore')
        df_merge=dataCuration(df_merge,method)
        df_master=df_master.merge(df_merge, right_on="medical_record_number", left_index=True, how="inner")
        df_master=df_master.set_index('medical_record_number')

    df_master=df_master.drop(['Note_ID','age_in_days_icd','age_in_days_x','HF_Onset_Type','age_in_days_y','date_of_birth','month_of_birth','Term'],axis=1)

    #saving the dataframe and a sample:
    df_master.to_parquet('Cohort/Feature_Extraction/'+final_name+'.parquet')
    sample=df_master.head(1000)
    sample.to_parquet('Cohort/Feature_Extraction/Sample_'+final_name+'.parquet')
    return df_master
        


In [None]:
# Merge all queued feature dataframes onto the cohort; merge_dataframes also writes
# the result (and a 1000-row sample) to Cohort/Feature_Extraction/
df = merge_dataframes(
    df_master=Case_all,
    df_list=df_forMerge,
    col_for_dropping=col_for_dropping,
    col_patient_information=col_patient_information,
    final_name='ALL_HF_cohort_unsupervised_only_after_onset_HF_ALL_all_any_all_mean_medium_cleaned2',
)

In [None]:
# Attach the supervised features to the merged unsupervised cohort.
# NOTE(review): the merge cell of this notebook saves under a name ending in
# '_cleaned2', while this cell reads '..._cleaned.parquet' - confirm which file is intended.
df_unsupervised = pq.read_table(
    'Cohort/Feature_Extraction/ALL_HF_cohort_unsupervised_only_after_onset_HF_ALL_all_any_all_mean_medium_cleaned.parquet'
).to_pandas()
df_supervised_merge = pq.read_table(
    'Cohort/Feature_Extraction/Supervised_True_false.parquet'
).to_pandas()

# Make the index labels of both frames strings before merging
for merge_side in (df_unsupervised, df_supervised_merge):
    merge_side.index = merge_side.index.map(str)

df_unsupervised = df_unsupervised.merge(
    df_supervised_merge,
    left_on='medical_record_number',
    right_on='medical_record_number',
)
df_unsupervised

In [None]:
# Persist the cohort that now includes the supervised features
wSupervised_path = 'Cohort/Feature_Extraction/ALL_HF_cohort_unsupervised_only_after_onset_HF_ALL_all_any_all_mean_medium_cleaned_wSupervised.parquet'
df_unsupervised.to_parquet(wSupervised_path)

In [None]:
# Display the merged cohort dataframe returned by merge_dataframes
df

In [None]:
# Read the vital sign means from parquet and display them
vital_sign_table = pq.read_table(
    'Cohort/Feature_Extraction/Unsupervised_ALL_HF/HF_ALL_Vital_Sign_Mean'
)
df_vital_sign = vital_sign_table.to_pandas()
df_vital_sign

In [None]:
# Show the last (dataframe, method) entry queued for merging.
# df_forMerge holds only four entries in this notebook (the Procedure entry is
# commented out), so the former hard-coded index 4 raised an IndexError on a fresh run.
df_forMerge[-1]

In [None]:
# Print the shape of each individual feature dataframe
for feature_path in (
    'Cohort/Feature_Extraction/Unsupervised_ALL_HF/LabValue_after_onset_HF_ALL_mmm_0_6',
    'Cohort/Feature_Extraction/Unsupervised_ALL_HF/VitalSign_after_onset_HF_ALL_mmm_0_6',
    'Cohort/Feature_Extraction/Unsupervised_ALL_HF/Diagnosis_after_onset_HF_ALL_mmm_0_4',
    'Cohort/Feature_Extraction/Unsupervised_ALL_HF/Procedures_after_onset_HF_ALL_mmm_0_4',
    'Cohort/Feature_Extraction/Unsupervised_ALL_HF/Drug_after_onset_HF_ALL_mmm_0_4',
):
    r = pq.read_table(feature_path).to_pandas()
    print(r.shape)

# Remove MRNs that do not have any lab values

In [None]:
# Load, in this order: the small cohort, the medium cohort, and the table of MRNs
# without lab values
small, medium, mrn_without_lab = (
    pq.read_table(parquet_path).to_pandas()
    for parquet_path in (
        'Cohort/Feature_Extraction/ALL_HF_cohort_unsupervised_only_after_onset_HF_ALL_all_any_all_mean_small_cleaned.parquet',
        'Cohort/Feature_Extraction/ALL_HF_cohort_unsupervised_only_after_onset_HF_ALL_all_any_all_mean_medium_cleaned.parquet',
        'Cohort/Feature_Extraction/Unsupervised_ALL_HF/LabValue_after_onset_HF_ALL_mmm_0_8_missing_values',
    )
)

In [None]:
# Collect the MRNs of the missing-lab-value table in a plain list and show how many there are
mrn_column = mrn_without_lab['medical_record_number']
mrn_list = mrn_column.tolist()
len(mrn_list)

In [None]:
# Remove the rows whose index label is one of the MRNs in mrn_list and save the result
small_wLab=small.drop(index=mrn_list)
small_wLab.to_parquet('Cohort/Feature_Extraction/ALL_HF_cohort_unsupervised_only_after_onset_HF_ALL_all_any_all_mean_small_cleaned_wLab.parquet')
# The display expression has to be the last line of the cell; placed before
# to_parquet it was evaluated but never rendered
small_wLab

In [None]:
# Remove the rows whose index label is one of the MRNs in mrn_list and save the result
medium_wLab=medium.drop(index=mrn_list)
medium_wLab.to_parquet('Cohort/Feature_Extraction/ALL_HF_cohort_unsupervised_only_after_onset_HF_ALL_all_any_all_mean_medium_cleaned_wLab.parquet')
# The display expression has to be the last line of the cell; placed before
# to_parquet it was evaluated but never rendered
medium_wLab