# Merge and Data Preparation
This notebook merges the different DataFrames that contain patient features into a single cohort DataFrame.
    

In [None]:
##### REQUIRES THE DATAFRAME FOLDER TO BE NAMED 'Cohort', WHICH INCLUDES ALL PRECOMPUTED DATAFRAMES #####
import fiber
from fiber.cohort import Cohort
from fiber.condition import Patient, MRNs
from fiber.condition import Diagnosis
from fiber.condition import Measurement, Encounter, Drug, TobaccoUse,LabValue
from fiber.storage import yaml as fiberyaml
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
import os
import matplotlib.pyplot as plt
from functools import reduce
import json
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import category_encoders as ce


In [None]:
# Baseline cohort: read both phenotyping cohorts, index each one by MRN,
# stack them, and store the MRN index as strings.
Case_EF_ICD = (
    pq.read_table('Cohort/Phenotyping/ALL_Matches_1yr_HF_EF_ICD_Notes_Cohort.parquet')
    .to_pandas()
    .set_index('MRN', inplace=False)
)
Case_ICD = (
    pq.read_table('Cohort/Phenotyping/ALL_Matches_1yr_HF_ICD_Notes_Cohort.parquet')
    .to_pandas()
    .set_index('MRN', inplace=False)
)
Case_all = pd.concat([Case_EF_ICD, Case_ICD], ignore_index=False, sort=False)
Case_all.index = Case_all.index.map(str)

# Frames that get merged onto the cohort. Order matters: merge_dataframes reads
# the patient information from the first entry and treats every later entry as
# a feature frame.
df_forMerge = [
    pq.read_table('Cohort/Feature_Extraction/Unsupervised_ALL_HF/HF_ALL_Drugs_Count').to_pandas()
]
df_forMerge.extend(
    pq.read_table('Cohort/Feature_Extraction/Supervised_ALL_HF/' + feature_name + '.parquet').to_pandas()
    for feature_name in (
        # diagnoses
        'vascular_cognitive_impairment',
        'acute_myocardial_infarction',
        'anemia',
        'angina',
        'atrial_flutter_fibrillation',
        'cardiomyopathy',
        'chronic_kidney_disease',
        'COPD',
        'coronary_artery_disease',
        'depression',
        'diabetes_mellitus_type_I',
        'diabetes_mellitus_type_II',
        'diabetic_nephropathy',
        'dyspnea',
        'dysrhythmias',
        'edema',
        'hyperkalemia',
        'hyperlipidemia',
        'hypertension',
        'obesity',
        'peripheral_artery_disease',
        'pulmonary_hypertension',
        'rheumatic_heart_disease',
        'sleep_apnea',
        'stroke_broad',
        'stroke_hemorrhagic',
        'stroke_ischemic',
        'transient_ischemic_attack',
        'valve_disorder',
        # drugs
        # 'oral_diuretics',  (currently left out of the merge)
        'angiotensin_receptor_blockers',
        'beta_blockers',
        'entresto',
        'IV_diuretics',
        'long_acting_nitrates',
        'mineralocorticoid_receptor_anta',
    )
)

df_forMerge


In [None]:
# Columns that merge_dataframes removes from every feature frame before merging it.
col_for_dropping = [
    "age_in_days_icd",
    "Note_ID",
    "age_in_days_term",
    "Term",
    "HF_Onset_age_in_days",
    "HF_Onset_Type",
]

In [None]:
# Columns that merge_dataframes selects from the first frame of the merge list
# (patient information); 'medical_record_number' is the merge key.
col_patient_information = [
    'age_in_days',
    'date_of_birth',
    'month_of_birth',
    'gender',
    'religion',
    'race',
    'patient_ethnic_group',
    'deceased_indicator',
    'mother_account_number',
    'address_zip',
    'marital_status_code',
    'medical_record_number',
]

In [None]:
def merge_dataframes(df_master, df_list, col_for_dropping, col_patient_information, final_name):
    """Merge patient information and feature frames onto a cohort and save the result.

    Parameters
    ----------
    df_master : pandas.DataFrame
        Cohort frame; 'MRN' must be resolvable as a column or index level because
        it is the left merge key of the first merge.
    df_list : list of pandas.DataFrame
        ``df_list[0]`` supplies the patient information; every further entry is a
        feature frame that contains a 'medical_record_number' column.
    col_for_dropping : list of str
        Columns removed from each feature frame before it is merged.
    col_patient_information : list of str
        Columns selected from ``df_list[0]``; must include 'medical_record_number'.
    final_name : str
        Base name of the parquet files written below 'Cohort/Feature_Extraction/'.

    Returns
    -------
    pandas.DataFrame
        The merged cohort, indexed by 'medical_record_number'.

    Notes
    -----
    All merges are inner joins, so only patients present in every frame remain.
    Two files are written: the full frame and its first 1000 rows ('Sample_' prefix).
    """
    output_dir = 'Cohort/Feature_Extraction/'

    # Attach the patient information taken from the first frame of the list.
    patient_info = df_list[0][col_patient_information]
    df_master = (
        df_master
        .merge(patient_info, left_on='MRN', right_on='medical_record_number', how='inner')
        .set_index('medical_record_number', inplace=False)
    )

    # Merge the remaining frames one after another; the running position is
    # printed as a progress indicator.
    for position, feature_frame in enumerate(df_list[1:], start=1):
        print(position)
        features = feature_frame.drop(col_for_dropping, axis=1)
        # The index is cast to int before each merge so that it is compared
        # as integers against the feature frame's 'medical_record_number'.
        df_master.index = df_master.index.map(int)
        df_master = (
            df_master
            .merge(features, right_on="medical_record_number", left_index=True, how="inner")
            .set_index('medical_record_number', inplace=False)
        )

    # Columns not wanted in the final cohort; 'age_in_days_x'/'age_in_days_y'
    # are the suffixed pair produced by the first merge.
    unwanted_columns = [
        'Note_ID',
        'age_in_days_icd',
        'age_in_days_x',
        'HF_Onset_Type',
        'age_in_days_y',
        'date_of_birth',
        'month_of_birth',
        'Term',
    ]
    df_master = df_master.drop(unwanted_columns, axis=1)

    # Persist the full frame and a sample of its first 1000 rows.
    df_master.to_parquet(output_dir + final_name + '.parquet')
    df_master.head(1000).to_parquet(output_dir + 'Sample_' + final_name + '.parquet')
    return df_master
        


In [None]:
# Build the supervised cohort; merge_dataframes also writes it (and a sample) to disk.
df = merge_dataframes(
    df_master=Case_all,
    df_list=df_forMerge,
    col_for_dropping=col_for_dropping,
    col_patient_information=col_patient_information,
    final_name='ALL_HF_cohort_supervised_only_ever_diag_drug',
)

In [None]:
# Display the merged cohort returned by merge_dataframes.
df

In [None]:
# Reload the cohort that merge_dataframes wrote to disk, strip the demographic
# columns and the HF onset age, and store the result under the "FORMerge" name.
df_supervised_merge = (
    pq.read_table('Cohort/Feature_Extraction/ALL_HF_cohort_supervised_only_ever_diag_drug.parquet')
    .to_pandas()
    .drop(
        columns=[
            'gender',
            'religion',
            'race',
            'patient_ethnic_group',
            'deceased_indicator',
            'mother_account_number',
            'address_zip',
            'marital_status_code',
            'HF_Onset_age_in_days',
        ]
    )
)
df_supervised_merge.to_parquet('Cohort/Feature_Extraction/ALL_HF_cohort_supervised_only_ever_diag_drugFORMerge.parquet')

In [None]:
# Read the reduced "FORMerge" table back from disk and show it.
df_supervised_merge = pq.read_table(
    'Cohort/Feature_Extraction/ALL_HF_cohort_supervised_only_ever_diag_drugFORMerge.parquet'
).to_pandas()
df_supervised_merge


In [None]:

# Recode values equal to 1 as 'yes' and values equal to 0 as 'no', then save.
# NOTE(review): the recode is applied to every column of the frame, not only to
# indicator columns - confirm that no count/numeric column should keep its 0/1.
df_supervised_merge = (
    df_supervised_merge
    .replace(1, 'yes')
    .replace(0, 'no')
)
df_supervised_merge.to_parquet('Cohort/Feature_Extraction/Supervised_True_false.parquet')

In [None]:
# Display df_supervised_merge in its current state (after the yes/no recoding cell when run in order).
df_supervised_merge

# Remove MRNs that do not have any lab values

In [None]:
# Supervised cohort used for clustering; its index is cast to str.
clustering = pq.read_table(
    'Cohort/Feature_Extraction/ALL_HF_cohort_supervised_only_ever_diag_drug.parquet'
).to_pandas()
clustering.index = clustering.index.map(str)

# Reduced supervised cohort ("FORMerge" file); its index is cast to str as well.
merge = pq.read_table(
    'Cohort/Feature_Extraction/ALL_HF_cohort_supervised_only_ever_diag_drugFORMerge.parquet'
).to_pandas()
merge.index = merge.index.map(str)

# Frame listing the MRNs that are treated as having no lab values.
# NOTE(review): the file name mentions "0_8_missing_values" - presumably a
# missing-value threshold; confirm what exactly qualifies an MRN for this list.
mrn_without_lab = pq.read_table(
    'Cohort/Feature_Extraction/Unsupervised_ALL_HF/LabValue_after_onset_HF_ALL_mmm_0_8_missing_values'
).to_pandas()

In [None]:
# Collect the MRNs from mrn_without_lab as a plain Python list and show how many there are.
mrn_list = mrn_without_lab['medical_record_number'].tolist()
len(mrn_list)

In [None]:
# Display the clustering cohort before the MRNs in mrn_list are removed.
clustering

In [None]:
# Keep only the patients that are NOT listed in mrn_list and save the result.
#
# A boolean mask replaces DataFrame.drop(mrn_list): drop raises a KeyError as
# soon as one listed MRN is missing from the index, which can happen because the
# cohort was built with inner joins and may no longer contain every patient.
# The mask simply ignores such MRNs. MRNs are compared as strings because the
# index of `clustering` was cast to str when the frame was loaded.
mrns_to_remove = {str(mrn) for mrn in mrn_list}
clustering_wLab = clustering.loc[~clustering.index.isin(mrns_to_remove)]
clustering_wLab.to_parquet('Cohort/Feature_Extraction/ALL_HF_cohort_supervised_only_ever_diag_drug_wLab.parquet')

In [None]:

# Keep only the patients that are NOT listed in mrn_list and save the result.
#
# A boolean mask replaces DataFrame.drop(mrn_list): drop raises a KeyError as
# soon as one listed MRN is missing from the index, which can happen because the
# cohort was built with inner joins and may no longer contain every patient.
# The mask simply ignores such MRNs. MRNs are compared as strings because the
# index of `merge` was cast to str when the frame was loaded.
mrns_to_remove = {str(mrn) for mrn in mrn_list}
merge_wLab = merge.loc[~merge.index.isin(mrns_to_remove)]
merge_wLab.to_parquet('Cohort/Feature_Extraction/ALL_HF_cohort_supervised_only_ever_diag_drugFORMerge_wLab.parquet')

In [None]:
# Display the reduced cohort after the MRNs in mrn_list were removed.
merge_wLab