# Get Baseline Characteristics
This notebook extracts baseline characteristics, such as age and gender, for a specific group of patients identified by their medical record numbers (MRNs).

In [None]:
##### REQUIRES THE DATAFRAME FOLDER TO BE NAMED 'Cohorts', WHICH INCLUDES ALL PRECOMPUTED DATAFRAMES #####
import fiber
from fiber.cohort import Cohort
from fiber.condition import Patient, MRNs
from fiber.condition import Diagnosis
from fiber.condition import Measurement, Encounter, Drug, TobaccoUse,LabValue
from fiber.storage import yaml as fiberyaml
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
import os
import matplotlib.pyplot as plt
from functools import reduce
import json

In [None]:
#generic function get cohort 
def df_to_cohort(df):
    """Build a fiber ``Cohort`` from the values in a DataFrame's index.

    Args:
        df: DataFrame whose index values are used as MRNs.

    Returns:
        A ``Cohort`` built from an ``MRNs`` condition over the index values,
        each converted to ``str``.
    """
    mrn_strings = [str(mrn) for mrn in df.index.values]
    return Cohort(MRNs(mrn_strings))

# Baseline Characteristics for HF ICD & EF Cohort

In [None]:
# Read the precomputed HF EF/ICD notes cohort and build a copy keyed by MRN.
notes_ICD_EF_df = pq.read_table(
    'ALL_Matches_1yr_HF_EF_ICD_Notes_Cohort.parquet'
).to_pandas()
# set_index returns a new frame by default, so notes_ICD_EF_df keeps its 'MRN' column.
notes_ICD_EF_df_index = notes_ICD_EF_df.set_index('MRN')


In [None]:
# Show the cohort as loaded (MRN is still a regular column in this frame).
# NOTE(review): the rendered output contains MRNs; clear it before sharing the notebook.
notes_ICD_EF_df

In [None]:
# Drop patients over 90: keep only rows whose HF onset age is strictly below
# 32850 days (90 * 365). Rows at exactly 32850 days, and rows with a missing
# onset age, are removed as well.
notes_ICD_EF_df_index = notes_ICD_EF_df_index.loc[
    notes_ICD_EF_df_index['HF_Onset_age_in_days'].lt(32850)
]
notes_ICD_EF_df_index

In [None]:
# Build a cohort from the MRNs in the filtered frame's index, then display it.
cohort_EF_Baseline = df_to_cohort(notes_ICD_EF_df_index)
cohort_EF_Baseline

In [None]:
# Fetch the Patient() data for the cohort.
# NOTE(review): the name is rebound from the cohort to the fetched result, so
# re-running this cell alone would call .get() on that result instead of the
# cohort; re-run the cell that builds the cohort first.
cohort_EF_Baseline=cohort_EF_Baseline.get(Patient())

In [None]:
# Display the fetched Patient() data.
cohort_EF_Baseline

In [None]:
# Convert every index label to str (this mutates the frame's index in place).
# Presumably needed so the MRN keys compare equal to 'medical_record_number'
# in the merge below — TODO confirm the dtype of that column.
notes_ICD_EF_df_index.index = notes_ICD_EF_df_index.index.map(str)

In [None]:
#cohort_EF_Baseline_index=cohort_EF_Baseline.set_index('medical_record_number', inplace=False)


In [None]:
# Inner-join the cohort rows with the fetched patient data, matching 'MRN' on
# the left against 'medical_record_number' on the right. Rows without a match
# on the other side are dropped.
notes_ICD_EF_df_index_baseline = notes_ICD_EF_df_index.merge(
    cohort_EF_Baseline,
    how='inner',
    left_on='MRN',
    right_on='medical_record_number',
)

In [None]:
# Display the merged cohort + patient frame.
notes_ICD_EF_df_index_baseline

In [None]:
# Persist the merged frame as parquet; the path is relative to the working directory.
notes_ICD_EF_df_index_baseline.to_parquet('Baseline_Matches_1yr_HF_EF_ICD_Notes_Cohort.parquet')

In [None]:
# Histogram of the 'race' column of the merged frame.
# NOTE(review): if 'race' holds category labels, value_counts().plot.bar()
# may give a clearer chart — confirm the column's dtype.
notes_ICD_EF_df_index_baseline['race'].hist()

In [None]:
#function for statistics: 
def get_base_characteristic_value(df, characteristic, kind):
    """Summarise one column of ``df`` as a ``Variable``/``Value`` table.

    Args:
        df: DataFrame to summarise.
        characteristic: Name of the column in ``df`` to summarise.
        kind: ``"mean"`` to report the column's mean, standard deviation,
            maximum and minimum; ``"count"`` to report, for every distinct
            value, the number of rows holding it and that number divided by
            the total row count of ``df``.

    Returns:
        A DataFrame with the columns ``Variable`` and ``Value``.
        For ``"mean"`` the variables are ``<characteristic>_mean``, ``_std``,
        ``_max`` and ``_min``. For ``"count"`` each distinct value yields
        ``<value>_total`` and ``<value>_relation`` in order of first
        appearance; missing values, if any, are reported last as
        ``missing_total`` / ``missing_relation``. An empty ``df`` yields an
        empty table for ``"count"``.

    Raises:
        ValueError: If ``kind`` is neither ``"mean"`` nor ``"count"``.
    """
    if kind == "mean":
        column = df[characteristic]
        return pd.DataFrame({
            'Variable': [characteristic + "_mean", characteristic + "_std",
                         characteristic + "_max", characteristic + "_min"],
            'Value': [column.mean(), column.std(), column.max(), column.min()],
        })

    if kind == "count":
        column = df[characteristic]
        total_rows = df.shape[0]
        rows = []

        def add_rows(label, count):
            # One absolute and one relative row per category.
            rows.append((label + "_total", count))
            rows.append((label + "_relation", count / total_rows))

        # unique() returns values in order of first appearance. Missing values
        # are excluded here because `column == NaN` never matches; they are
        # counted explicitly below.
        for value in column.dropna().unique():
            # str() lets non-string categories (ints, bools, ...) be labelled.
            add_rows(str(value), int((column == value).sum()))

        missing_count = int(column.isna().sum())
        if missing_count:
            add_rows("missing", missing_count)

        # Build the frame once instead of appending row by row
        # (DataFrame.append no longer exists in pandas 2.x). dtype=object
        # keeps integer counts as integers next to the float fractions.
        return pd.DataFrame(rows, columns=["Variable", "Value"], dtype=object)

    raise ValueError(
        "kind must be 'mean' or 'count', got {!r}".format(kind)
    )

In [None]:
def get_base_characteristics(df, characteristics):
    """Summarise several columns of ``df`` and stack the results.

    Args:
        df: DataFrame to summarise.
        characteristics: Iterable of entries whose first element is a column
            name and whose second element is the summary kind passed to
            ``get_base_characteristic_value``.

    Returns:
        A DataFrame with the columns ``Variable`` and ``Value`` holding the
        per-column summaries stacked in the order given. Each summary keeps
        its own row labels, so index values repeat. An empty
        ``characteristics`` yields an empty table. The table is also printed.
    """
    summaries = [
        get_base_characteristic_value(df, characteristic[0], characteristic[1])
        for characteristic in characteristics
    ]
    if summaries:
        # Concatenate once instead of growing the frame inside the loop.
        base_characteristics_cohort = pd.concat(summaries)
    else:
        # pd.concat raises on an empty list, so return the empty table directly.
        base_characteristics_cohort = pd.DataFrame(columns=["Variable", "Value"])
    print(base_characteristics_cohort)
    return base_characteristics_cohort

In [None]:
# Columns to summarise, as [column name, summary kind] pairs.
base_characteristics = [
    ["HF_Onset_age_in_days", "mean"],
    ["gender", "count"],
]

In [None]:
# Summarise the columns listed in base_characteristics for the merged EF/ICD
# cohort. get_base_characteristics also prints the table, so it appears in the
# output both as printed text and as the cell's displayed value.
ICD_EF_baseline=get_base_characteristics(notes_ICD_EF_df_index_baseline, base_characteristics)
ICD_EF_baseline

In [None]:
# Same summary for the feature-extraction cohort. Here onset ages above 32850
# days are capped at 32850 rather than filtered out.
# NOTE(review): this reads from 'Cohort/...' while the import cell mentions a
# folder named 'Cohorts', and it overwrites the ICD_EF_baseline computed for
# the notes cohort — confirm both are intended.
a = pq.read_table(
    'Cohort/Feature_Extraction/ALL_HF_cohort_unsupervised_only_after_onset_HF_ALL_all_any_all_mean_medium_cleaned.parquet'
).to_pandas()
a.loc[a['HF_Onset_age_in_days'] > 32850, 'HF_Onset_age_in_days'] = 32850
ICD_EF_baseline = get_base_characteristics(a, base_characteristics)
ICD_EF_baseline

In [None]:
# Print the number of rows for each distinct 'gender' value, in order of
# first appearance.
feature_value = notes_ICD_EF_df_index_baseline['gender'].unique()
for value in feature_value:
    # Summing the boolean mask gives the same number as the filtered row count.
    print((notes_ICD_EF_df_index_baseline['gender'] == value).sum())

#  Baseline Characteristics for HF ICD Cohort

In [None]:
#load cohort and convert MRN to Index 
notes_ICD_df=pq.read_table('ALL_Matches_1yr_HF_ICD_Notes_Cohort.parquet').to_pandas()
notes_ICD_df_index=notes_ICD_df.set_index('MRN', inplace=False)


In [None]:
# Display the MRN-indexed ICD cohort before filtering.
# NOTE(review): the rendered output contains MRNs; clear it before sharing the notebook.
notes_ICD_df_index

In [None]:
# filter all patients over 90 out 
notes_ICD_df_index=notes_ICD_df_index.loc[notes_ICD_df_index['HF_Onset_age_in_days']<32850]
#notes_ICD_df_index.loc[notes_ICD_df_index['HF_Onset_age_in_days']>32850]
notes_ICD_df_index

In [None]:
# Build a cohort from the MRNs in the filtered frame's index, then display it.
cohort_ICD_Baseline = df_to_cohort(notes_ICD_df_index)
cohort_ICD_Baseline

In [None]:
# Fetch the Patient() data for the cohort.
# NOTE(review): the name is rebound from the cohort to the fetched result, so
# re-running this cell alone would call .get() on that result instead of the
# cohort; re-run the cell that builds the cohort first.
cohort_ICD_Baseline=cohort_ICD_Baseline.get(Patient())

In [None]:
# Display the fetched Patient() data.
cohort_ICD_Baseline

In [None]:
# Convert every index label to str (this mutates the frame's index in place).
# Presumably needed so the MRN keys compare equal to 'medical_record_number'
# in the merge below — TODO confirm the dtype of that column.
notes_ICD_df_index.index = notes_ICD_df_index.index.map(str)

In [None]:
# Inner-join the cohort rows with the fetched patient data, matching 'MRN' on
# the left against 'medical_record_number' on the right. Rows without a match
# on the other side are dropped.
notes_ICD_df_index_baseline = notes_ICD_df_index.merge(
    cohort_ICD_Baseline,
    how='inner',
    left_on='MRN',
    right_on='medical_record_number',
)

In [None]:
# Display the merged cohort + patient frame.
notes_ICD_df_index_baseline

In [None]:
# Persist the merged frame as parquet; the path is relative to the working directory.
notes_ICD_df_index_baseline.to_parquet('Baseline_Matches_1yr_HF_ICD_Notes_Cohort.parquet')

In [None]:
# Summarise the columns listed in base_characteristics for the merged ICD
# cohort; get_base_characteristics prints the resulting table.
ICD_baseline=get_base_characteristics(notes_ICD_df_index_baseline, base_characteristics)