# Clean unsupervised features
This notebook can be used to clean the unsupervised extracted features. For numerical features (lab values and vital signs), it drops features for which more than a certain threshold of patients have no value, and it replaces outlier values with NaN so that they can be imputed later. Furthermore, measurements of the same type of test that come from different source systems, such as IBEX or EPIC, are dropped.

For categorical features (medications, diagnoses and procedures), the user can choose which further features should be dropped by entering the feature names.

In [1]:
##### REQUIRES THE DATAFRAME FOLDER TO BE NAMED 'Cohort' (THE PATHS BELOW START WITH 'Cohort/'), WHICH INCLUDES ALL PRECOMPUTED DATAFRAMES #####
import fiber
from fiber.cohort import Cohort
from fiber.condition import Patient, MRNs
from fiber.condition import Diagnosis
from fiber.condition import Measurement, Encounter, Drug, TobaccoUse,LabValue
from fiber.storage import yaml as fiberyaml
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
import os
import matplotlib.pyplot as plt
from functools import reduce
import json
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import category_encoders as ce

DB Password: ········


# Quantile function 

In [None]:
# Keep/drop decision for one numeric feature column, plus outlier blanking.
# A column is rejected when too many of its rows are NaN. The limits are absolute
# row counts (not percentages); their defaults are the previously hard-coded values.
# Outliers are replaced with NaN so that they can be imputed afterwards.
def cleanNumFeatures(df, featurename, max_initial_nan=12000, final_nan_cutoff=18000):
    """Blank out the outliers of one column in place and say whether to keep it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature; the column is modified in place.
    featurename : hashable
        Label of the numeric column to clean.
    max_initial_nan : int, optional
        The column is rejected (and left untouched) when it has more than this
        many NaN rows before cleaning.
    final_nan_cutoff : int, optional
        The column is rejected when it has this many NaN rows or more after the
        outliers were replaced.

    Returns
    -------
    bool
        True when the column should be kept, False when it should be dropped.
    """
    column = df[featurename]
    initial_nan = column.isna().sum()
    if initial_nan > max_initial_nan:
        return False
    print(initial_nan)

    observed = column.dropna()
    # Without any observed value there is no quantile to compute.
    if not observed.empty:
        upper_quantile = np.quantile(observed.values, 0.995, axis=0)
        lower_quantile = np.quantile(observed.values, 0.005, axis=0)
        # Strict comparisons: values equal to a quantile are kept. With ">="/"<="
        # a constant column, or one heavily tied at an extreme, would be blanked
        # entirely although none of its values is an outlier.
        outliers = (column > upper_quantile) | (column < lower_quantile)
        df.loc[outliers, featurename] = np.nan

    return bool(df[featurename].isna().sum() < final_nan_cutoff)

In [None]:
# Patient attribute columns that the lab and vital-sign cells remove before
# cleaning the numeric feature columns.
col_for_dropping = [
    'age_in_days',
    'date_of_birth',
    'month_of_birth',
    'gender',
    'religion',
    'race',
    'patient_ethnic_group',
    'deceased_indicator',
    'mother_account_number',
    'address_zip',
    'marital_status_code',
]

# Lab Values

In [None]:
# Load the extracted lab-value features.
lab_df = pq.read_table(
    'Cohort/Feature_Extraction/Unsupervised_ALL_HF/LabValue_after_onset_HF_ALL_mmm_0_8'
).to_pandas()

# Set the MRNs aside (they are re-attached at the end of the cell) and remove
# them together with the columns in col_for_dropping before cleaning.
further_col_drop = []
mrn = pd.DataFrame({'medical_record_number': lab_df['medical_record_number']})
lab_df = lab_df.drop(columns=['medical_record_number'] + list(col_for_dropping))

# Clean every remaining column in place and collect the ones to discard.
col_names = lab_df.columns
for c in col_names:
    print(c)
    keep_column = cleanNumFeatures(lab_df, c)
    if not keep_column:
        further_col_drop.append(c)
        print('drop')

lab_df = lab_df.drop(columns=further_col_drop)
# Re-attach the MRNs (assignment aligns on the row index).
lab_df['medical_record_number'] = mrn['medical_record_number']

In [None]:
# Show lab_df as the notebook cell output.
lab_df

In [None]:
# Write lab_df to the '..._cleaned' Parquet path.
lab_df.to_parquet('Cohort/Feature_Extraction/Unsupervised_ALL_HF/LabValue_after_onset_HF_ALL_mmm_0_8_cleaned')

## Check for NaNs in row 

In [None]:
# Read the '..._cleaned' lab-value Parquet file into pandas and show it.
lab_df_clean=pq.read_table('Cohort/Feature_Extraction/Unsupervised_ALL_HF/LabValue_after_onset_HF_ALL_mmm_0_8_cleaned').to_pandas()
lab_df_clean

In [None]:
# d: number of null cells per row of lab_df_clean (isnull() summed along axis=1).
d=lab_df_clean.isnull().sum(axis=1)

In [None]:
# NOTE(review): matplotlib.pyplot is already imported as plt in the first cell,
# so this re-import is redundant.
import matplotlib.pyplot as plt
# Histogram of the per-row null counts held in d.
# NOTE(review): bins=137 is hard-coded — presumably tied to the number of
# columns in lab_df_clean; confirm.
plt.hist(x=d, bins=137, color='#0504aa',alpha=0.7, rwidth=0.85)

# Exclude patients with no lab values

In [None]:
# Store the per-row null counts d as a new 'count_missing' column, then show the frame.
lab_df_clean['count_missing']=d
lab_df_clean

In [None]:
# Select the rows whose 'count_missing' is at least 136 and write them to the
# '..._missing_values' Parquet path; lab_df_clean itself is not filtered here.
# NOTE(review): 136 is hard-coded — presumably the number of lab feature columns,
# i.e. these would be the patients without any lab value; confirm.
lab_df_missing=lab_df_clean.loc[lab_df_clean['count_missing']>=136]
lab_df_missing.to_parquet('Cohort/Feature_Extraction/Unsupervised_ALL_HF/LabValue_after_onset_HF_ALL_mmm_0_8_missing_values')

# Vital Signs 

In [None]:
# Load the extracted vital-sign features.
vital_df = pq.read_table(
    'Cohort/Feature_Extraction/Unsupervised_ALL_HF/VitalSign_after_onset_HF_ALL_mmm_0_6'
).to_pandas()

# Set the MRNs aside (they are re-attached at the end of the cell) and remove
# them together with the columns in col_for_dropping before cleaning.
further_col_drop = []
mrn = pd.DataFrame({'medical_record_number': vital_df['medical_record_number']})
vital_df = vital_df.drop(columns=['medical_record_number'] + list(col_for_dropping))

# Clean every remaining column in place and collect the ones to discard.
col_names = vital_df.columns
for c in col_names:
    print(c)
    keep_column = cleanNumFeatures(vital_df, c)
    if not keep_column:
        further_col_drop.append(c)
        print('drop')

vital_df = vital_df.drop(columns=further_col_drop)
# Re-attach the MRNs (assignment aligns on the row index).
vital_df['medical_record_number'] = mrn['medical_record_number']

In [None]:
# Show vital_df as the notebook cell output.
vital_df

In [None]:
# Keep only the EPIC vital signs: the IBEX and TDS columns listed here are
# dropped because, per the original author's note, the values coming from the
# different systems are the same.
further_col_drop = [
    # max, IBEX
    'max__VitalSign__IBEX__(RESP)',
    'max__VitalSign__IBEX__(T-O)',
    'max__VitalSign__IBEX__(T-T)',
    'max__VitalSign__IBEX__DBP',
    'max__VitalSign__IBEX__PULSE',
    'max__VitalSign__IBEX__PULSE OXIMETRY',
    'max__VitalSign__IBEX__SBP',
    'max__VitalSign__IBEX__TEMPERATURE',
    # max, TDS
    'max__VitalSign__TDS__(RESP)',
    'max__VitalSign__TDS__(T-O)',
    'max__VitalSign__TDS__(T-T)',
    'max__VitalSign__TDS__DBP',
    'max__VitalSign__TDS__SBP',
    # median, IBEX
    'median__VitalSign__IBEX__(RESP)',
    'median__VitalSign__IBEX__(T-O)',
    'median__VitalSign__IBEX__(T-T)',
    'median__VitalSign__IBEX__DBP',
    'median__VitalSign__IBEX__PULSE',
    'median__VitalSign__IBEX__PULSE OXIMETRY',
    'median__VitalSign__IBEX__SBP',
    'median__VitalSign__IBEX__TEMPERATURE',
    # median, TDS
    'median__VitalSign__TDS__(RESP)',
    'median__VitalSign__TDS__(T-O)',
    'median__VitalSign__TDS__(T-T)',
    'median__VitalSign__TDS__DBP',
    'median__VitalSign__TDS__SBP',
    # min, IBEX
    'min__VitalSign__IBEX__(RESP)',
    'min__VitalSign__IBEX__(T-O)',
    'min__VitalSign__IBEX__(T-T)',
    'min__VitalSign__IBEX__DBP',
    'min__VitalSign__IBEX__PULSE',
    'min__VitalSign__IBEX__PULSE OXIMETRY',
    'min__VitalSign__IBEX__SBP',
    'min__VitalSign__IBEX__TEMPERATURE',
    # min, TDS
    'min__VitalSign__TDS__(RESP)',
    'min__VitalSign__TDS__(T-O)',
    'min__VitalSign__TDS__(T-T)',
    'min__VitalSign__TDS__DBP',
    'min__VitalSign__TDS__SBP',
]

In [None]:
# Drop the columns named in further_col_drop from vital_df.
# With pandas' default errors='raise', a listed label that is not a column
# raises KeyError.
vital_df=vital_df.drop(further_col_drop,axis=1)

In [None]:
# Write vital_df to the '..._cleaned' Parquet path.
vital_df.to_parquet('Cohort/Feature_Extraction/Unsupervised_ALL_HF/VitalSign_after_onset_HF_ALL_mmm_0_6_cleaned')

In [None]:
# Read the '..._cleaned' vital-sign Parquet file back into vital_df and show it.
vital_df=pq.read_table('Cohort/Feature_Extraction/Unsupervised_ALL_HF/VitalSign_after_onset_HF_ALL_mmm_0_6_cleaned').to_pandas()
vital_df

# Check for NaNs in row: 


In [None]:
# Read the '..._cleaned' vital-sign Parquet file into vital_df_clean and show it.
# NOTE(review): the path contains a doubled slash ('Unsupervised_ALL_HF//VitalSign...');
# presumably a typo for the same file the previous cell reads — confirm.
vital_df_clean=pq.read_table('Cohort/Feature_Extraction/Unsupervised_ALL_HF//VitalSign_after_onset_HF_ALL_mmm_0_6_cleaned').to_pandas()
vital_df_clean

In [None]:
# d: number of null cells per row of vital_df_clean; plotted as a histogram.
# NOTE(review): bins=25 is hard-coded — presumably tied to the column count; confirm.
d=vital_df_clean.isnull().sum(axis=1)
plt.hist(x=d, bins=25, color='#0504aa',alpha=0.7, rwidth=0.85)

In [None]:
# Show the per-row null counts as the notebook cell output.
d

# get missing values for Vital and Lab together: 


In [None]:
# Inner-join the cleaned vital-sign and lab-value frames on the MRN, so only
# patients present in both frames remain, then show the result.
# NOTE(review): if the earlier cell that adds 'count_missing' to lab_df_clean
# has been run, that column is carried into the merged frame as well.
df_vital_lab_clean = pd.merge(
    vital_df_clean,
    lab_df_clean,
    on='medical_record_number',
    how='inner',
)
df_vital_lab_clean

In [None]:
# d: number of null cells per row of the merged frame; plotted as a histogram.
# NOTE(review): bins=162 is hard-coded — presumably tied to the column count; confirm.
d=df_vital_lab_clean.isnull().sum(axis=1)
plt.hist(x=d, bins=162, color='#0504aa',alpha=0.7, rwidth=0.85)

In [None]:
# Number of rows in d whose null count is at least 136 (shown as cell output).
np.count_nonzero(d>=136)

# Procedure

In [None]:
# Load the extracted procedure features into pandas.
proc_df=pq.read_table('Cohort/Feature_Extraction/Unsupervised_ALL_HF/Procedure_after_onset_HF_ALL_mmm_0_8').to_pandas()

In [None]:
# Show proc_df as the notebook cell output.
proc_df

In [None]:
# Print and collect every proc_df column whose name does not contain 'EPIC',
# then show the collected list.
further_col_drop = []
for c in proc_df.columns:
    if 'EPIC' in c:
        continue
    print(c)
    further_col_drop.append(c)
further_col_drop

In [None]:
# Columns that the following cell removes from the procedure frame.
further_col_drop = [
    # patient attribute columns
    'age_in_days',
    'date_of_birth',
    'month_of_birth',
    'gender',
    'religion',
    'race',
    'patient_ethnic_group',
    'deceased_indicator',
    'mother_account_number',
    'address_zip',
    'marital_status_code',
    # IBEX entries
    'Procedure__IBEX__(RESP)',
    'Procedure__IBEX__(T-O)',
    'Procedure__IBEX__(T-T)',
    'Procedure__IBEX__DBP',
    'Procedure__IBEX__PULSE',
    'Procedure__IBEX__PULSE OXIMETRY',
    'Procedure__IBEX__SBP',
    'Procedure__IBEX__WEIGHT',
    # MSDW "not applicable" / "unknown" entries
    'Procedure__SYSTEM__MSDW_NOT APPLICABLE',
    'Procedure__SYSTEM__MSDW_UNKNOWN',
    # TDS entries
    'Procedure__TDS__(RESP)',
    'Procedure__TDS__(T-O)',
    'Procedure__TDS__(T-T)',
    'Procedure__TDS__DBP',
    'Procedure__TDS__HEIGHT',
    'Procedure__TDS__SBP',
    'Procedure__TDS__WEIGHT',
]

In [None]:
# Drop the columns named in further_col_drop from proc_df.
# With pandas' default errors='raise', a listed label that is not a column
# raises KeyError.
proc_df=proc_df.drop(further_col_drop,axis=1)

In [None]:
# Show proc_df as the notebook cell output.
proc_df

In [None]:
#4__93000: Electrocardiogram, routine ecg with at least 12 leads; with interpretation and report (time series) 
#Procedure__EPIC__7509: Electrocardiogram, complete (time series) 
#Procedure__MUSE ID__93000 :Electrocardiogram, complete (time series) | MUSE ID

In [None]:
# Write proc_df to the '..._cleaned' Parquet path.
proc_df.to_parquet('Cohort/Feature_Extraction/Unsupervised_ALL_HF/Procedure_after_onset_HF_ALL_mmm_0_8_cleaned')

# Diagnosis

In [None]:
# Load the extracted diagnosis features into pandas.
diag_df=pq.read_table('Cohort/Feature_Extraction/Unsupervised_ALL_HF/Diagnosis_after_onset_HF_ALL_mmm_0_2').to_pandas()

In [None]:
# Show diag_df as the notebook cell output.
diag_df

In [None]:
# Print every column name of diag_df, one per line.
for c in diag_df.columns: 
    print(c)

In [None]:
# Columns that a later cell removes from the diagnosis frame.
further_col_drop = [
    # EPIC entries
    'Diagnosis__EPIC__COORDINATION OF CARE',
    'Diagnosis__EPIC__ERRONEOUS ENCOUNTER-DISREGARD',
    'Diagnosis__EPIC__ESTABLISHED PATIENT',
    'Diagnosis__EPIC__FOLLOW UP',
    'Diagnosis__EPIC__FOLLOW-UP',
    'Diagnosis__EPIC__MEDICATION REFILL',
    'Diagnosis__EPIC__NEW PATIENT',
    'Diagnosis__EPIC__PROCEDURE ONLY',
    'Diagnosis__EPIC__REFILL REQUEST',
    'Diagnosis__EPIC__SHORTNESS OF BREATH',
    'Diagnosis__EPIC__SPEAK WITH PROVIDER',
    'Diagnosis__EPIC__TEST RESULTS',
    # MSDW "not applicable" / "unknown" entries
    'Diagnosis__SYSTEM__MSDW_NOT APPLICABLE',
    'Diagnosis__SYSTEM__MSDW_UNKNOWN',
]

In [None]:
# Show further_col_drop as the notebook cell output.
further_col_drop

In [None]:
# Drop the columns named in further_col_drop from diag_df.
# With pandas' default errors='raise', a listed label that is not a column
# raises KeyError.
diag_df=diag_df.drop(further_col_drop,axis=1)

In [None]:
# Show diag_df as the notebook cell output.
diag_df

In [None]:
# Write diag_df to the '..._cleaned' Parquet path.
diag_df.to_parquet('Cohort/Feature_Extraction/Unsupervised_ALL_HF/Diagnosis_after_onset_HF_ALL_mmm_0_2_cleaned')

In [None]:
# Read the cleaned diagnosis features back into diag_df and show them.
# The path uses the 'mmm_0_2' name so that this cell reloads the file written by
# the preceding to_parquet cell; it previously pointed at 'mmm_0_4_cleaned',
# which no cell in this notebook section writes.
diag_df=pq.read_table('Cohort/Feature_Extraction/Unsupervised_ALL_HF/Diagnosis_after_onset_HF_ALL_mmm_0_2_cleaned').to_pandas()
diag_df

# Drug

In [None]:
# Load the extracted drug features into pandas.
drug_df=pq.read_table('Cohort/Feature_Extraction/Unsupervised_ALL_HF/Drug_after_onset_HF_ALL_mmm_0_2').to_pandas()

In [None]:
# Show drug_df as the notebook cell output.
drug_df

In [None]:
# Print every column name of drug_df, one per line.
for c in drug_df.columns: 
    print(c)