# Unsupervised Feature Extraction Pipeline
This notebook extracts features for a specific cohort across different dimensions, such as diagnoses or medications.
The user can adapt the following settings:
- `threshold` (how many patients should have this condition)
- `window` (the timeframe that should be considered)
- `aggfunc` (`any` --> yes or no; `count` --> number of occurrences of the concept; for `numeric_value` --> min, median and max)

In [None]:
import fiber
from fiber.cohort import Cohort
from fiber.condition import Patient, MRNs
from fiber.condition import Diagnosis
from fiber.condition import Measurement, Encounter, Drug, TobaccoUse, VitalSign, LabValue, Procedure
from fiber.storage import yaml as fiberyaml
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
import os
from functools import reduce 
from fiber.utils import Timer
import math

In [None]:
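# Unsupervised feature extraction - PIVOT config
# Each config cell below assigns DEFAULT_PIVOT_CONFIG, so only the most
# recently executed one takes effect.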

In [None]:
# CONFIG Diagnosis
# Pivot settings for the Diagnosis dimension: pivot on `description` and
# aggregate with 'any' (yes/no per the notebook intro).
# NOTE(review): window is presumably in days relative to `age_in_days` --
# confirm against the fiber documentation.
DEFAULT_PIVOT_CONFIG = {
    Diagnosis(): {
        'window': (-730, math.inf),
        'pivot_table_kwargs': {
            'columns': ['description'],
            'aggfunc': {'description': 'any'},
        },
        'threshold': 0.9,
    },
}

In [None]:
# CONFIG Procedure
# Pivot settings for the Procedure dimension: pivot on `description` and
# aggregate with 'any' (yes/no per the notebook intro).
# NOTE(review): window is presumably in days relative to `age_in_days` --
# confirm against the fiber documentation.
DEFAULT_PIVOT_CONFIG = {
    Procedure(): {
        'window': (-730, math.inf),
        'pivot_table_kwargs': {
            'columns': ['description'],
            'aggfunc': {'description': 'any'},
        },
        'threshold': 0.6,
    },
}

In [None]:
# CONFIG Drugs
# Pivot settings for the Drug dimension: pivot on `description` and
# aggregate with 'any' (yes/no per the notebook intro).
# NOTE(review): window is presumably in days relative to `age_in_days` --
# confirm against the fiber documentation.
DEFAULT_PIVOT_CONFIG = {
    Drug(): {
        'window': (-730, math.inf),
        'pivot_table_kwargs': {
            'columns': ['description'],
            'aggfunc': {'description': 'any'},
        },
        'threshold': 0.2,
    },
}

In [None]:
# CONFIG VitalSigns
# Pivot settings for the VitalSign dimension: pivot on `description` and
# summarise `numeric_value` by its min, median and max.
# NOTE(review): window is presumably in days relative to `age_in_days` --
# confirm against the fiber documentation.
DEFAULT_PIVOT_CONFIG = {
    VitalSign(): {
        'window': (-730, math.inf),
        'pivot_table_kwargs': {
            'columns': ['description'],
            'aggfunc': {'numeric_value': ['min', 'median', 'max']},
        },
        'threshold': 0.8,
    },
}

In [None]:
# CONFIG LabValue
# Pivot settings for the LabValue dimension: pivot on `description` and
# summarise `numeric_value` by its min, median and max.
# NOTE(review): window is presumably in days relative to `age_in_days` --
# confirm against the fiber documentation.
DEFAULT_PIVOT_CONFIG = {
    LabValue(): {
        'window': (-730, math.inf),
        'pivot_table_kwargs': {
            'columns': ['description'],
            'aggfunc': {'numeric_value': ['min', 'median', 'max']},
        },
        'threshold': 0.8,
    },
}

In [None]:
# Load both phenotyping cohorts from parquet and index each frame by MRN.
Case_EF_ICD = (
    pq.read_table('Cohort/Phenotyping/ALL_Matches_1yr_HF_EF_ICD_Notes_Cohort.parquet')
    .to_pandas()
    .set_index('MRN')
)
Case_ICD = (
    pq.read_table('Cohort/Phenotyping/ALL_Matches_1yr_HF_ICD_Notes_Cohort.parquet')
    .to_pandas()
    .set_index('MRN')
)

In [None]:
# Stack the two cohorts row-wise; the MRN index of each frame is kept as-is
# and the column order is left unsorted.
Case_all = pd.concat(
    [Case_EF_ICD, Case_ICD],
    sort=False,
)

In [None]:
# Show the combined cohort as the cell output.
Case_all

In [None]:
# Reduce the cohort to the two columns handed to the pivot step:
# the MRN index becomes `medical_record_number`, and the HF onset age
# (`HF_Onset_age_in_days`) becomes `age_in_days`.
Case = (
    Case_all["HF_Onset_age_in_days"]
    .to_frame()
    .reset_index(level=0)
    .rename(
        columns={
            "HF_Onset_age_in_days": "age_in_days",
            "MRN": "medical_record_number",
        }
    )
)

In [None]:
# Optional: try the pipeline on a small random sample of 50 rows.
# NOTE(review): this hint assigns to lowercase `case`, but the extraction
# step reads `Case`; assign to `Case` if the sample should actually be used.
#case = Case.sample(50)
# Show the prepared frame as the cell output.
Case

In [None]:
# Build a fiber cohort from the prepared `Case` frame and extract the pivoted
# features using whichever DEFAULT_PIVOT_CONFIG is currently defined.
#
# A batched variant (slices of 5000 rows, one parquet file per slice) is
# sketched in the commented-out lines below; it is not active.
#   for limit in range(0, len(Case), 5000):
#       temp = Case[limit:(limit + 5000)]
print("Begin of iteration: ")

p_condition = MRNs(Case)  # condition built directly from the dataframe
cohort = Cohort(p_condition)
result = cohort.get_pivoted_features(pivot_config=DEFAULT_PIVOT_CONFIG)

# Persisting the result is disabled; uncomment the matching line to save it.
#result.to_parquet('Cohort/Feature_Extraction/Unsupervised_ALL_HF/VitalSign_after_onset_HF_ALL_mmm_0_6' + str(limit))
#result.to_parquet('Cohort/Feature_Extraction/Unsupervised_ALL_HF/Procedure_after_onset_HF_ALL_mmm_0_6')

In [None]:
# Check the pipeline output by reading a previously saved result back in.
# The file must already exist on disk; the `to_parquet` calls in the
# extraction cell are commented out.
test = (
    pq.read_table('Cohort/Feature_Extraction/Unsupervised_ALL_HF/Procedure_after_onset_HF_ALL_mmm_0_6')
    .to_pandas()
)

In [None]:
# List every column of the reloaded result, one per line.
for column_label in test.columns:
    print(column_label)

In [None]:
# Column counts noted by the author per dimension
# (presumably the number of columns in `result` for each config -- unverified):
#   default:    12 col
#   Drugs:      67
#   vital:      60
#   Diagnosis:  74
#   Procedures: 391
#   LabValues:  621
# Show the extracted features as the cell output.
result