In [None]:
from google.cloud import bigquery
import os
import pandas as pd
import sys
import yaml

### Set environment variables to point to your GCP BQ credentials, and set your google project
For convenience, you can set these in your .bash_profile, or have them set automatically whenever you activate your [healthrex_ml environment](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#macos-and-linux).

In [None]:
# Point the Google client libraries at your application-default credentials
# and project. `setdefault` only fills in a value when the variable is not
# already set, so anything exported from your shell (e.g. in .bash_profile or
# a conda activation hook) takes precedence over the fallbacks below.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    # `~` expands to the current user's home directory, so this fallback does
    # not need to be edited per user.
    os.path.expanduser(
        '~/.config/gcloud/application_default_credentials.json'
    ),
)
# Edit to point to your project if it differs
os.environ.setdefault('GCLOUD_PROJECT', 'som-nero-phi-jonc101')

# Instantiate a client object so you can make queries
client = bigquery.Client()

### Define an experiment name and a run name

In [None]:
# Your SUNet ID; it is appended to the experiment name below.
suid = ''
# Experiment identifier: a fixed prefix followed by the SUNet ID.
EXPERIMENT_NAME = "20230118_workshop_{}".format(suid)
# Identifier for this run within the experiment.
RUN_NAME = "baseline_cbc_models"

### Define a cohort, or use an existing one in the cohorts module
When called, a `Cohort` object will execute a query and create a cohort table in BigQuery at `working_project_id.dataset_name.table_name` with the following columns:
1. anon_id : id of the patient 
2. observation_id : id of the ML example (observation)
3. index_time : timestamp indicating when the prediction is to be made (never use features that exist after index_time)
4. label_1 : label
5. label_N : label
In the example below we have N=4 binary labels. The task is `multilabel`, not `multiclass`: more than one of the binary labels can take the value 1 in the same example.

We'll use the `CBCWithDifferentialCohort`, which you can find [here](https://github.com/HealthRex/healthrex_ml/blob/main/healthrex_ml/cohorts/starr_cohorts.py#L479) 

In [None]:
from healthrex_ml.cohorts import CBCWithDifferentialCohort 
# Configure the cohort; the table name combines the experiment and run names.
cohort = CBCWithDifferentialCohort(
    client=client,
    working_project_id='mining-clinical-decisions',
    dataset_name='devworkshop',
    table_name=f"{EXPERIMENT_NAME}_{RUN_NAME}_cohort",
)
# Invoke the cohort object (per the notes above, this runs the cohort query
# and creates the cohort table).
cohort()

### Define a set of extractors

Extractor definitions can be found in the `healthrex_ml.extractors` module.

In [None]:
from healthrex_ml.extractors import (
    AgeExtractor,
    RaceExtractor,
    SexExtractor,
    EthnicityExtractor,
    ProcedureExtractor,
    PatientProblemExtractor,
    MedicationExtractor,
    LabOrderExtractor,
    LabResultBinsExtractor,
    FlowsheetBinsExtractor
)

# Extractor classes used to build the feature matrix.
USED_EXTRACTORS = [
    AgeExtractor,
    RaceExtractor,
    SexExtractor,
    EthnicityExtractor,
    ProcedureExtractor,
    PatientProblemExtractor,
    MedicationExtractor,
    LabOrderExtractor,
    LabResultBinsExtractor,
    FlowsheetBinsExtractor,
]

# Fully qualified table ids of the form `project.dataset.table`.
cohort_table = f"{cohort.project_id}.{cohort.dataset_name}.{cohort.table_name}"
# Prefix the feature table with EXPERIMENT_NAME (which embeds the SUNet ID),
# mirroring the cohort table's naming, so that users sharing one dataset do
# not write to the same feature-matrix table.
feature_table = (
    f"{cohort.project_id}.{cohort.dataset_name}."
    f"{EXPERIMENT_NAME}_{RUN_NAME}_feature_matrix"
)
# Instantiate every extractor against the same cohort and feature tables.
extractors = [
    ext(cohort_table_id=cohort_table, feature_table_id=feature_table)
    for ext in USED_EXTRACTORS
]

### Define a featurizer and create a feature matrix

Calling the featurizer executes a series of SQL queries defined by the extractors to build up a long-form feature matrix and saves it to BigQuery. It then reads the long-form feature matrix back in and builds a sparse (CSR) matrix without doing the expensive pivot operation, and saves the result locally. A train/test split is generated automatically, using the last year of data as the test set; use the `train_years` and `test_years` arguments of the `__init__` function to modify this.

Implementation of [BagOfWordsFeaturizer](https://github.com/HealthRex/healthrex_ml/blob/main/healthrex_ml/featurizers/starr_featurizers.py#L239)

In [None]:
from healthrex_ml.featurizers import BagOfWordsFeaturizer

# Configure the featurizer with the cohort/feature tables and the extractors
# defined above; local artifacts go under a run-specific directory.
featurizer = BagOfWordsFeaturizer(
    extractors=extractors,
    cohort_table_id=cohort_table,
    feature_table_id=feature_table,
    tfidf=True,
    outpath=f"./{RUN_NAME}_artifacts",
)
# Invoke the featurizer to build the feature matrix.
featurizer()

### Train a set of gradient boosted trees

Implementation of [LightGBMTrainer](https://github.com/HealthRex/healthrex_ml/blob/main/healthrex_ml/trainers/sklearn_trainers.py#L23)

In [None]:
from healthrex_ml.trainers import LightGBMTrainer

# The trainer works out of the same run-specific artifacts directory that was
# passed to the featurizer as `outpath`.
trainer = LightGBMTrainer(working_dir=f"./{RUN_NAME}_artifacts")

# One binary task per label column.
tasks = [
    'label_PLT',
    'label_HCT',
    'label_WBC',
    'label_HGB',
]

# Invoke the trainer once per task.
for task in tasks:
    trainer(task)

### Evaluate model performance on the test set and dump the results

Implementation of [BinaryEvaluator](https://github.com/HealthRex/healthrex_ml/blob/main/healthrex_ml/evaluators/evaluators.py#L21) 

In [None]:
from healthrex_ml.evaluators import BinaryEvaluator
from tqdm import tqdm

# Evaluate each task in turn, with a progress bar over the task list.
for task in tqdm(tasks):
    # NOTE: the "artificats" spelling in the output path is kept as-is so the
    # output location stays unchanged.
    evalr = BinaryEvaluator(
        task_name=task,
        outdir=f"./{RUN_NAME}_artifacts/{task}_performance_artificats/",
    )
    # Per-task predictions file located in the trainer's working directory
    # (presumably written during training -- confirm against the trainer).
    df_yhats = pd.read_csv(
        os.path.join(trainer.working_dir, f"{task}_yhats.csv")
    )
    evalr(df_yhats["labels"], df_yhats["predictions"])