In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the project-local `model` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
# Put the notebook's parent directory on the import path (presumably where the
# project-local `model` package lives -- confirm against the repo layout).
# The path is normalised and only added when missing, so re-running this cell
# does not pile up duplicate '<cwd>/../' entries in sys.path.
_project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if _project_root not in sys.path:
  sys.path.append(_project_root)

In [3]:
import pandas as pd
import warnings

import json
from numpy import random
from dataclasses import dataclass

from model.utils import report_results
from model.train import train_classifier

from sklearn.ensemble import HistGradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier

# Base seed for the notebook; the per-run seeds below are derived from it.
DEFAULT_RANDOM_SEED = 774

# Seed NumPy's global legacy generator through the public API rather than the
# private `random.mtrand._rand` handle.
random.seed(DEFAULT_RANDOM_SEED)

# 100 seeds drawn from [0, 2**32 - 1]. `randint` takes an exclusive upper bound
# (hence 2**32) and replaces the deprecated `random_integers`; the explicit
# 64-bit dtype is needed because the range does not fit in a 32-bit integer.
seed_list = random.randint(low=0, high=2**32, size=100, dtype="int64")

# Silences every warning for the remainder of the notebook.
warnings.filterwarnings("ignore")

  seed_list = random.random_integers(low=0, high=2**32 - 1, size=100)


In [4]:
@dataclass
class RunConfiguration:
  """Hyper-parameter strategy for a single model evaluation.

  The field order below is also the positional order of the generated
  constructor, so it must not be rearranged.
  """

  # True: tune via the grid below; False: use `default_parameters` unchanged.
  run_grid_search: bool

  # Candidate values keyed by estimator parameter name (only read when tuning).
  grid_search_params: dict

  # Parameters returned as-is when grid search is disabled.
  default_parameters: dict

In [None]:
def get_parameters(df: pd.DataFrame, model_factory, run_config: "RunConfiguration") -> dict:
  """Return the hyper-parameters to evaluate `model_factory` with.

  Args:
    df: training table handed to `train_classifier` as `data`; the target
      column is "subtype".
    model_factory: zero-argument callable that builds a fresh estimator.
    run_config: decides whether to tune or to use the stored defaults.

  Returns:
    `run_config.default_parameters` (the same object) when grid search is
    disabled. Otherwise a new dict holding the fitted model's value for each
    grid key that the model exposes via `get_params()`, ordered like
    `run_config.grid_search_params`. The tuned dict is also printed.
  """
  if not run_config.run_grid_search:
    return run_config.default_parameters

  # Project helper; presumably runs the search over the supplied grid and
  # returns the best fitted model on `.model` -- confirm in model/train.py.
  response = train_classifier(
    model_factory(),
    target="subtype",
    data=df,
    grid_search_params=run_config.grid_search_params,
  )
  fitted_params = response.model.get_params()

  # Walk the grid in its declared order instead of intersecting two key sets:
  # set iteration order for strings can change between interpreter sessions,
  # which would make the printed/returned dict order unstable.
  parameters = {
    name: fitted_params[name]
    for name in run_config.grid_search_params
    if name in fitted_params
  }
  print(parameters)
  return parameters

def run_tests(model_factory, category: str, not_biased_config: RunConfiguration):
  """Evaluate one model family on a preprocessed gene table and print the report.

  Args:
    model_factory: callable that builds the estimator under test.
    category: name of the sub-directory under ../../preprocessed/ whose
      genes.csv is loaded.
    not_biased_config: hyper-parameter strategy forwarded to `get_parameters`.
  """
  genes_path = f"../../preprocessed/{category}/genes.csv"
  # The sample_id column is dropped before the table is used for tuning/evaluation.
  data = pd.read_csv(genes_path).drop(columns=["sample_id"])

  parameters = get_parameters(data, model_factory, not_biased_config)
  # Uses the notebook-level `seed_list` defined in the setup cell.
  results = report_results(data, model_factory, parameters, seed_list)
  print(results.report)

In [9]:
# HistGradientBoosting on min_tpm_5, evaluated with fixed hyper-parameters
# (grid search is switched off, so only `default_parameters` is used).
run_tests(
  category="min_tpm_5",
  model_factory=HistGradientBoostingClassifier,
  not_biased_config=RunConfiguration(
    run_grid_search=False,
    # max_features candidates must be float fractions in (0, 1]: unlike the
    # forest estimators, HistGradientBoostingClassifier rejects string
    # shortcuts such as "sqrt", which would only yield failed fits if the
    # search were enabled.
    grid_search_params={
      "learning_rate": (0.05, 0.1, 0.5, 1),
      "max_features": (0.1, 0.2, 0.5),
      "l2_regularization": (0, 0.5, 1),
      "max_depth": (16, 32, 64, None),
    },
    default_parameters={'learning_rate': 0.1, 'max_depth': 32, 'max_features': 0.1, 'l2_regularization': 0.5}
  )
)

100%|██████████| 100/100 [1:10:25<00:00, 42.25s/it]

     Metric          Overall             Male           Female
0        F1  0.8871 ± 0.0253  0.8780 ± 0.0369  0.8867 ± 0.0322
1    Recall  0.8889 ± 0.0231  0.8831 ± 0.0330  0.8955 ± 0.0279
2   ROC AUC  0.9908 ± 0.0059  0.9907 ± 0.0060  0.9918 ± 0.0086
3  Accuracy  0.8889 ± 0.0231  0.8831 ± 0.0330  0.8955 ± 0.0279





In [None]:
# ExtraTrees on min_tpm_5: tune over the grid, then evaluate the selected parameters.
run_tests(
  model_factory=ExtraTreesClassifier,
  category="min_tpm_5",
  not_biased_config=RunConfiguration(
    run_grid_search=True,
    grid_search_params={
      "n_estimators": (8, 16, 32, 64),
      # NOTE(review): scikit-learn reads the integer 1 as "one feature per
      # split"; 1.0 would mean all features -- confirm which was intended.
      "max_features": (0.2, 0.5, 1, "sqrt"),
      "max_depth": (16, 32, 64, None),
    },
    # Not used while run_grid_search is True.
    default_parameters={'n_estimators': 32, 'max_depth': 16, 'max_features': 0.5},
  ),
)

{'n_estimators': 32, 'max_depth': 16, 'max_features': 0.5}


100%|██████████| 100/100 [08:54<00:00,  5.35s/it]

     Metric          Overall             Male           Female
0        F1  0.8569 ± 0.0242  0.8365 ± 0.0353  0.8637 ± 0.0345
1    Recall  0.8681 ± 0.0221  0.8442 ± 0.0321  0.8806 ± 0.0288
2   ROC AUC  0.9754 ± 0.0093  0.9747 ± 0.0139  0.9812 ± 0.0135
3  Accuracy  0.8681 ± 0.0221  0.8442 ± 0.0321  0.8806 ± 0.0288





In [None]:
# RandomForest on min_tpm_5: tune over the grid, then evaluate the selected parameters.
run_tests(
  model_factory=RandomForestClassifier,
  category="min_tpm_5",
  not_biased_config=RunConfiguration(
    run_grid_search=True,
    grid_search_params={
      "n_estimators": (32, 64, 128),
      "max_features": (0.2, 0.5, "sqrt"),
      "max_depth": (None, 32),
    },
    # Not used while run_grid_search is True.
    default_parameters={'n_estimators': 64, 'max_depth': None, 'max_features': 0.2},
  ),
)

{'n_estimators': 32, 'max_depth': 32, 'max_features': 0.5}


100%|██████████| 100/100 [32:45<00:00, 19.66s/it]

     Metric          Overall             Male           Female
0        F1  0.8502 ± 0.0281  0.8395 ± 0.0368  0.8476 ± 0.0337
1    Recall  0.8576 ± 0.0254  0.8571 ± 0.0327  0.8657 ± 0.0303
2   ROC AUC  0.9745 ± 0.0102  0.9717 ± 0.0142  0.9802 ± 0.0130
3  Accuracy  0.8576 ± 0.0254  0.8571 ± 0.0327  0.8657 ± 0.0303





In [56]:
# AdaBoost over an ExtraTrees base estimator on min_tpm_5. The base estimator's
# settings are hard-coded; only the boosting parameters are tuned.
run_tests(
  model_factory=lambda **kwargs: AdaBoostClassifier(
    ExtraTreesClassifier(n_estimators=32, max_depth=16, max_features=0.5),
    **kwargs
  ),
  category="min_tpm_5",
  not_biased_config=RunConfiguration(
    run_grid_search=True,
    grid_search_params={
      "n_estimators": (8, 16, 32, 64),
      "learning_rate": (0.1, 0.5, 1, 2),
    },
    # No fallback parameters: this cell always runs the grid search.
    default_parameters={},
  ),
)

{'n_estimators': 16, 'learning_rate': 0.1}


100%|██████████| 100/100 [08:31<00:00,  5.12s/it]

     Metric          Overall             Male           Female
0        F1  0.8535 ± 0.0259  0.8398 ± 0.0336  0.8631 ± 0.0356
1    Recall  0.8611 ± 0.0243  0.8442 ± 0.0317  0.8806 ± 0.0318
2   ROC AUC  0.8879 ± 0.0176  0.8808 ± 0.0227  0.8965 ± 0.0243
3  Accuracy  0.8611 ± 0.0243  0.8442 ± 0.0317  0.8806 ± 0.0318





In [57]:
# AdaBoost over a RandomForest base estimator on min_tpm_5. The base estimator's
# settings are hard-coded; only the boosting parameters are tuned.
run_tests(
  model_factory=lambda **kwargs: AdaBoostClassifier(
    RandomForestClassifier(n_estimators=32, max_depth=32, max_features=0.5),
    **kwargs
  ),
  category="min_tpm_5",
  not_biased_config=RunConfiguration(
    run_grid_search=True,
    grid_search_params={
      "n_estimators": (8, 16, 32, 64),
      "learning_rate": (0.1, 0.5, 1, 2),
    },
    # No fallback parameters: this cell always runs the grid search.
    default_parameters={},
  ),
)

{'n_estimators': 64, 'learning_rate': 2}


100%|██████████| 100/100 [36:40<00:00, 22.00s/it]

     Metric          Overall             Male           Female
0        F1  0.8392 ± 0.0226  0.8294 ± 0.0322  0.8450 ± 0.0316
1    Recall  0.8472 ± 0.0211  0.8442 ± 0.0289  0.8657 ± 0.0279
2   ROC AUC  0.8822 ± 0.0165  0.8750 ± 0.0228  0.8850 ± 0.0236
3  Accuracy  0.8472 ± 0.0211  0.8442 ± 0.0289  0.8657 ± 0.0279



