In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
# Append the parent of the current working directory to the import path.
# Presumably this is what lets the `model.*` imports in the next cell resolve — TODO confirm.
# NOTE(review): this depends on the kernel's working directory at launch time.
sys.path.append(os.path.join(os.getcwd(), '../'))

In [3]:
import pandas as pd
import warnings

import json
from numpy import random
from dataclasses import dataclass

from model.utils import report_results
from model.gradient_boosting import train_gradient_boosting

from sklearn.ensemble import HistGradientBoostingClassifier

# Master seed for the notebook; every per-run seed below is derived from it.
DEFAULT_RANDOM_SEED = 774

# Seed NumPy's global legacy generator through the public API instead of
# reaching into the private `random.mtrand._rand` object.
random.seed(DEFAULT_RANDOM_SEED)

# One seed per evaluation run, drawn from the closed range [0, 2**32 - 1].
# `randint` excludes its upper bound, hence `high=2**32`; it replaces the
# deprecated `random_integers`. The dtype is pinned to 64 bits so the bound is
# representable on every platform.
# NOTE(review): intended to reproduce the values the previous call produced on
# platforms where C long is 64-bit — confirm before comparing with old runs.
seed_list = random.randint(low=0, high=2**32, size=100, dtype="int64")

# NOTE(review): this silences every warning raised from here on, including
# any emitted during the grid search; consider narrowing the filter.
warnings.filterwarnings("ignore")

# Hyper-parameter grid handed to `train_gradient_boosting` by `get_parameters`.
# NOTE(review): scikit-learn documents `max_features` of
# HistGradientBoostingClassifier as a float proportion; confirm that "sqrt" is
# accepted by the estimator used inside `train_gradient_boosting`.
search_params = {
  "learning_rate": (0.05, 0.1, 0.5, 1),
  "max_features": (0.1, 0.2, 0.5, "sqrt"),
  "l2_regularization": (0, 0.5, 1),
  "max_depth": (16, 32, 64, None),
}

  seed_list = random.random_integers(low=0, high=2**32 - 1, size=100)


In [4]:
@dataclass
class RunConfiguration:
  """Options that decide where the hyper-parameters of one run come from."""

  # True: hyper-parameters are obtained from a grid search.
  # False: `default_parameters` is used unchanged.
  run_grid_search: bool

  # Hyper-parameters used when the grid search is skipped.
  default_parameters: dict

In [13]:
def get_parameters(df: pd.DataFrame, run_config: RunConfiguration):
  """Return the hyper-parameters to evaluate for `df`.

  When `run_config.run_grid_search` is falsy, the configured
  `default_parameters` dict is returned as-is (the same object, not a copy).
  Otherwise `train_gradient_boosting` is run on `df` with target "subtype" and
  the module-level `search_params` grid; the values of the resulting model's
  parameters that also appear in `search_params` are printed and returned.
  """
  if run_config.run_grid_search:
    grid_search_response = train_gradient_boosting(target="subtype", data=df, grid_search_params=search_params)
    # Keep only the parameters that were part of the search grid.
    fitted_params = grid_search_response.model.get_params()
    tuned_names = fitted_params.keys() & search_params.keys()
    parameters = {name: fitted_params[name] for name in tuned_names}
    print(parameters)
    return parameters

  return run_config.default_parameters

def _collect_top_genes(pvalues: dict, top_n: int) -> list:
  """Return the sorted, de-duplicated genes among the first `top_n` entries of every ranking.

  `pvalues` is walked as a two-level mapping whose innermost values are
  sequences of items carrying a "gene" key (presumably subtype -> sex ->
  ranked genes, going by the file name and loop variable names — TODO confirm).
  Sorting gives a stable order that does not depend on string hashing.
  """
  return sorted({
    entry["gene"]
    for subtype_items in pvalues.values()
    for sex_values in subtype_items.values()
    for entry in sex_values[:top_n]
  })


def run_tests(category: str, method: str, metric: str, run_config: RunConfiguration, top_n: int = 15):
  """Evaluate HistGradientBoostingClassifier on the top-ranked genes of one ranking file.

  Args:
    category: sub-directory of ../../preprocessed holding genes.csv and the ranking file.
    method: ranking method; part of the ranking file name.
    metric: ranking metric; part of the ranking file name.
    run_config: decides whether hyper-parameters come from a grid search or from defaults.
    top_n: number of leading entries taken from each ranking (default 15, the
      value that was previously hard-coded).

  Prints the number of selected genes and the `.report` attribute of the
  object returned by `report_results`. Returns None.
  """
  data = pd.read_csv(f"../../preprocessed/{category}/genes.csv").drop(columns=["sample_id"])

  # Only the first line of the file is parsed, as before; the context manager
  # makes sure the handle is closed.
  with open(f"../../preprocessed/{category}/important_genes_{method}_{metric}.json") as ranking_file:
    pvalues = json.loads(ranking_file.readline())

  # Sorted so the column order of `df` is the same in every kernel session.
  chosen_genes = _collect_top_genes(pvalues, top_n)
  print(f"Total chosen genes: {len(chosen_genes)}")

  df = data[["subtype", "sex", *chosen_genes]]
  print(report_results(df, HistGradientBoostingClassifier, get_parameters(df, run_config), seed_list).report)

In [14]:
# Experiment: category "min_tpm_5", gene ranking read from
# important_genes_random_forest_recall.json.
# run_grid_search=True, so get_parameters takes the grid-search path and
# `default_parameters` is not used for this run.
run_tests(
  category="min_tpm_5",
  method="random_forest",
  metric="recall",
  run_config=RunConfiguration(
    run_grid_search=True,
    default_parameters={'learning_rate': 0.1, 'max_depth': 32, 'max_features': 0.1, 'l2_regularization': 0.5}
  )
)

Total chosen genes: 242
{'l2_regularization': 1, 'max_depth': None, 'max_features': 0.1, 'learning_rate': 0.05}


100%|██████████| 100/100 [09:29<00:00,  5.69s/it]

           Metric          Overall             Male           Female
0   F1 (Weighted)  0.8949 ± 0.0237  0.8990 ± 0.0278  0.8864 ± 0.0348
1      F1 (Macro)  0.8698 ± 0.0315  0.8733 ± 0.0389  0.8510 ± 0.0481
2  Recall (Macro)  0.8548 ± 0.0318  0.8599 ± 0.0399  0.8379 ± 0.0465
3         ROC AUC  0.9919 ± 0.0041  0.9932 ± 0.0048  0.9910 ± 0.0065
4        Accuracy  0.8993 ± 0.0226  0.9091 ± 0.0260  0.8955 ± 0.0327
5        Duration  4.5552 ± 0.7727                0                0





In [15]:
# Experiment: category "min_tpm_5", gene ranking read from
# important_genes_random_forest_f1.json.
# run_grid_search=True, so get_parameters takes the grid-search path and
# `default_parameters` is not used for this run.
run_tests(
  category="min_tpm_5",
  method="random_forest",
  metric="f1",
  run_config=RunConfiguration(
    run_grid_search=True,
    default_parameters={'learning_rate': 0.1, 'max_depth': 32, 'max_features': 0.1, 'l2_regularization': 0.5}
  )
)

Total chosen genes: 242
{'l2_regularization': 0, 'max_depth': 32, 'max_features': 0.2, 'learning_rate': 0.1}


100%|██████████| 100/100 [13:01<00:00,  7.81s/it]

           Metric          Overall             Male           Female
0   F1 (Weighted)  0.9000 ± 0.0222  0.8980 ± 0.0281  0.8945 ± 0.0323
1      F1 (Macro)  0.8721 ± 0.0290  0.8764 ± 0.0436  0.8631 ± 0.0429
2  Recall (Macro)  0.8634 ± 0.0300  0.8625 ± 0.0429  0.8494 ± 0.0436
3         ROC AUC  0.9909 ± 0.0042  0.9920 ± 0.0059  0.9902 ± 0.0066
4        Accuracy  0.9028 ± 0.0215  0.9091 ± 0.0259  0.8955 ± 0.0312
5        Duration  7.1250 ± 0.3398                0                0





In [16]:
# Experiment: category "min_tpm_5", gene ranking read from
# important_genes_logistic_recall.json.
# run_grid_search=True, so get_parameters takes the grid-search path and
# `default_parameters` is not used for this run.
run_tests(
  category="min_tpm_5",
  method="logistic",
  metric="recall",
  run_config=RunConfiguration(
    run_grid_search=True,
    default_parameters={'learning_rate': 0.1, 'max_depth': 32, 'max_features': 0.1, 'l2_regularization': 0.5}
  )
)

Total chosen genes: 252
{'l2_regularization': 0, 'max_depth': 64, 'max_features': 0.1, 'learning_rate': 0.1}


100%|██████████| 100/100 [03:37<00:00,  2.17s/it]

           Metric          Overall             Male           Female
0   F1 (Weighted)  0.9064 ± 0.0231  0.8990 ± 0.0328  0.9073 ± 0.0286
1      F1 (Macro)  0.8790 ± 0.0339  0.8741 ± 0.0484  0.8778 ± 0.0412
2  Recall (Macro)  0.8642 ± 0.0337  0.8656 ± 0.0470  0.8694 ± 0.0393
3         ROC AUC  0.9918 ± 0.0048  0.9920 ± 0.0044  0.9932 ± 0.0073
4        Accuracy  0.9097 ± 0.0212  0.9091 ± 0.0301  0.9104 ± 0.0262
5        Duration  2.0205 ± 0.1149                0                0





In [None]:
# Experiment: category "min_tpm_5", gene ranking read from
# important_genes_logistic_f1.json.
# run_grid_search=False, so the grid search is skipped and the
# `default_parameters` below are used directly.
# NOTE(review): this cell has no execution count, and its recorded output shows
# a printed parameter dict, which get_parameters only prints on the grid-search
# path. The output looks stale relative to the code — re-run to refresh it.
run_tests(
  category="min_tpm_5",
  method="logistic",
  metric="f1",
  run_config=RunConfiguration(
    run_grid_search=False,
    default_parameters={'l2_regularization': 0, 'max_depth': None, 'max_features': 0.2, 'learning_rate': 0.1}
  )
)

Total chosen genes: 252
{'l2_regularization': 0, 'max_depth': None, 'max_features': 0.2, 'learning_rate': 0.1}


100%|██████████| 100/100 [03:32<00:00,  2.12s/it]

           Metric          Overall             Male           Female
0   F1 (Weighted)  0.9009 ± 0.0222  0.9005 ± 0.0347  0.8991 ± 0.0270
1      F1 (Macro)  0.8709 ± 0.0305  0.8718 ± 0.0490  0.8639 ± 0.0386
2  Recall (Macro)  0.8608 ± 0.0314  0.8595 ± 0.0472  0.8605 ± 0.0382
3         ROC AUC  0.9910 ± 0.0053  0.9923 ± 0.0052  0.9923 ± 0.0079
4        Accuracy  0.9028 ± 0.0209  0.9091 ± 0.0318  0.9104 ± 0.0252
5        Duration  1.9915 ± 0.0695                0                0



