In [2]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell runs, so edits to the project's .py files take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import os
import sys

# Put the parent of the current working directory on the import path
# (presumably where the `model` package imported by later cells lives).
# The path is normalised so the entry is stable and comparable.
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Re-running this cell must not keep growing sys.path with duplicate entries.
if _parent_dir not in sys.path:
  sys.path.append(_parent_dir)

In [4]:
import pandas as pd
import warnings

import json
from numpy import random
from dataclasses import dataclass

from model.utils import report_results
from model.gradient_boosting import train_gradient_boosting

from sklearn.ensemble import HistGradientBoostingClassifier

# Base seed for the notebook. `random.seed` seeds NumPy's global legacy
# generator, which is the same object the previous private
# `random.mtrand._rand.seed(...)` call reached into.
DEFAULT_RANDOM_SEED = 774
random.seed(DEFAULT_RANDOM_SEED)

# Seeds that `run_tests` hands to `report_results`.
# `randint` with an exclusive upper bound of 2**32 replaces the deprecated
# `random_integers(low=0, high=2**32 - 1)`; the explicit 64-bit dtype keeps the
# range representable on every platform.
# NOTE(review): expected to draw the same values as before where C long is
# 64-bit -- confirm if the exact seeds matter for comparing with older runs.
N_SEEDS = 100
seed_list = random.randint(low=0, high=2**32, size=N_SEEDS, dtype="int64")

# Silences every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings; consider narrowing it.
warnings.filterwarnings("ignore")

# Hyper-parameter grid passed to `train_gradient_boosting` by `get_parameters`.
search_params = {
  "learning_rate": (0.05, 0.1, 0.5, 1),
  "max_features": (0.1, 0.2, 0.5, "sqrt"),
  "l2_regularization": (0, 0.5, 1),
  "max_depth": (16, 32, 64, None),
  "class_weight": ("balanced",),
}


In [5]:
@dataclass
class RunConfiguration:
  """Options that decide how `get_parameters` obtains the model hyper-parameters."""
  # True: hyper-parameters come from a grid search over `search_params`.
  # False: `default_parameters` is returned unchanged.
  run_grid_search: bool
  # Hyper-parameter dict used as-is when `run_grid_search` is False.
  default_parameters: dict

In [6]:
def get_parameters(df: pd.DataFrame, run_config: "RunConfiguration"):
  """Return the classifier hyper-parameters to use for `df`.

  Args:
    df: Data handed to `train_gradient_boosting` when a grid search is run.
    run_config: Chooses between running the grid search and using the
      fixed defaults.

  Returns:
    `run_config.default_parameters` (the same object, not a copy) when grid
    search is disabled. Otherwise a new dict with the fitted model's value for
    every `search_params` key the model reports; this dict is also printed.
  """
  if not run_config.run_grid_search:
    return run_config.default_parameters

  grid_search_response = train_gradient_boosting(target="subtype", data=df, grid_search_params=search_params)

  # Ask the model for its parameters once instead of once per key.
  fitted_params = grid_search_response.model.get_params()

  # Walk the keys in `search_params` order so the printed and returned dict has
  # a stable key order; a set intersection yields an arbitrary order.
  parameters = {name: fitted_params[name] for name in search_params if name in fitted_params}
  print(parameters)
  return parameters

def _select_top_genes(pvalues: dict, top_n: int) -> list:
  """Collect the `"gene"` of the first `top_n` entries of every ranking, deduplicated in first-seen order."""
  # `pvalues` is walked as two nested dict levels whose leaves are lists of
  # dicts carrying a "gene" key (presumably subtype -> sex -> ranking; verify
  # against the file that produces the JSON).
  genes_in_order = (
    entry["gene"]
    for per_subtype in pvalues.values()
    for ranking in per_subtype.values()
    for entry in ranking[:top_n]
  )
  # dict.fromkeys drops duplicates but keeps insertion order, so the feature
  # column order is the same on every run (a set's order is not).
  return list(dict.fromkeys(genes_in_order))


def run_tests(category: str, method: str, metric: str, run_config: "RunConfiguration", top_n: int = 15):
  """Evaluate the classifier on the top-ranked genes for one selection method and metric.

  Reads `../../preprocessed/{category}/genes.csv` and the matching
  `important_genes_{method}_{metric}.json`, keeps the "subtype" and "sex"
  columns plus the selected genes, and prints the report from `report_results`.

  Args:
    category: Sub-directory of `../../preprocessed` to read from.
    method: Gene-selection method; part of the JSON file name.
    metric: Metric name; part of the JSON file name.
    run_config: Forwarded to `get_parameters` to pick the hyper-parameters.
    top_n: Number of leading entries taken from each ranking (default 15).
  """
  data = pd.read_csv(f"../../preprocessed/{category}/genes.csv").drop(columns=["sample_id"])

  # Close the file deterministically. Only the first line is parsed, as before.
  with open(f"../../preprocessed/{category}/important_genes_{method}_{metric}.json") as pvalues_file:
    pvalues = json.loads(pvalues_file.readline())

  chosen_genes = _select_top_genes(pvalues, top_n)
  print(f"Total chosen genes: {len(chosen_genes)}")

  df = data[["subtype", "sex", *chosen_genes]]
  parameters = get_parameters(df, run_config)
  results = report_results(df, HistGradientBoostingClassifier, parameters, seed_list)
  print(results.report)

In [9]:
# Evaluate on the genes listed in `important_genes_random_forest_recall.json`.
# Grid search is enabled, so `get_parameters` does not use `default_parameters`.
run_tests(
  category="min_tpm_5",
  method="random_forest",
  metric="recall",
  run_config=RunConfiguration(
    run_grid_search=True,
    default_parameters={'learning_rate': 0.1, 'max_depth': 32, 'max_features': 0.1, 'l2_regularization': 0.5}
  )
)

Total chosen genes: 242
{'class_weight': 'balanced', 'l2_regularization': 1, 'max_depth': 16, 'max_features': 0.1, 'learning_rate': 0.1}


100%|██████████| 100/100 [01:52<00:00,  1.13s/it]

           Metric          Overall             Male           Female
0   F1 (Weighted)  0.9059 ± 0.0228  0.9077 ± 0.0293  0.8950 ± 0.0311
1      F1 (Macro)  0.8829 ± 0.0306  0.8806 ± 0.0418  0.8664 ± 0.0431
2  Recall (Macro)  0.8762 ± 0.0317  0.8785 ± 0.0432  0.8583 ± 0.0427
3         ROC AUC  0.9921 ± 0.0041  0.9931 ± 0.0050  0.9911 ± 0.0064
4        Accuracy  0.9062 ± 0.0223  0.9091 ± 0.0279  0.8955 ± 0.0303
5        Duration  1.0009 ± 0.0284                0                0





In [10]:
# Evaluate on the genes listed in `important_genes_random_forest_f1.json`.
# Grid search is enabled, so `get_parameters` does not use `default_parameters`.
run_tests(
  category="min_tpm_5",
  method="random_forest",
  metric="f1",
  run_config=RunConfiguration(
    run_grid_search=True,
    default_parameters={'learning_rate': 0.1, 'max_depth': 32, 'max_features': 0.1, 'l2_regularization': 0.5}
  )
)

Total chosen genes: 242
{'class_weight': 'balanced', 'l2_regularization': 1, 'max_depth': 64, 'max_features': 0.2, 'learning_rate': 0.05}


100%|██████████| 100/100 [01:50<00:00,  1.10s/it]

           Metric          Overall             Male           Female
0   F1 (Weighted)  0.8959 ± 0.0232  0.9041 ± 0.0283  0.8850 ± 0.0357
1      F1 (Macro)  0.8706 ± 0.0308  0.8806 ± 0.0392  0.8541 ± 0.0475
2  Recall (Macro)  0.8660 ± 0.0313  0.8770 ± 0.0395  0.8528 ± 0.0475
3         ROC AUC  0.9909 ± 0.0044  0.9921 ± 0.0051  0.9894 ± 0.0070
4        Accuracy  0.8958 ± 0.0228  0.9091 ± 0.0266  0.8955 ± 0.0355
5        Duration  0.9670 ± 0.0673                0                0





In [8]:
# Evaluate on the genes listed in `important_genes_logistic_recall.json`.
# Grid search is enabled, so `get_parameters` does not use `default_parameters`.
run_tests(
  category="min_tpm_5",
  method="logistic",
  metric="recall",
  run_config=RunConfiguration(
    run_grid_search=True,
    default_parameters={'l2_regularization': 0, 'max_depth': 64, 'max_features': 0.1, 'learning_rate': 0.1}
  )
)

Total chosen genes: 252
{'class_weight': 'balanced', 'l2_regularization': 0.5, 'max_depth': 32, 'max_features': 0.2, 'learning_rate': 0.1}


100%|██████████| 100/100 [01:54<00:00,  1.15s/it]

           Metric          Overall             Male           Female
0   F1 (Weighted)  0.8989 ± 0.0210  0.8960 ± 0.0311  0.9049 ± 0.0302
1      F1 (Macro)  0.8718 ± 0.0301  0.8731 ± 0.0425  0.8697 ± 0.0430
2  Recall (Macro)  0.8624 ± 0.0312  0.8618 ± 0.0428  0.8643 ± 0.0420
3         ROC AUC  0.9916 ± 0.0047  0.9921 ± 0.0043  0.9927 ± 0.0076
4        Accuracy  0.9028 ± 0.0200  0.8961 ± 0.0294  0.9104 ± 0.0287
5        Duration  0.9898 ± 0.1093                0                0





In [None]:
# Evaluate on the genes listed in `important_genes_logistic_f1.json`.
# Grid search is disabled, so `default_parameters` is used exactly as given.
# NOTE(review): the saved output of this cell shows a printed parameter dict,
# but `get_parameters` only prints after a grid search. The output may come
# from an earlier run of different code -- re-run the cell to confirm.
run_tests(
  category="min_tpm_5",
  method="logistic",
  metric="f1",
  run_config=RunConfiguration(
    run_grid_search=False,
    default_parameters={'class_weight': 'balanced', 'l2_regularization': 0.5, 'max_depth': 16, 'max_features': 0.1, 'learning_rate': 0.05}
  )
)

Total chosen genes: 252
{'class_weight': 'balanced', 'l2_regularization': 0.5, 'max_depth': 16, 'max_features': 0.1, 'learning_rate': 0.05}


100%|██████████| 100/100 [02:12<00:00,  1.33s/it]

           Metric          Overall             Male           Female
0   F1 (Weighted)  0.9056 ± 0.0206  0.9050 ± 0.0303  0.9077 ± 0.0282
1      F1 (Macro)  0.8806 ± 0.0298  0.8814 ± 0.0426  0.8771 ± 0.0411
2  Recall (Macro)  0.8727 ± 0.0310  0.8729 ± 0.0433  0.8785 ± 0.0407
3         ROC AUC  0.9930 ± 0.0047  0.9933 ± 0.0038  0.9932 ± 0.0074
4        Accuracy  0.9097 ± 0.0197  0.9091 ± 0.0286  0.9104 ± 0.0267
5        Duration  1.1511 ± 0.1265                0                0



