In [3]:
# Load IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to the project's .py files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import sys
import os
# Put the parent of the current working directory on the import path
# (presumably where the `model` package imported below lives — confirm).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
_parent_dir = os.path.join(os.getcwd(), '../')
if _parent_dir not in sys.path:
  sys.path.append(_parent_dir)

In [14]:
import pandas as pd
import warnings

import json
from numpy import random
from dataclasses import dataclass

from model.utils import report_results
from model.gradient_boosting import train_gradient_boosting

from sklearn.ensemble import HistGradientBoostingClassifier

DEFAULT_RANDOM_SEED = 774

# Seed NumPy's legacy global generator through the public entry point instead
# of reaching into the private `mtrand._rand` singleton.
random.seed(DEFAULT_RANDOM_SEED)

# Ten integer seeds in [0, 2**32 - 1]; run_tests hands them to report_results.
# `random_integers` is deprecated since NumPy 1.11. `randint` excludes its
# upper bound, hence 2**32, and the explicit 64-bit dtype keeps that bound
# valid on platforms whose default integer is 32-bit.
seed_list = random.randint(low=0, high=2**32, size=10, dtype="int64")

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and convergence warnings — consider narrowing it.
warnings.filterwarnings("ignore")

# Hyper-parameter grid passed to train_gradient_boosting when
# RunConfiguration.run_grid_search is True (see get_parameters).
search_params = {
  "learning_rate": (0.1, 0.5, 1),
  "max_features": (0.1, 0.2, 0.5, "sqrt"),
  "l2_regularization": (0, 0.5, 1),
  "max_depth": (16, 32, 64, None),
}

In [6]:
@dataclass
class RunConfiguration:
  """Settings for a single run_tests invocation."""

  # True: get_parameters runs train_gradient_boosting over `search_params`.
  # False: `default_parameters` is used as-is.
  run_grid_search: bool

  # Hyper-parameters returned by get_parameters when no grid search is run.
  default_parameters: dict

  # Forwarded as `polynomial_degree` to train_gradient_boosting and
  # report_results.
  polynomial_degree: int

In [10]:
def get_parameters(df: pd.DataFrame, run_config: RunConfiguration):
  """Return the classifier hyper-parameters to use for this run.

  Args:
    df: Data passed to train_gradient_boosting as `data` when a grid search
      is requested; the target column is "subtype".
    run_config: Run settings. When `run_grid_search` is False its
      `default_parameters` dict is returned unchanged.

  Returns:
    A dict of hyper-parameters: either `run_config.default_parameters`, or
    the values of the `search_params` keys read from the model returned by
    the grid search.
  """
  if not run_config.run_grid_search:
    return run_config.default_parameters

  grid_search_response = train_gradient_boosting(
    target="subtype",
    data=df,
    grid_search_params=search_params,
    polynomial_degree=run_config.polynomial_degree,
  )

  # Query the estimator once, then walk `search_params` in its declared order
  # so the printed and returned dict has a stable key order (intersecting two
  # key views yields a set, whose iteration order is not fixed across sessions).
  fitted_params = grid_search_response.model.get_params()
  parameters = {name: fitted_params[name] for name in search_params if name in fitted_params}
  print(parameters)
  return parameters

def _select_top_genes(pvalues: dict, genes_per_subtype: int) -> list:
  """Take the first `genes_per_subtype` entries of every innermost list and return their "gene" values, de-duplicated in first-seen order."""
  # `pvalues` is a two-level mapping whose leaves are lists of {"gene": ...}
  # records (outer level presumably subtype, inner presumably sex, judging by
  # the original variable names — confirm against the preprocessing step).
  # dict.fromkeys de-duplicates while preserving order; list(set(...)) gave an
  # order that could change between kernel sessions, and with it the column
  # order of the training frame.
  return list(dict.fromkeys(
    entry["gene"]
    for subtype_items in pvalues.values()
    for sex_values in subtype_items.values()
    for entry in sex_values[:genes_per_subtype]
  ))


def run_tests(category: str, genes_per_subtype: int, run_config: RunConfiguration):
  """Evaluate HistGradientBoostingClassifier on the selected genes and print the report.

  Args:
    category: Sub-directory of ../../preprocessed/ holding genes.csv and
      important_genes_logistic_recall.json.
    genes_per_subtype: How many leading entries to take from each gene list
      in the JSON file.
    run_config: Controls grid search, fallback hyper-parameters and the
      polynomial degree passed to report_results.

  Prints the number of distinct chosen genes and the `.report` attribute of
  the object returned by report_results. Returns None.
  """
  data = pd.read_csv(f"../../preprocessed/{category}/genes.csv").drop(columns=["sample_id"])

  # The context manager closes the file handle, which was previously left open.
  # NOTE(review): only the first line is parsed, as before — this assumes the
  # JSON document is stored on a single line; confirm.
  with open(f"../../preprocessed/{category}/important_genes_logistic_recall.json") as pvalues_file:
    pvalues = json.loads(pvalues_file.readline())

  chosen_genes = _select_top_genes(pvalues, genes_per_subtype)
  print(f"Total chosen genes: {len(chosen_genes)}")

  df = data[["subtype", "sex", *chosen_genes]]
  parameters = get_parameters(df, run_config)
  results = report_results(
    df,
    HistGradientBoostingClassifier,
    parameters,
    seed_list,
    polynomial_degree=run_config.polynomial_degree,
  )
  print(results.report)

In [None]:
# Fixed hyper-parameters (no grid search), first 12 genes of each list,
# polynomial degree 2.
# NOTE(review): the output saved with this cell shows a run of about 5.6 hours.
run_tests(
  category="min_tpm_5",
  genes_per_subtype=12,
  run_config=RunConfiguration(
    run_grid_search=False,
    default_parameters={
      'max_depth': 32,
      'learning_rate': 0.1,
      'max_features': 0.1,
      'l2_regularization': 0,
    },
    polynomial_degree=2,
  ),
)

Total chosen genes: 204


100%|██████████| 100/100 [5:36:36<00:00, 201.96s/it]  

           Metric              Overall             Male           Female
0   F1 (Weighted)      0.9007 ± 0.0222  0.8886 ± 0.0319  0.9074 ± 0.0317
1      F1 (Macro)      0.8715 ± 0.0325  0.8616 ± 0.0458  0.8782 ± 0.0459
2  Recall (Macro)      0.8616 ± 0.0330  0.8502 ± 0.0448  0.8736 ± 0.0447
3         ROC AUC      0.9903 ± 0.0054  0.9902 ± 0.0057  0.9928 ± 0.0081
4        Accuracy      0.9028 ± 0.0210  0.8961 ± 0.0293  0.9104 ± 0.0303
5        Duration  120.8709 ± 318.5661                0                0





In [15]:
# Fixed hyper-parameters (no grid search), first 5 genes of each list,
# polynomial degree 3.
# NOTE(review): the output saved with this cell ends in
# "ValueError: Target is multiclass but average='binary'". The metric call
# that raises it is not in this notebook (presumably inside report_results —
# confirm) and needs fixing there before this cell can complete.
run_tests(
  category="min_tpm_5",
  genes_per_subtype=5,
  run_config=RunConfiguration(
    run_grid_search=False,
    default_parameters={
      'max_depth': 32,
      'learning_rate': 0.1,
      'max_features': 0.1,
      'l2_regularization': 0,
    },
    polynomial_degree=3,
  ),
)

Total chosen genes: 81


  0%|          | 0/10 [05:19<?, ?it/s]


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].