In [24]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
import sys
import os
# Put the parent of the current working directory on the import path.
# Presumably this is where the project-local `model` package imported in the
# next cell lives -- TODO confirm. Note this depends on the kernel's cwd.
sys.path.append(os.path.join(os.getcwd(), '../'))

In [26]:
import pandas as pd
import warnings

import json
from numpy import random
from dataclasses import dataclass

from model.utils import report_results
from model.gradient_boosting import train_gradient_boosting

# Seed for NumPy's global (legacy) RandomState; every entry of `seed_list`
# below is derived from it.
DEFAULT_RANDOM_SEED = 774

# `random.seed` is the public alias of the global RandomState's `seed` method,
# so this seeds the same generator without reaching into `mtrand._rand`.
random.seed(DEFAULT_RANDOM_SEED)

# 100 seeds drawn uniformly from [0, 2**32 - 1].
# `random_integers(low, high)` is deprecated; it is defined as
# `randint(low, high + 1)` on the platform C long, so this call yields the same
# stream. The explicit 64-bit dtype keeps the range valid where C long is 32-bit.
seed_list = random.randint(low=0, high=2**32, size=100, dtype="int64")

# NOTE(review): this silences *all* warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Hyper-parameter grid handed to `train_gradient_boosting` by `get_parameters`.
search_params = {
    "learning_rate": (0.05, 0.1, 0.5, 1),
    "max_features": (0.05, 0.1, 0.15, 0.2, 0.5),
    "min_samples_leaf": (5, 10, 15, 20),
    "l2_regularization": (0, 0.5, 1),
    "max_depth": (16, 32, 64, 128, 256),
}

In [27]:
@dataclass
class RunConfiguration:
    """Settings that decide how hyper-parameters are obtained for one run.

    `get_parameters` returns `default_parameters` unchanged when
    `run_grid_search` is false; otherwise it runs a grid search and the
    defaults are not consulted.
    """

    # True -> tune hyper-parameters via grid search before reporting.
    run_grid_search: bool
    # Hyper-parameters used verbatim when no grid search is requested.
    default_parameters: dict

In [28]:
def get_parameters(df: pd.DataFrame, run_config: "RunConfiguration"):
    """Return the hyper-parameters to use for a run.

    Args:
        df: Data passed to the grid search; unused when no search is requested.
        run_config: Run settings. When `run_grid_search` is false its
            `default_parameters` dict is returned as-is (the same object).

    Returns:
        Either `run_config.default_parameters`, or a new dict holding the value
        of every key of the module-level `search_params` grid that the fitted
        model reports through `get_params()`. Keys follow the order of
        `search_params`, so the printed dict is stable between sessions.
    """
    if not run_config.run_grid_search:
        return run_config.default_parameters

    grid_search_response = train_gradient_boosting(
        target="subtype",
        data=df,
        grid_search_params=search_params,
    )
    # Query the estimator once, then keep only the parameters that were tuned.
    fitted_params = grid_search_response.model.get_params()
    parameters = {name: fitted_params[name] for name in search_params if name in fitted_params}
    print(parameters)
    return parameters

def _top_genes(ranked_entries, limit):
    """Return the set of gene names found in the first `limit` entries."""
    return {entry["gene"] for entry in ranked_entries[:limit]}


def run_tests(category: str, not_biased_config: "RunConfiguration", biased_config: "RunConfiguration",
              top_n_per_sex: int = 25, top_n_male: int = 50):
    """Report results for the unbiased and the male-only gene selections.

    Reads `../../preprocessed/{category}/genes.csv` (dropping `sample_id`) and
    `../../preprocessed/{category}/important_genes_pvalue.json`. The JSON is
    read as a mapping subtype -> sex -> list of entries carrying a "gene" key
    (presumably ordered by significance -- TODO confirm).

    Two feature sets are built and handed to `report_results` together with
    the `subtype` and `sex` columns:

    * unbiased: the first `top_n_per_sex` genes of every sex of every subtype;
    * biased: the first `top_n_male` genes of the "Male" list of every subtype.
      Its hyper-parameters are obtained from the male rows only, while
      `report_results` still receives all rows.

    Gene names are sorted so the column order -- and therefore any seeded
    feature subsampling -- is identical across kernel sessions.

    Args:
        category: Name of the preprocessed sub-directory to read.
        not_biased_config: Hyper-parameter settings for the unbiased run.
        biased_config: Hyper-parameter settings for the biased run.
        top_n_per_sex: Entries taken per (subtype, sex) list for the unbiased set.
        top_n_male: Entries taken per subtype "Male" list for the biased set.

    Raises:
        KeyError: If a subtype has no "Male" list or an entry has no "gene".
    """
    base_dir = f"../../preprocessed/{category}"
    data = pd.read_csv(f"{base_dir}/genes.csv").drop(columns=["sample_id"])
    # Parse the whole document (not just its first line) and close the handle.
    with open(f"{base_dir}/important_genes_pvalue.json") as pvalue_file:
        pvalues = json.load(pvalue_file)

    genes_all, genes_male = set(), set()
    for per_sex in pvalues.values():
        for ranked_entries in per_sex.values():
            genes_all |= _top_genes(ranked_entries, top_n_per_sex)
        genes_male |= _top_genes(per_sex["Male"], top_n_male)
    # Sets of strings iterate in a hash-randomised order; sort for reproducibility.
    chosen_genes_all = sorted(genes_all)
    chosen_genes_male = sorted(genes_male)
    print(f"Total chosen genes: {len(chosen_genes_all)}")
    print(f"Total chosen genes (biased): {len(chosen_genes_male)}")

    print("Reporting not biased results")
    unbiased_frame = data[["subtype", "sex", *chosen_genes_all]]
    unbiased_params = get_parameters(unbiased_frame, not_biased_config)
    report_results(unbiased_frame, unbiased_params, seed_list, is_biased=False)

    print("Reporting biased results")
    biased_frame = data[["subtype", "sex", *chosen_genes_male]]
    male_rows = biased_frame[biased_frame["sex"] == "Male"]
    biased_params = get_parameters(male_rows, biased_config)
    report_results(biased_frame, biased_params, seed_list, is_biased=True)

In [None]:
# Category "min_tpm_0".
# `get_parameters` uses `default_parameters` only when `run_grid_search` is
# False: the unbiased run takes its defaults verbatim, while the biased run
# performs a grid search and its defaults are not consulted.
run_tests(
    category="min_tpm_0",
    not_biased_config=RunConfiguration(
        run_grid_search=False,
        default_parameters={
            "max_depth": 64,
            "learning_rate": 0.1,
            "l2_regularization": 0,
            "min_samples_leaf": 20,
            "max_features": 0.1,
        },
    ),
    biased_config=RunConfiguration(
        run_grid_search=True,
        default_parameters={
            "max_depth": 32,
            "learning_rate": 0.1,
            "l2_regularization": 0,
            "min_samples_leaf": 15,
            "max_features": 0.05,
        },
    ),
)

# Total chosen genes: 402
# Total chosen genes (biased): 455
# Reporting not biased results
# 100%|██████████| 100/100 [09:07<00:00,  5.48s/it]
# F1: 0.8735 ± 0.0256
# Male F1: 0.8517 ± 0.0372
# Female F1: 0.8952 ± 0.0369
# Reporting biased results

# Best params: {'categorical_features': 'from_dtype', 'class_weight': None, 'early_stopping': 'auto', 'interaction_cst': None, 'l2_regularization': 0, 'learning_rate': 0.1, 'loss': 'log_loss', 'max_bins': 255, 'max_depth': 64, 'max_features': 0.05, 'max_iter': 100, 'max_leaf_nodes': 31, 'min_samples_leaf': 5, 'monotonic_cst': None, 'n_iter_no_change': 10, 'random_state': None, 'scoring': 'loss', 'tol': 1e-07, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
# {'max_depth': 64, 'learning_rate': 0.1, 'l2_regularization': 0, 'min_samples_leaf': 5, 'max_features': 0.05}
# 100%|██████████| 100/100 [3:59:06<00:00, 143.47s/it]  
# F1: 0.8256 ± 0.0149
# Male F1: 0.8505 ± 0.0405
# Female F1: 0.8204 ± 0.0144

Total chosen genes: 402
Total chosen genes (biased): 455
Reporting not biased results


100%|██████████| 100/100 [09:07<00:00,  5.48s/it]

F1: 0.8735 ± 0.0256
Male F1: 0.8517 ± 0.0372
Female F1: 0.8952 ± 0.0369
Reporting biased results





Best params: {'categorical_features': 'from_dtype', 'class_weight': None, 'early_stopping': 'auto', 'interaction_cst': None, 'l2_regularization': 0, 'learning_rate': 0.1, 'loss': 'log_loss', 'max_bins': 255, 'max_depth': 64, 'max_features': 0.05, 'max_iter': 100, 'max_leaf_nodes': 31, 'min_samples_leaf': 5, 'monotonic_cst': None, 'n_iter_no_change': 10, 'random_state': None, 'scoring': 'loss', 'tol': 1e-07, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
{'max_depth': 64, 'learning_rate': 0.1, 'l2_regularization': 0, 'min_samples_leaf': 5, 'max_features': 0.05}


100%|██████████| 100/100 [3:59:06<00:00, 143.47s/it]  


F1: 0.8256 ± 0.0149
Male F1: 0.8505 ± 0.0405
Female F1: 0.8204 ± 0.0144


In [None]:
# Category "min_tpm_5".
# Both runs request a grid search, so `get_parameters` never reads either
# `default_parameters` dict here; they only matter if `run_grid_search` is
# switched to False.
run_tests(
    category="min_tpm_5",
    not_biased_config=RunConfiguration(
        run_grid_search=True,
        default_parameters={
            "l2_regularization": 0.5,
            "learning_rate": 0.1,
            "max_depth": 256,
            "max_features": 0.2,
            "min_samples_leaf": 10,
        },
    ),
    biased_config=RunConfiguration(
        run_grid_search=True,
        default_parameters={
            "learning_rate": 0.5,
            "l2_regularization": 1.5,
            "max_features": 0.3,
            "min_samples_leaf": 3,
            "max_depth": 8,
        },
    ),
)

# Total chosen genes: 394
# Total chosen genes (biased): 499
# Reporting not biased results
# {'max_depth': 32, 'learning_rate': 0.1, 'l2_regularization': 0, 'min_samples_leaf': 15, 'max_features': 0.2}
# 100%|██████████| 100/100 [1:30:43<00:00, 54.44s/it]   
# F1: 0.8918 ± 0.0241
# Male F1: 0.8816 ± 0.0328
# Female F1: 0.8969 ± 0.0327
# Reporting biased results

# {'max_depth': 32, 'learning_rate': 0.1, 'l2_regularization': 0.5, 'min_samples_leaf': 5, 'max_features': 0.15}
# 100%|██████████| 100/100 [03:51<00:00,  2.31s/it]
# F1: 0.8466 ± 0.0137
# Male F1: 0.8662 ± 0.0363
# Female F1: 0.8404 ± 0.0133

Total chosen genes: 394
Total chosen genes (biased): 499
Reporting not biased results
{'max_depth': 32, 'learning_rate': 0.1, 'l2_regularization': 0, 'min_samples_leaf': 15, 'max_features': 0.2}


100%|██████████| 100/100 [1:30:43<00:00, 54.44s/it]   

F1: 0.8918 ± 0.0241
Male F1: 0.8816 ± 0.0328
Female F1: 0.8969 ± 0.0327
Reporting biased results





{'max_depth': 32, 'learning_rate': 0.1, 'l2_regularization': 0.5, 'min_samples_leaf': 5, 'max_features': 0.15}


100%|██████████| 100/100 [03:51<00:00,  2.31s/it]

F1: 0.8466 ± 0.0137
Male F1: 0.8662 ± 0.0363
Female F1: 0.8404 ± 0.0133



