In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '../'))

In [3]:
import pandas as pd
import warnings

from numpy import random
from dataclasses import dataclass

from model.utils import report_results
from model.gradient_boosting import train_gradient_boosting

DEFAULT_RANDOM_SEED = 774
random.mtrand._rand.seed(DEFAULT_RANDOM_SEED)
seed_list = random.random_integers(low=0, high=2**32 - 1, size=100)
warnings.filterwarnings("ignore")

search_params = {"learning_rate": (0.05, 0.1, 0.5, 1), "max_features": (0.05, 0.1, 0.15, 0.2, 0.5,), "min_samples_leaf": (5, 10, 15, 20), "l2_regularization": (0, 0.5, 1), "max_depth": (16, 32, 64, 128, 256)}

  seed_list = random.random_integers(low=0, high=2**32 - 1, size=100)


In [4]:
@dataclass
class RunConfiguration:
  run_grid_search: bool
  default_parameters: dict

In [None]:
from model.forest import train_forest
from utils import get_importances


def choose_important_genes(data: pd.DataFrame):
  random.mtrand._rand.seed(DEFAULT_RANDOM_SEED)

  importances_by_subtype = {}
  forest_params = {
    "n_estimators": 64,
    "max_depth": 2,
    "class_weight": "balanced"
  }

  f1s_by_subtype = {}
  tops = list(range(20, 80, 5))

  genes = set()
  for subtype in data["subtype"].unique():
    targeted_data = data.copy()
    targeted_data["subtype_target"] = targeted_data["subtype"] == subtype
    response = train_forest("subtype_target", data=targeted_data, **forest_params)

    f1s_by_subtype[subtype] = []
    targets = []
    for top in tops:
      importances_target = get_importances(response.model.feature_importances_, response.feature_names, top=top)
      current_important_data = targeted_data.copy()[["subtype_target", *importances_target.index]]

      search_params = None
      if subtype in ["iAMP21", "BCRABL1"]:
        search_params = {"n_estimators": (2, 4, 8, 16, 32, 64), "max_depth": (2, 4, 16, 64)}

      subset_response = train_forest("subtype_target", data=current_important_data, grid_search_scoring="recall", grid_search_params=search_params, **forest_params)
      f1s_by_subtype[subtype].append(subset_response.f1)
      targets.append(importances_target)

    i, _ = max(enumerate(f1s_by_subtype[subtype]), key=lambda x: x[1])

    importances_target = targets[i]
    importances_by_subtype[subtype] = importances_target
    genes |= set(importances_target.index)

  return list(genes)

def get_parameters(df: pd.DataFrame, run_config: RunConfiguration):
  if not run_config.run_grid_search:
    return run_config.default_parameters
  
  grid_search_response = train_gradient_boosting(target="subtype", data=df, grid_search_params=search_params)
  parameters = {k: grid_search_response.model.get_params()[k] for k in grid_search_response.model.get_params().keys() & search_params.keys() }
  print(parameters)
  return parameters

def run_tests(category: str, not_biased_config: RunConfiguration, biased_config: RunConfiguration):
  data = pd.read_csv(f"../../preprocessed/{category}/genes.csv").drop(columns=["sample_id"])

  chosen_genes_all = choose_important_genes(data[data["sex"] == "Male"])
  chosen_genes_male = choose_important_genes(data[data["sex"] == "Male"])
  print(f"Total chosen genes: {len(chosen_genes_all)}")
  print(f"Total chosen genes (biased): {len(chosen_genes_male)}")

  print("Reporting not biased results")
  df = data[["subtype", "sex", *chosen_genes_all]]
  report_results(df, get_parameters(df, not_biased_config), seed_list, is_biased=False)

  print("Reporting biased results")
  df = data[["subtype", "sex", *chosen_genes_male]]
  report_results(df, get_parameters(df[df["sex"] == "Male"], biased_config), seed_list, is_biased=True)

In [None]:
run_tests(
  category="min_tpm_0",
  not_biased_config=RunConfiguration(
    run_grid_search=True,
    default_parameters = { "max_depth": 64, "learning_rate": 0.1, "l2_regularization": 0, "min_samples_leaf": 20, "max_features": 0.1 }
  ),
  biased_config=RunConfiguration(
    run_grid_search=True,
    default_parameters = { "max_depth": 32, "learning_rate": 0.1, "l2_regularization": 0, "min_samples_leaf": 15, "max_features": 0.05 }
  )
)

# Total chosen genes: 249
# Total chosen genes (biased): 215
# Reporting not biased results
# {'max_features': 0.15, 'max_depth': 64, 'min_samples_leaf': 20, 'l2_regularization': 0.5, 'learning_rate': 0.1}
# 100%|██████████| 100/100 [02:59<00:00,  1.80s/it]
# F1: 0.8824 ± 0.0264
# Male F1: 0.8812 ± 0.0338
# Female F1: 0.8812 ± 0.0361
# Reporting biased results
# {'max_features': 0.2, 'max_depth': 128, 'min_samples_leaf': 10, 'l2_regularization': 1, 'learning_rate': 0.5}
# 100%|██████████| 100/100 [02:18<00:00,  1.38s/it]
# F1: 0.8209 ± 0.0166
# Male F1: 0.8312 ± 0.0404
# Female F1: 0.8169 ± 0.0179

Total chosen genes: 249
Total chosen genes (biased): 215
Reporting not biased results
{'max_features': 0.15, 'max_depth': 64, 'min_samples_leaf': 20, 'l2_regularization': 0.5, 'learning_rate': 0.1}


100%|██████████| 100/100 [02:59<00:00,  1.80s/it]


F1: 0.8824 ± 0.0264
Male F1: 0.8812 ± 0.0338
Female F1: 0.8812 ± 0.0361
Reporting biased results
{'max_features': 0.2, 'max_depth': 128, 'min_samples_leaf': 10, 'l2_regularization': 1, 'learning_rate': 0.5}


100%|██████████| 100/100 [02:18<00:00,  1.38s/it]

F1: 0.8209 ± 0.0166
Male F1: 0.8312 ± 0.0404
Female F1: 0.8169 ± 0.0179





In [None]:
run_tests(
  category="min_tpm_5",
  not_biased_config=RunConfiguration(
    run_grid_search=True,
    default_parameters = {
      "l2_regularization": 0.5,
      "learning_rate": 0.1,
      "max_depth": 256,
      "max_features": 0.2,
      "min_samples_leaf": 10
    }
  ),
  biased_config=RunConfiguration(
    run_grid_search=True,
    default_parameters = {
      "learning_rate": 0.5,
      "l2_regularization": 1.5,
      "max_features": 0.3,
      "min_samples_leaf": 3,
      "max_depth": 8
  }
  )
)

# Total chosen genes: 277
# Total chosen genes (biased): 246
# Reporting not biased results
# {'max_features': 0.1, 'max_depth': 16, 'min_samples_leaf': 20, 'l2_regularization': 1, 'learning_rate': 0.5}
# 100%|██████████| 100/100 [3:39:12<00:00, 131.52s/it] 
# F1: 0.8642 ± 0.0289
# Male F1: 0.8626 ± 0.0377
# Female F1: 0.8629 ± 0.0425
# Reporting biased results

# {'max_features': 0.2, 'max_depth': 64, 'min_samples_leaf': 15, 'l2_regularization': 1, 'learning_rate': 0.5}
# 100%|██████████| 100/100 [03:04<00:00,  1.85s/it]
# F1: 0.8438 ± 0.0160
# Male F1: 0.8456 ± 0.0392
# Female F1: 0.8422 ± 0.0164

Total chosen genes: 277
Total chosen genes (biased): 246
Reporting not biased results
{'max_features': 0.1, 'max_depth': 16, 'min_samples_leaf': 20, 'l2_regularization': 1, 'learning_rate': 0.5}


100%|██████████| 100/100 [3:39:12<00:00, 131.52s/it] 

F1: 0.8642 ± 0.0289
Male F1: 0.8626 ± 0.0377
Female F1: 0.8629 ± 0.0425
Reporting biased results





{'max_features': 0.2, 'max_depth': 64, 'min_samples_leaf': 15, 'l2_regularization': 1, 'learning_rate': 0.5}


100%|██████████| 100/100 [03:04<00:00,  1.85s/it]

F1: 0.8438 ± 0.0160
Male F1: 0.8456 ± 0.0392
Female F1: 0.8422 ± 0.0164



