In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
import os
# Make the directory above the working directory importable (presumably where
# the project-local `model` package lives -- confirm). The membership check
# keeps the cell idempotent: re-running it no longer appends duplicates.
parent_dir = os.path.join(os.getcwd(), '../')
if parent_dir not in sys.path:
  sys.path.append(parent_dir)

In [4]:
import pandas as pd
import warnings

import json
from numpy import random
from dataclasses import dataclass

from model.utils import report_results
from model.gradient_boosting import train_gradient_boosting

from sklearn.ensemble import HistGradientBoostingClassifier

DEFAULT_RANDOM_SEED = 774

# Seed NumPy's legacy global RandomState through the public API instead of
# reaching into the private `random.mtrand._rand` object.
random.seed(DEFAULT_RANDOM_SEED)

# 100 seeds drawn uniformly from [0, 2**32 - 1].
# `random_integers` is deprecated (and removed in NumPy 2.0); it was a thin
# wrapper around `randint(low, high + 1)`, so the exclusive upper bound here is
# 2**32. The explicit 64-bit dtype is required for a bound above the int32 range.
seed_list = random.randint(low=0, high=2**32, size=100, dtype="int64")

# NOTE(review): this silences every warning for the rest of the session,
# including ones that could point at real problems -- consider narrowing it.
warnings.filterwarnings("ignore")

# Hyper-parameter grid handed to `train_gradient_boosting` when a grid search runs.
# NOTE(review): scikit-learn documents `max_features` of
# HistGradientBoostingClassifier as a float proportion; confirm that "sqrt"
# is handled downstream.
search_params = {
  "learning_rate": (0.05, 0.1, 0.5, 1),
  "max_features": (0.05, 0.1, 0.15, 0.2, 0.5, "sqrt"),
  "l2_regularization": (0, 0.5, 1),
  "max_depth": (16, 32, 64, None),
}

  seed_list = random.random_integers(low=0, high=2**32 - 1, size=100)


In [5]:
@dataclass
class RunConfiguration:
  """Settings that control how a single `run_tests` experiment is executed."""
  # When False, `get_parameters` skips the grid search and returns
  # `default_parameters` unchanged.
  run_grid_search: bool
  # Model parameters used as-is when the grid search is disabled.
  default_parameters: dict
  # Forwarded as `polynomial_degree` to `train_gradient_boosting` and
  # `report_results`.
  polynomial_degree: int

In [6]:
def get_parameters(df: pd.DataFrame, run_config: RunConfiguration):
  """Return the model parameters to use for a run.

  When `run_config.run_grid_search` is False, `run_config.default_parameters`
  is returned unchanged. Otherwise `train_gradient_boosting` is called on `df`
  (target column "subtype") with the module-level `search_params` grid, and the
  entries of the resulting model's `get_params()` whose keys appear in
  `search_params` are printed and returned.

  Args:
    df: Data passed to `train_gradient_boosting` as `data`.
    run_config: Run settings (grid-search switch, defaults, polynomial degree).

  Returns:
    dict mapping parameter names to the values to use.
  """
  if not run_config.run_grid_search:
    return run_config.default_parameters

  grid_search_response = train_gradient_boosting(
    target="subtype",
    data=df,
    grid_search_params=search_params,
    polynomial_degree=run_config.polynomial_degree,
  )

  # Fetch the fitted parameters once, and walk `search_params` in its own order
  # so the printed/returned dict has a stable key order (a set intersection
  # yields an arbitrary, run-dependent order).
  model_params = grid_search_response.model.get_params()
  parameters = {name: model_params[name] for name in search_params if name in model_params}
  print(parameters)
  return parameters

def run_tests(category: str, pvalues: str, select_genes, run_config: RunConfiguration):
  """Train and report a gradient-boosting run on a selected subset of genes.

  Reads `../../preprocessed/{category}/genes.csv` (dropping the "sample_id"
  column) and the first line of
  `../../preprocessed/{category}/important_genes_{pvalues}_pvalue.json`,
  lets `select_genes` choose gene columns from the parsed JSON, and prints the
  `.report` of `report_results` for the frame restricted to "subtype", "sex"
  and the chosen genes.

  Args:
    category: Sub-directory of `../../preprocessed` to read from.
    pvalues: Name fragment selecting which `important_genes_*_pvalue.json` to load.
    select_genes: Callable taking the parsed JSON object and returning the
      gene column names to keep.
    run_config: Run settings forwarded to `get_parameters` and `report_results`.
  """
  data = pd.read_csv(f"../../preprocessed/{category}/genes.csv").drop(columns=["sample_id"])

  # Use a context manager so the file handle is closed deterministically, and a
  # separate name so the `pvalues` string argument is not rebound to a dict.
  pvalues_path = f"../../preprocessed/{category}/important_genes_{pvalues}_pvalue.json"
  with open(pvalues_path) as pvalues_file:
    # Only the first line is parsed, as before.
    gene_pvalues = json.loads(pvalues_file.readline())

  chosen_genes_all = select_genes(gene_pvalues)
  print(f"Total chosen genes: {len(chosen_genes_all)}")

  df = data[["subtype", "sex", *chosen_genes_all]]
  parameters = get_parameters(df, run_config)
  results = report_results(
    df,
    HistGradientBoostingClassifier,
    parameters,
    seed_list,
    polynomial_degree=run_config.polynomial_degree,
  )
  print(results.report)

# TTest 25 genes per sex-subtype

In [7]:
def _top_genes_per_group(pvalues, top_n=5):
  """Collect the first `top_n` genes of every innermost list, without duplicates.

  `pvalues` is walked as a two-level mapping whose innermost values are lists
  of dicts carrying a "gene" key (presumably subtype -> sex -> ranked genes;
  confirm against the JSON producer). Duplicates are dropped while keeping
  first-seen order, so the resulting column order is identical on every run;
  `list(set(...))` would make it depend on string-hash randomisation.

  NOTE(review): the section heading mentions 25 genes per sex-subtype while
  the slice keeps 5 -- confirm which is intended.
  """
  genes = [
    entry["gene"]
    for subtype_items in pvalues.values()
    for sex_values in subtype_items.values()
    for entry in sex_values[:top_n]
  ]
  return list(dict.fromkeys(genes))

run_tests(
  category="min_tpm_5",
  pvalues="ttest",
  select_genes=_top_genes_per_group,
  run_config=RunConfiguration(
    run_grid_search=True,
    polynomial_degree=2,
    default_parameters={
      "l2_regularization": 0,
      "learning_rate": 0.1,
      "max_depth": 32,
      "max_features": 0.2,
      "min_samples_leaf": 15
    }
  )
)

Total chosen genes: 80


KeyboardInterrupt: 