In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '../'))

In [4]:
import pandas as pd
import warnings

import json
from numpy import random
from dataclasses import dataclass

from model.utils import report_results
from model.gradient_boosting import train_gradient_boosting

from sklearn.ensemble import HistGradientBoostingClassifier

DEFAULT_RANDOM_SEED = 774
random.mtrand._rand.seed(DEFAULT_RANDOM_SEED)
seed_list = random.random_integers(low=0, high=2**32 - 1, size=100)
warnings.filterwarnings("ignore")

search_params = {"learning_rate": (0.05, 0.1, 0.5, 1), "max_features": (0.05, 0.1, 0.15, 0.2, 0.5, "sqrt"), "l2_regularization": (0, 0.5, 1), "max_depth": (16, 32, 64, None)}

  seed_list = random.random_integers(low=0, high=2**32 - 1, size=100)


In [5]:
@dataclass
class RunConfiguration:
  run_grid_search: bool
  default_parameters: dict

In [None]:
def get_parameters(df: pd.DataFrame, run_config: RunConfiguration):
  if not run_config.run_grid_search:
    return run_config.default_parameters
  
  grid_search_response = train_gradient_boosting(target="subtype", data=df, grid_search_params=search_params)
  parameters = {k: grid_search_response.model.get_params()[k] for k in grid_search_response.model.get_params().keys() & search_params.keys() }
  print(parameters)
  return parameters

def run_tests(category: str, pvalues: str, select_genes, not_biased_config: RunConfiguration, biased_config: RunConfiguration = None):
  data = pd.read_csv(f"../../preprocessed/{category}/genes.csv").drop(columns=["sample_id"])
  pvalues = json.loads(open(f"../../preprocessed/{category}/important_genes_{pvalues}_pvalue.json").readline())

  chosen_genes_all = select_genes(pvalues)
  print(f"Total chosen genes: {len(chosen_genes_all)}")

  print("Reporting not biased results")
  df = data[["subtype", "sex", *chosen_genes_all]]
  print(report_results(df, HistGradientBoostingClassifier, get_parameters(df, not_biased_config), seed_list).report)

  if biased_config:
    print("Reporting biased results")
    chosen_genes_male = list(set([y["gene"] for x in [subtype_items["Male"][:50] for subtype_items in pvalues.values()] for y in x]))
    df = data[["subtype", "sex", *chosen_genes_male]]
    print(report_results(df, HistGradientBoostingClassifier, get_parameters(df, biased_config), seed_list).report)

    chosen_genes_female = list(set([y["gene"] for x in [subtype_items["Female"][:50] for subtype_items in pvalues.values()] for y in x]))
    df = data[["subtype", "sex", *chosen_genes_female]]
    print(report_results(df, HistGradientBoostingClassifier, get_parameters(df, biased_config), seed_list).report)

# TTest 25 genes per sex-subtype

In [None]:
run_tests(
  category="min_tpm_5",
  pvalues="ttest",
  select_genes=lambda pvalues: list(set([y["gene"] for x in [sex_values[:25] for subtype_items in pvalues.values() for sex_values in subtype_items.values()] for y in x])),
  not_biased_config=RunConfiguration(
    run_grid_search=True,
    default_parameters={'learning_rate': 0.1, 'max_features': 0.1, 'l2_regularization': 0, 'max_depth': None}
  ),
  biased_config=RunConfiguration(
    run_grid_search=True,
    default_parameters={'learning_rate': 0.1, 'max_features': 0.05, 'l2_regularization': 0, 'max_depth': 32}
  )
)

Total chosen genes: 375
Reporting not biased results
{'learning_rate': 0.1, 'max_features': 0.1, 'l2_regularization': 0, 'max_depth': None}


100%|██████████| 100/100 [08:30<00:00,  5.10s/it]

     Metric          Overall             Male           Female
0        F1  0.8761 ± 0.0267  0.8518 ± 0.0374  0.8994 ± 0.0368
1    Recall  0.8819 ± 0.0244  0.8571 ± 0.0343  0.9104 ± 0.0320
2   ROC AUC  0.9873 ± 0.0061  0.9865 ± 0.0072  0.9903 ± 0.0090
3  Accuracy  0.8819 ± 0.0244  0.8571 ± 0.0343  0.9104 ± 0.0320
Reporting biased results





{'learning_rate': 0.1, 'max_features': 0.05, 'l2_regularization': 0, 'max_depth': 32}


100%|██████████| 100/100 [06:57<00:00,  4.17s/it]

     Metric          Overall             Male           Female
0        F1  0.8795 ± 0.0281  0.8695 ± 0.0405  0.8854 ± 0.0314
1    Recall  0.8889 ± 0.0258  0.8831 ± 0.0369  0.8955 ± 0.0275
2   ROC AUC  0.9872 ± 0.0063  0.9889 ± 0.0063  0.9881 ± 0.0102
3  Accuracy  0.8889 ± 0.0258  0.8831 ± 0.0369  0.8955 ± 0.0275





{'learning_rate': 0.1, 'max_features': 0.1, 'l2_regularization': 0.5, 'max_depth': 16}


100%|██████████| 100/100 [20:05<00:00, 12.06s/it]  

     Metric          Overall             Male           Female
0        F1  0.8524 ± 0.0267  0.8272 ± 0.0386  0.8809 ± 0.0351
1    Recall  0.8611 ± 0.0249  0.8442 ± 0.0360  0.8955 ± 0.0305
2   ROC AUC  0.9847 ± 0.0063  0.9804 ± 0.0074  0.9890 ± 0.0087
3  Accuracy  0.8611 ± 0.0249  0.8442 ± 0.0360  0.8955 ± 0.0305





## TTest overall (without grouping by sex), 45 genes per subtype

In [None]:
run_tests(
  category="min_tpm_5",
  pvalues="ttest_overall",
  select_genes=lambda pvalues: list(set([y["gene"] for x in [subtype_items[:45] for subtype_items in pvalues.values()] for y in x])),
  not_biased_config=RunConfiguration(
    run_grid_search=False,
    default_parameters = {
      "l2_regularization": 0,
      "learning_rate": 0.1,
      "max_depth": 32,
      "max_features": 0.2,
      "min_samples_leaf": 15
    }
  ),
  biased_config=None
)

Total chosen genes: 384
Reporting not biased results


100%|██████████| 100/100 [10:05<00:00,  6.06s/it]

     Metric          Overall             Male           Female
0        F1  0.8657 ± 0.0288  0.8400 ± 0.0373  0.8852 ± 0.0364
1    Recall  0.8715 ± 0.0264  0.8571 ± 0.0342  0.8955 ± 0.0325
2   ROC AUC  0.9843 ± 0.0064  0.9844 ± 0.0083  0.9875 ± 0.0092
3  Accuracy  0.8715 ± 0.0264  0.8571 ± 0.0342  0.8955 ± 0.0325





## TTest first 400 genes by pvalue

In [None]:
run_tests(
  category="min_tpm_5",
  pvalues="ttest",
  select_genes=lambda pvalues: list(set([x["gene"] for x in sorted([z for x in pvalues.values() for y in x.values() for z in y], key=lambda x: x["pvalue"])]))[:400],
  not_biased_config=RunConfiguration(
    run_grid_search=False,
    default_parameters = {
      "l2_regularization": 0,
      "learning_rate": 0.1,
      "max_depth": 32,
      "max_features": 0.2,
      "min_samples_leaf": 15
    }
  ),
  biased_config=None
)

Total chosen genes: 400
Reporting not biased results


100%|██████████| 100/100 [13:43<00:00,  8.23s/it]

     Metric          Overall             Male           Female
0        F1  0.8231 ± 0.0270  0.8116 ± 0.0343  0.8249 ± 0.0381
1    Recall  0.8333 ± 0.0256  0.8182 ± 0.0329  0.8358 ± 0.0352
2   ROC AUC  0.9741 ± 0.0088  0.9710 ± 0.0105  0.9770 ± 0.0123
3  Accuracy  0.8333 ± 0.0256  0.8182 ± 0.0329  0.8358 ± 0.0352





## TTest overall first 400 genes by pvalue

In [None]:
run_tests(
  category="min_tpm_5",
  pvalues="ttest_overall",
  select_genes=lambda pvalues: list(set([x["gene"] for x in sorted([x for subtype_items in pvalues.values() for x in subtype_items], key=lambda x: x["pvalue"])]))[:400],
  not_biased_config=RunConfiguration(
    run_grid_search=False,
    default_parameters = {
      "l2_regularization": 0,
      "learning_rate": 0.1,
      "max_depth": 32,
      "max_features": 0.2,
      "min_samples_leaf": 15
    }
  ),
  biased_config=None
)

Total chosen genes: 400
Reporting not biased results


100%|██████████| 100/100 [11:43<00:00,  7.03s/it]

     Metric          Overall             Male           Female
0        F1  0.8166 ± 0.0274  0.8118 ± 0.0378  0.8210 ± 0.0344
1    Recall  0.8264 ± 0.0261  0.8182 ± 0.0358  0.8358 ± 0.0322
2   ROC AUC  0.9736 ± 0.0091  0.9691 ± 0.0112  0.9773 ± 0.0122
3  Accuracy  0.8264 ± 0.0261  0.8182 ± 0.0358  0.8358 ± 0.0322





# Wilcoxon

In [7]:
run_tests(
  category="min_tpm_5",
  pvalues="wilcoxon",
  select_genes=lambda pvalues: list(set([y["gene"] for x in [sex_values[:25] for subtype_items in pvalues.values() for sex_values in subtype_items.values()] for y in x])),
  not_biased_config=RunConfiguration(
    run_grid_search=False,
    default_parameters={'learning_rate': 0.1, 'max_features': 0.1, 'l2_regularization': 0, 'max_depth': None}
  )
)

Total chosen genes: 411
Reporting not biased results


100%|██████████| 100/100 [36:29<00:00, 21.89s/it]


     Metric          Overall             Male           Female
0        F1  0.8913 ± 0.0212  0.8799 ± 0.0303  0.8947 ± 0.0298
1    Recall  0.8958 ± 0.0199  0.8831 ± 0.0278  0.8955 ± 0.0272
2   ROC AUC  0.9897 ± 0.0053  0.9897 ± 0.0052  0.9915 ± 0.0081
3  Accuracy  0.8958 ± 0.0199  0.8831 ± 0.0278  0.8955 ± 0.0272


In [8]:
run_tests(
  category="min_tpm_5",
  pvalues="cramervonmises",
  select_genes=lambda pvalues: list(set([y["gene"] for x in [sex_values[:25] for subtype_items in pvalues.values() for sex_values in subtype_items.values()] for y in x])),
  not_biased_config=RunConfiguration(
    run_grid_search=False,
    default_parameters={'learning_rate': 0.1, 'max_features': 0.1, 'l2_regularization': 0, 'max_depth': None}
  )
)

Total chosen genes: 358
Reporting not biased results


100%|██████████| 100/100 [47:38<00:00, 28.58s/it]  


     Metric          Overall             Male           Female
0        F1  0.8744 ± 0.0231  0.8650 ± 0.0335  0.8818 ± 0.0329
1    Recall  0.8819 ± 0.0210  0.8701 ± 0.0298  0.8955 ± 0.0285
2   ROC AUC  0.9869 ± 0.0062  0.9866 ± 0.0066  0.9885 ± 0.0085
3  Accuracy  0.8819 ± 0.0210  0.8701 ± 0.0298  0.8955 ± 0.0285
