In [37]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '../'))

In [39]:
import pandas as pd
import warnings

import json
from numpy import random
from dataclasses import dataclass

from model.utils import report_results
from model.gradient_boosting import train_gradient_boosting

from sklearn.ensemble import HistGradientBoostingClassifier

DEFAULT_RANDOM_SEED = 774
random.mtrand._rand.seed(DEFAULT_RANDOM_SEED)
seed_list = random.random_integers(low=0, high=2**32 - 1, size=100)
warnings.filterwarnings("ignore")

search_params = {"learning_rate": (0.05, 0.1, 0.5, 1), "max_features": (0.05, 0.1, 0.15, 0.2, 0.5, "sqrt"), "l2_regularization": (0, 0.5, 1), "max_depth": (16, 32, 64, None)}

In [40]:
@dataclass
class RunConfiguration:
  run_grid_search: bool
  default_parameters: dict

In [41]:
def get_parameters(df: pd.DataFrame, run_config: RunConfiguration):
  if not run_config.run_grid_search:
    return run_config.default_parameters
  
  grid_search_response = train_gradient_boosting(target="subtype", data=df, grid_search_params=search_params)
  parameters = {k: grid_search_response.model.get_params()[k] for k in grid_search_response.model.get_params().keys() & search_params.keys() }
  print(parameters)
  return parameters

def run_tests(category: str, pvalues: str, select_genes, not_biased_config: RunConfiguration, biased_config: RunConfiguration = None):
  data = pd.read_csv(f"../../preprocessed/{category}/genes.csv").drop(columns=["sample_id"])
  pvalues = json.loads(open(f"../../preprocessed/{category}/important_genes_{pvalues}_pvalue.json").readline())

  chosen_genes_all = select_genes(pvalues)
  print(f"Total chosen genes: {len(chosen_genes_all)}")

  print("Reporting not biased results")
  df = data[["subtype", "sex", *chosen_genes_all]]
  print(report_results(df, HistGradientBoostingClassifier, get_parameters(df, not_biased_config), seed_list).report)

  if biased_config:
    print("Reporting biased results")
    chosen_genes_male = list(set([y["gene"] for x in [subtype_items["Male"][:50] for subtype_items in pvalues.values()] for y in x]))
    df = data[["subtype", "sex", *chosen_genes_male]]
    print(report_results(df, HistGradientBoostingClassifier, get_parameters(df, biased_config), seed_list).report)

    chosen_genes_female = list(set([y["gene"] for x in [subtype_items["Female"][:50] for subtype_items in pvalues.values()] for y in x]))
    df = data[["subtype", "sex", *chosen_genes_female]]
    print(report_results(df, HistGradientBoostingClassifier, get_parameters(df, biased_config), seed_list).report)

# TTest 25 genes per sex-subtype

In [43]:
run_tests(
  category="min_tpm_5",
  pvalues="ttest",
  select_genes=lambda pvalues: list(set([y["gene"] for x in [sex_values[:25] for subtype_items in pvalues.values() for sex_values in subtype_items.values()] for y in x])),
  not_biased_config=RunConfiguration(
    run_grid_search=False,
    default_parameters={'max_features': 0.05, 'learning_rate': 0.1, 'max_depth': 64, 'l2_regularization': 0}
  ),
  biased_config=RunConfiguration(
    run_grid_search=False,
    default_parameters={'max_features': 0.05, 'learning_rate': 0.1, 'max_depth': None, 'l2_regularization': 1}
  )
)

Total chosen genes: 375
Reporting not biased results


100%|██████████| 100/100 [16:00<00:00,  9.61s/it]


           Metric          Overall             Male           Female
0   F1 (Weighted)  0.8760 ± 0.0252  0.8582 ± 0.0369  0.8936 ± 0.0323
1      F1 (Macro)  0.8384 ± 0.0351  0.8016 ± 0.0539  0.8584 ± 0.0451
2  Recall (Macro)  0.8248 ± 0.0346  0.7955 ± 0.0496  0.8493 ± 0.0431
3         ROC AUC  0.9890 ± 0.0060  0.9868 ± 0.0067  0.9917 ± 0.0091
4        Accuracy  0.8819 ± 0.0233  0.8701 ± 0.0339  0.8955 ± 0.0287
5        Duration  8.9554 ± 0.2114                0                0
Reporting biased results


100%|██████████| 100/100 [11:23<00:00,  6.83s/it]


           Metric          Overall             Male           Female
0   F1 (Weighted)  0.8706 ± 0.0254  0.8548 ± 0.0354  0.8734 ± 0.0324
1      F1 (Macro)  0.8256 ± 0.0369  0.7957 ± 0.0534  0.8313 ± 0.0461
2  Recall (Macro)  0.8132 ± 0.0339  0.7952 ± 0.0464  0.8222 ± 0.0431
3         ROC AUC  0.9868 ± 0.0063  0.9879 ± 0.0066  0.9879 ± 0.0105
4        Accuracy  0.8750 ± 0.0230  0.8701 ± 0.0316  0.8806 ± 0.0291
5        Duration  6.1940 ± 0.1802                0                0


100%|██████████| 100/100 [11:51<00:00,  7.11s/it]

           Metric          Overall             Male           Female
0   F1 (Weighted)  0.8579 ± 0.0265  0.8270 ± 0.0341  0.8773 ± 0.0353
1      F1 (Macro)  0.8128 ± 0.0362  0.7599 ± 0.0451  0.8376 ± 0.0498
2  Recall (Macro)  0.8023 ± 0.0337  0.7596 ± 0.0414  0.8333 ± 0.0457
3         ROC AUC  0.9851 ± 0.0065  0.9816 ± 0.0076  0.9886 ± 0.0094
4        Accuracy  0.8681 ± 0.0246  0.8442 ± 0.0329  0.8806 ± 0.0311
5        Duration  6.4267 ± 0.1364                0                0





## TTest overall (without grouping by sex), 45 genes per subtype

In [44]:
run_tests(
  category="min_tpm_5",
  pvalues="ttest_overall",
  select_genes=lambda pvalues: list(set([y["gene"] for x in [subtype_items[:45] for subtype_items in pvalues.values()] for y in x])),
  not_biased_config=RunConfiguration(
    run_grid_search=False,
    default_parameters={'max_features': 0.05, 'learning_rate': 0.1, 'max_depth': 64, 'l2_regularization': 0.5}
  ),
  biased_config=None
)

Total chosen genes: 384
Reporting not biased results


100%|██████████| 100/100 [05:51<00:00,  3.51s/it]

           Metric          Overall             Male           Female
0   F1 (Weighted)  0.8660 ± 0.0233  0.8478 ± 0.0337  0.8811 ± 0.0324
1      F1 (Macro)  0.8259 ± 0.0335  0.7818 ± 0.0509  0.8417 ± 0.0470
2  Recall (Macro)  0.8102 ± 0.0322  0.7829 ± 0.0459  0.8351 ± 0.0439
3         ROC AUC  0.9871 ± 0.0059  0.9860 ± 0.0071  0.9887 ± 0.0085
4        Accuracy  0.8750 ± 0.0218  0.8571 ± 0.0308  0.8955 ± 0.0284
5        Duration  2.0413 ± 1.8806                0                0





## TTest first 400 genes by pvalue

In [45]:
run_tests(
  category="min_tpm_5",
  pvalues="ttest",
  select_genes=lambda pvalues: list(set([x["gene"] for x in sorted([z for x in pvalues.values() for y in x.values() for z in y], key=lambda x: x["pvalue"])]))[:400],
  not_biased_config=RunConfiguration(
    run_grid_search=False,
    default_parameters={'max_features': 0.1, 'learning_rate': 0.1, 'max_depth': 32, 'l2_regularization': 0}
  ),
  biased_config=None
)

Total chosen genes: 400
Reporting not biased results


100%|██████████| 100/100 [05:50<00:00,  3.50s/it]

           Metric          Overall             Male           Female
0   F1 (Weighted)  0.8451 ± 0.0281  0.8303 ± 0.0389  0.8618 ± 0.0365
1      F1 (Macro)  0.8028 ± 0.0377  0.7815 ± 0.0525  0.8203 ± 0.0497
2  Recall (Macro)  0.7917 ± 0.0364  0.7733 ± 0.0489  0.8108 ± 0.0484
3         ROC AUC  0.9807 ± 0.0080  0.9789 ± 0.0098  0.9820 ± 0.0107
4        Accuracy  0.8542 ± 0.0268  0.8442 ± 0.0365  0.8657 ± 0.0338
5        Duration  3.3601 ± 0.1015                0                0





## TTest overall first 400 genes by pvalue

In [46]:
run_tests(
  category="min_tpm_5",
  pvalues="ttest_overall",
  select_genes=lambda pvalues: list(set([x["gene"] for x in sorted([x for subtype_items in pvalues.values() for x in subtype_items], key=lambda x: x["pvalue"])]))[:400],
  not_biased_config=RunConfiguration(
    run_grid_search=False,
    default_parameters={'max_features': 0.15, 'learning_rate': 0.05, 'max_depth': 64, 'l2_regularization': 0.5}
  ),
  biased_config=None
)

Total chosen genes: 400
Reporting not biased results


100%|██████████| 100/100 [04:10<00:00,  2.51s/it]

           Metric          Overall             Male           Female
0   F1 (Weighted)  0.8441 ± 0.0272  0.8334 ± 0.0380  0.8538 ± 0.0371
1      F1 (Macro)  0.8070 ± 0.0365  0.7876 ± 0.0513  0.8175 ± 0.0514
2  Recall (Macro)  0.7927 ± 0.0363  0.7795 ± 0.0478  0.8050 ± 0.0492
3         ROC AUC  0.9795 ± 0.0072  0.9786 ± 0.0089  0.9797 ± 0.0097
4        Accuracy  0.8472 ± 0.0254  0.8442 ± 0.0352  0.8657 ± 0.0342
5        Duration  2.3723 ± 0.0820                0                0





In [47]:
run_tests(
  category="min_tpm_5",
  pvalues="wilcoxon",
  select_genes=lambda pvalues: list(set([y["gene"] for x in [sex_values[:25] for subtype_items in pvalues.values() for sex_values in subtype_items.values()] for y in x])),
  not_biased_config=RunConfiguration(
    run_grid_search=False,
    default_parameters={'max_features': 0.2, 'learning_rate': 0.1, 'max_depth': 16, 'l2_regularization': 1}
  )
)

Total chosen genes: 411
Reporting not biased results


100%|██████████| 100/100 [03:20<00:00,  2.00s/it]

           Metric          Overall             Male           Female
0   F1 (Weighted)  0.8849 ± 0.0234  0.8781 ± 0.0335  0.8875 ± 0.0324
1      F1 (Macro)  0.8540 ± 0.0325  0.8425 ± 0.0470  0.8527 ± 0.0455
2  Recall (Macro)  0.8404 ± 0.0330  0.8332 ± 0.0454  0.8476 ± 0.0448
3         ROC AUC  0.9886 ± 0.0057  0.9884 ± 0.0053  0.9918 ± 0.0088
4        Accuracy  0.8889 ± 0.0223  0.8831 ± 0.0311  0.8955 ± 0.0301
5        Duration  1.8668 ± 0.0954                0                0





In [48]:
run_tests(
  category="min_tpm_5",
  pvalues="cramervonmises",
  select_genes=lambda pvalues: list(set([y["gene"] for x in [sex_values[:25] for subtype_items in pvalues.values() for sex_values in subtype_items.values()] for y in x])),
  not_biased_config=RunConfiguration(
    run_grid_search=False,
    default_parameters={'max_features': 0.05, 'learning_rate': 0.1, 'max_depth': None, 'l2_regularization': 0}
  )
)

Total chosen genes: 358
Reporting not biased results


100%|██████████| 100/100 [05:10<00:00,  3.11s/it]

           Metric          Overall             Male           Female
0   F1 (Weighted)  0.8764 ± 0.0239  0.8646 ± 0.0327  0.8827 ± 0.0313
1      F1 (Macro)  0.8322 ± 0.0331  0.8138 ± 0.0493  0.8440 ± 0.0439
2  Recall (Macro)  0.8186 ± 0.0313  0.8082 ± 0.0429  0.8340 ± 0.0407
3         ROC AUC  0.9861 ± 0.0059  0.9869 ± 0.0064  0.9881 ± 0.0087
4        Accuracy  0.8819 ± 0.0217  0.8766 ± 0.0287  0.8955 ± 0.0279
5        Duration  2.9774 ± 0.0752                0                0





In [49]:
run_tests(
  category="min_tpm_5",
  pvalues="brunnermunzel",
  select_genes=lambda pvalues: list(set([y["gene"] for x in [sex_values[:25] for subtype_items in pvalues.values() for sex_values in subtype_items.values()] for y in x])),
  not_biased_config=RunConfiguration(
    run_grid_search=False,
    default_parameters={'max_features': 0.05, 'learning_rate': 0.1, 'max_depth': None, 'l2_regularization': 0}
  )
)

Total chosen genes: 379
Reporting not biased results


100%|██████████| 100/100 [05:10<00:00,  3.11s/it]

           Metric          Overall             Male           Female
0   F1 (Weighted)  0.8953 ± 0.0203  0.8818 ± 0.0283  0.9044 ± 0.0289
1      F1 (Macro)  0.8677 ± 0.0314  0.8513 ± 0.0464  0.8768 ± 0.0415
2  Recall (Macro)  0.8545 ± 0.0317  0.8405 ± 0.0441  0.8694 ± 0.0408
3         ROC AUC  0.9924 ± 0.0040  0.9924 ± 0.0045  0.9924 ± 0.0063
4        Accuracy  0.9028 ± 0.0187  0.8896 ± 0.0255  0.9104 ± 0.0263
5        Duration  2.9610 ± 0.1100                0                0





In [50]:
run_tests(
  category="min_tpm_5",
  pvalues="epps_singleton",
  select_genes=lambda pvalues: list(set([y["gene"] for x in [sex_values[:25] for subtype_items in pvalues.values() for sex_values in subtype_items.values()] for y in x])),
  not_biased_config=RunConfiguration(
    run_grid_search=False,
    default_parameters={'max_features': 0.05, 'learning_rate': 0.1, 'max_depth': 32, 'l2_regularization': 0}
  )
)

Total chosen genes: 399
Reporting not biased results


100%|██████████| 100/100 [13:16<00:00,  7.96s/it]

           Metric          Overall             Male           Female
0   F1 (Weighted)  0.8912 ± 0.0217  0.8759 ± 0.0327  0.9036 ± 0.0278
1      F1 (Macro)  0.8604 ± 0.0308  0.8350 ± 0.0483  0.8715 ± 0.0398
2  Recall (Macro)  0.8472 ± 0.0299  0.8286 ± 0.0458  0.8611 ± 0.0382
3         ROC AUC  0.9915 ± 0.0046  0.9901 ± 0.0053  0.9927 ± 0.0070
4        Accuracy  0.8958 ± 0.0206  0.8831 ± 0.0305  0.9104 ± 0.0247
5        Duration  4.2799 ± 8.0843                0                0





In [51]:
run_tests(
  category="min_tpm_5",
  pvalues="anderson_ksamp",
  select_genes=lambda pvalues: list(set([y["gene"] for x in [sex_values[:25] for subtype_items in pvalues.values() for sex_values in subtype_items.values()] for y in x])),
  not_biased_config=RunConfiguration(
    run_grid_search=False,
    default_parameters={'max_features': 0.05, 'learning_rate': 0.1, 'max_depth': 64, 'l2_regularization': 0}
  )
)

Total chosen genes: 242
Reporting not biased results


100%|██████████| 100/100 [19:24<00:00, 11.65s/it]

           Metric           Overall             Male           Female
0   F1 (Weighted)   0.8525 ± 0.0262  0.8358 ± 0.0381  0.8667 ± 0.0311
1      F1 (Macro)   0.8177 ± 0.0365  0.7971 ± 0.0539  0.8261 ± 0.0438
2  Recall (Macro)   0.8069 ± 0.0351  0.7947 ± 0.0498  0.8192 ± 0.0426
3         ROC AUC   0.9845 ± 0.0072  0.9827 ± 0.0098  0.9894 ± 0.0102
4        Accuracy   0.8611 ± 0.0250  0.8442 ± 0.0355  0.8806 ± 0.0284
5        Duration  8.6642 ± 14.3025                0                0



