In [19]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
from collections import defaultdict

import json
import pandas as pd

from scipy.stats import ttest_ind, wilcoxon, cramervonmises_2samp
from numpy import random

DEFAULT_RANDOM_SEED = 774
random.mtrand._rand.seed(DEFAULT_RANDOM_SEED)

In [3]:
category = "min_tpm_5"

In [4]:
extra_data_headers = pd.read_csv('../data/extra_data.tsv', delimiter="\t", nrows=0).columns
data = pd.read_csv(f"../preprocessed/{category}/genes.csv", delimiter=",", decimal='.')

subtypes = set(data["subtype"])
genes = set(data.columns) - set(extra_data_headers) - set(["prognostic"])

In [None]:
def select_important_genes_by_sex(metric):
  important_genes_by_sex_subtype = defaultdict(list)

  for sex in ["Male", "Female"]:
    filtered_sex_dataset = data[data["sex"] == sex]
    for subtype in subtypes:
      subtype_dataset = filtered_sex_dataset[filtered_sex_dataset["subtype"] == subtype]
      not_subtype_dataset = filtered_sex_dataset[filtered_sex_dataset["subtype"] != subtype]

      for gene in genes:
        result = metric(subtype_dataset[gene], not_subtype_dataset[gene])
        if (result.pvalue <= 0.001):
          important_genes_by_sex_subtype[(sex, subtype)].append({ "gene": gene, "pvalue": result.pvalue })

  result = defaultdict(dict)
  for key in important_genes_by_sex_subtype.keys():
    result[key[1]] |= { key[0]: sorted(important_genes_by_sex_subtype[key], key=lambda x: x["pvalue"]) }

  return result


In [None]:
ttest_result = select_important_genes_by_sex(lambda x, y: ttest_ind(x, y, equal_var=False))

Total selected genes: 13120


In [None]:
wilcoxon_result = select_important_genes_by_sex(lambda x, y: wilcoxon(x, y.sample(len(x)), zero_method="zsplit"))

  temp = _wilcoxon_iv(x, y, zero_method, correction, alternative, method, axis)


Total selected genes: 9912


In [None]:
cramervonmises_result = select_important_genes_by_sex(cramervonmises_2samp)

Total selected genes: 13171


In [14]:
open(f"../preprocessed/{category}/important_genes_ttest_pvalue.json", "w").write(json.dumps(ttest_result))
open(f"../preprocessed/{category}/important_genes_wilcoxon_pvalue.json", "w").write(json.dumps(wilcoxon_result))
open(f"../preprocessed/{category}/important_genes_cramervonmises_pvalue.json", "w").write(json.dumps(cramervonmises_result))

3343377

In [24]:
def select_important_genes_overall(metric):
  important_genes_by_subtype = defaultdict(list)

  for subtype in subtypes:
    subtype_dataset = data[data["subtype"] == subtype]
    not_subtype_dataset = data[data["subtype"] != subtype]

    for gene in genes:
      result = metric(subtype_dataset[gene], not_subtype_dataset[gene])
      if (result.pvalue <= 0.001):
        important_genes_by_subtype[subtype].append({ "gene": gene, "pvalue": result.pvalue })

  result = {}
  for key in important_genes_by_subtype.keys():
    result[key] = sorted(important_genes_by_subtype[key], key=lambda x: x["pvalue"])

  return result


In [25]:
ttest_result = select_important_genes_overall(lambda x, y: ttest_ind(x, y, equal_var=False))

In [26]:
open(f"../preprocessed/{category}/important_genes_ttest_overall_pvalue.json", "w").write(json.dumps(ttest_result))

2467220