In [43]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [44]:
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '../'))

In [45]:
import json
import pandas as pd
import warnings

from numpy import random
from collections import defaultdict
from functools import reduce

from sklearn.linear_model import LogisticRegression

from model.train import train_classifier
from utils import get_importances

DEFAULT_RANDOM_SEED = 774
random.mtrand._rand.seed(DEFAULT_RANDOM_SEED)
seed_list = list(random.random_integers(low=0, high=2**32 - 1, size=50))
warnings.filterwarnings("ignore")

In [46]:
category = "min_tpm_5"
data = pd.read_csv(f"../../preprocessed/{category}/genes.csv").drop(columns=["sample_id"])
subtypes = set(data["subtype"])

In [47]:



def choose_important_genes(data: pd.DataFrame, scoring: str):
  random.mtrand._rand.seed(DEFAULT_RANDOM_SEED)
  search_params = {"penalty": ("l1", "l2", "elasticnet"), "C": (0, 0.5, 1, 2)}
  importances_by_sex_subtype = defaultdict(list)

  for sex in ["Male", "Female"]:
    filtered_sex_dataset = data[data["sex"] == sex]
    for subtype in subtypes:
      targeted_data = filtered_sex_dataset.copy()
      targeted_data["subtype_target"] = targeted_data["subtype"] == subtype

      for seed in seed_list:
        random.mtrand._rand.seed(seed)
        model = LogisticRegression(class_weight="balanced", random_state=seed, solver="liblinear", max_iter=100)
        response = train_classifier(model, "subtype_target", data=targeted_data, grid_search_scoring=scoring, grid_search_params=search_params)
        importances = get_importances(response.model.coef_[0], response.feature_names, top=None)

        importances_by_sex_subtype[(sex, subtype)].append(importances)

  result = defaultdict(dict)
  for key, importances_list in importances_by_sex_subtype.items():
    importances_series: pd.Series = reduce(lambda x, y: x + y, importances_list) / float(len(importances_list))
    importances = [{ "gene": k, "coef": v * v } for k, v in importances_series.items() if v != 0]
    result[key[1]] |= { key[0]: sorted(importances, key=lambda x: x["coef"], reverse=True) }

  return result

In [None]:
result = choose_important_genes(data, scoring="recall_micro")
open(f"../../preprocessed/{category}/important_genes_logistic_recall.json", "w").write(json.dumps(result))

In [49]:
result = choose_important_genes(data, scoring="f1_weighted")
open(f"../../preprocessed/{category}/important_genes_logistic_f1.json", "w").write(json.dumps(result))

323770