In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV
from model.forest import train_forest

from model.gradient_boosting import train_gradient_boosting
from utils import show_cluster, plot_clustermap, plot_legend, get_importances

In [None]:
def genes_data(genes, subtype_name=None, stats=None) -> str:
    """Format genes as ``"NAME (mean ± stddev)"`` entries joined by ``", "``.

    Args:
        genes: Iterable of indexable entries whose first element is the gene
            name (e.g. the ``(name, info)`` pairs produced by ``dict.items()``).
        subtype_name: Subtype key used for the statistics lookup. When omitted,
            the module-level ``subtype`` variable is read at call time.
        stats: Mapping ``gene -> subtype -> {"mean": ..., "stddev": ...}``.
            When omitted, the module-level ``average_by_gene_by_subtype``
            is used.

    Returns:
        The formatted entries joined by ``", "``; ``""`` when ``genes`` is empty.

    Raises:
        KeyError: If a gene or subtype is missing from the statistics mapping.
        NameError: If an argument is omitted and its module-level fallback
            has not been defined yet.
    """
    # Fall back to notebook globals so existing one-argument calls keep working.
    if subtype_name is None:
        subtype_name = subtype
    if stats is None:
        stats = average_by_gene_by_subtype

    parts = []
    for entry in genes:
        name = entry[0]
        gene_stats = stats[name][subtype_name]
        # Single quotes inside the f-string keep this valid before Python 3.12.
        parts.append(f"{name} ({gene_stats['mean']:.4f} ± {gene_stats['stddev']:.4f})")
    return ", ".join(parts)

def _read_json_line(path):
    """Parse the first line of ``path`` as JSON and close the file afterwards."""
    with open(path, "r") as handle:
        return json.loads(handle.readline())


def _top_genes(results, metric, limit=10):
    """Return the ``limit`` ``(gene, info)`` pairs with the largest ``info[metric]``."""
    return sorted(results.items(), key=lambda item: item[1][metric], reverse=True)[:limit]


def _gene_names(ranked):
    """Collect the gene names (first element of each entry) into a set."""
    return {entry[0] for entry in ranked}


subtypes = ["BCRABL1", "DUX4IGH", "HYPER", "HYPO", "iAMP21", "KMT2A", "PAX5", "PHlike", "TCF3PBX1", "ETV6RUNX1"]
average_by_gene_by_subtype = _read_json_line("../data/data_by_gene_by_subtype.json")

# subtype -> union of the top-10 "hits" genes of the male and female results.
genes_by_subtype = {}
# The loop variable must stay named `subtype`: genes_data() reads it as a global.
for subtype in subtypes:
    # Despite the .csv extension, these files are parsed as single-line JSON.
    male: dict = _read_json_line(f"../results/gene_results_male_{subtype}.csv")
    female: dict = _read_json_line(f"../results/gene_results_female_{subtype}.csv")

    top_median_male = _top_genes(male, "median")
    top_median_female = _top_genes(female, "median")

    top_hits_male = _top_genes(male, "hits")
    top_hits_female = _top_genes(female, "hits")

    # Build each name set once instead of re-deriving it for every print.
    hit_names_male = _gene_names(top_hits_male)
    hit_names_female = _gene_names(top_hits_female)
    shared_median = _gene_names(top_median_male) & _gene_names(top_median_female)
    shared_hits = hit_names_male & hit_names_female

    print(f"==================== {subtype} ====================")
    print(f"Intersection size median: {len(shared_median)}")
    print(f"Intersection size hits: {len(shared_hits)}")
    print(f"Intersection (hits): {shared_hits}")
    print(f"Top hits male: {genes_data(top_hits_male)}")
    print(f"Top hits female: {genes_data(top_hits_female)}")
    print(f"Top median male: {genes_data(top_median_male)}")
    print(f"Top median female: {genes_data(top_median_female)}")
    print()

    genes_by_subtype[subtype] = hit_names_male | hit_names_female
    


Intersection size median: 3
Intersection size hits: 5
Intersection (hits): {'ENSG00000285718', 'AC091133.4', 'DSC3', 'KCNN1', 'AK7'}
Top hits male: KCNN1 (88.0484 ± 85.9929), DSC3 (32.3906 ± 30.0936), IGF2BP1 (51.6504 ± 47.2826), AC091133.4 (6.8506 ± 5.2496), AK7 (17.5554 ± 18.1356), MYOCD (8.4858 ± 10.0217), ENSG00000286393 (110.4646 ± 117.3685), AP005530.1 (27.3642 ± 19.1377), ARHGAP42P3 (6.1024 ± 9.9626), ENSG00000285718 (21.8281 ± 31.1034)
Top hits female: DSC3 (32.3906 ± 30.0936), KCNN1 (88.0484 ± 85.9929), ARHGAP42P4 (17.9253 ± 26.1852), AC091133.4 (6.8506 ± 5.2496), ENSG00000285718 (21.8281 ± 31.1034), AC215217.1 (46.9531 ± 34.6266), BIRC7 (48.4507 ± 72.3358), RN7SL399P (10.4446 ± 11.1398), NOVA1 (9.4550 ± 9.1818), AK7 (17.5554 ± 18.1356)
Top median male: DSC3 (32.3906 ± 30.0936), ACVR1C (6.0061 ± 6.3475), CLIC5 (55.2540 ± 62.4287), HAP1 (82.2007 ± 93.8839), ENSG00000285718 (21.8281 ± 31.1034), LINC01416 (11.8168 ± 22.7435), NOVA1 (9.4550 ± 9.1818), KCNN1 (88.0484 ± 85.9929), EP

In [38]:
# Per subtype: mean and standard deviation, by sex, of that subtype's top genes,
# written to aggregated_data_<subtype>.csv in the working directory.
data = pd.read_csv("../data/genes_extra_data.csv", delimiter=",", decimal='.')
for subtype in subtypes:
    # genes_by_subtype holds sets, whose iteration order can change between
    # kernel runs; sorting keeps the CSV column order reproducible.
    gene_columns = sorted(genes_by_subtype[subtype])
    # Boolean-mask selection already yields a new frame, so no upfront copy is needed.
    subtype_rows = data.loc[data["subtype"] == subtype, ["sex", "subtype", *gene_columns]]
    aggregated_data = subtype_rows.groupby(["subtype", "sex"]).agg(["mean", "std"])
    aggregated_data.to_csv(f"aggregated_data_{subtype}.csv")