In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell execution so that edits to them take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os

import pandas as pd

import get_geo
import kraft
import name_biology

In [None]:
# Passed as the second argument to every kraft.download and get_geo.get_gse
# call below; presumably the directory downloaded files are saved into.
directory_path = os.path.expanduser("~/Downloads")

# Forwarded as the `overwrite=` keyword to kraft.download and get_geo.get_gse.
# NOTE(review): presumably False reuses files that already exist in
# directory_path -- confirm against those helpers.
overwrite = False

## [Intertumoral Heterogeneity within Medulloblastoma Subgroups](https://www.cell.com/cancer-cell/fulltext/S1535-6108(17)30201-5)

[GSE85218](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE85218) is a SuperSeries containing [GSE85212](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE85212) (methylation data) and [GSE85217](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE85217) (RNA data).

GSE85218's SOFT file has the information values for both GSE85212 and GSE85217, but the sample values for only GSE85217.
We do not read the information values from this SOFT file, because we read them from the paper's supplementary table 2, which has richer information for GSE85212 and GSE85217.
(However, this table uses blue and red to represent 0 and 1, so we had to manually fix it and save it as mmc2.tsv before reading it.)
Also, we do not read GSE85217's sample values from this SOFT file, because we read them from GSE85217 itself.
Overall, we do not read anything from GSE85218.

GSE85212's SOFT file has the information values but is missing the sample values, which are in its supplementary file.
So we read GSE85212's sample values from its supplementary file.

GSE85217's SOFT file has the information values and the sample values.
But these sample values have different sample names from GSE85212's sample values.
GSE85217 also has a supplementary file with the sample values.
And these sample values have the same sample names as GSE85212's sample values.
For consistency and for matching the sample names, we read GSE85217's sample values from its supplementary file.

In [None]:
# Read the hand-corrected supplementary table (mmc2.tsv, expected in the
# working directory) with its first column as the index, then transpose it.
# Presumably the file is laid out sample x information, so the transpose
# yields information x sample -- TODO confirm against the file.
information_x_sample = pd.read_csv("mmc2.tsv", sep="\t", index_col=0).T

# Clean-up is delegated to kraft.tidy (project helper; behaviour not shown here).
information_x_sample = kraft.tidy(information_x_sample)

information_x_sample.index.name = "Information"

information_x_sample

In [None]:
# Fetch GSE85212's supplementary beta-value table via kraft.download and parse
# it as tab-separated text.
# NOTE(review): no index_col is given, so the row labels are only taken from
# the first column if the file's header line has one field fewer than its data
# lines (pandas then uses the first column as the index). Otherwise the index
# is a plain integer range and the mapping below would match nothing -- confirm
# against the file.
methylation_gene_x_sample = pd.read_csv(
    kraft.download(
        "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE85nnn/GSE85212/suppl/GSE85212_Methylation_763samples_SubtypeStudy_TaylorLab_beta_values.txt.gz",
        directory_path,
        overwrite=overwrite,
    ),
    sep="\t",
)

# Translate the row labels through name_biology.ILMNID_GENE (presumably an
# Illumina probe ID -> gene name mapping); labels without a match become NaN.
methylation_gene_x_sample.index = methylation_gene_x_sample.index.map(
    name_biology.ILMNID_GENE
)

methylation_gene_x_sample.index.name = "Gene"

# Keep only the rows whose label was mapped.
methylation_gene_x_sample = methylation_gene_x_sample.loc[
    methylation_gene_x_sample.index.notna()
]

# kraft.group and kraft.tidy are project helpers (behaviour not shown here);
# presumably group collapses rows that share the same gene label.
methylation_gene_x_sample = kraft.group(methylation_gene_x_sample)

methylation_gene_x_sample = kraft.tidy(methylation_gene_x_sample)

methylation_gene_x_sample

In [None]:
# Fetch GSE85217's supplementary expression table via kraft.download and parse
# it as tab-separated text, using the first column as the row labels.
rna_gene_x_sample = pd.read_csv(
    kraft.download(
        "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE85nnn/GSE85217/suppl/GSE85217_M_exp_763_MB_SubtypeStudy_TaylorLab.txt.gz",
        directory_path,
        overwrite=overwrite,
    ),
    sep="\t",
    index_col=0,
)

# Drop the first four remaining columns.
# NOTE(review): presumably these are annotation columns rather than samples --
# confirm against the file header.
rna_gene_x_sample = rna_gene_x_sample.iloc[:, 4:]

# Translate the row labels through name_biology.ENS_GENE (presumably an
# Ensembl ID -> gene name mapping); labels without a match become NaN.
rna_gene_x_sample.index = rna_gene_x_sample.index.map(name_biology.ENS_GENE)

rna_gene_x_sample.index.name = "Gene"

# Keep only the rows whose label was mapped.
rna_gene_x_sample = rna_gene_x_sample.loc[rna_gene_x_sample.index.notna()]

# kraft.group and kraft.tidy are project helpers (behaviour not shown here);
# presumably group collapses rows that share the same gene label.
rna_gene_x_sample = kraft.group(rna_gene_x_sample)

rna_gene_x_sample = kraft.tidy(rna_gene_x_sample)

rna_gene_x_sample

## [Subgroup-specific structural variation across 1,000 medulloblastoma genomes](https://www.nature.com/articles/nature11327)

[GSE37385](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE37385) is a SuperSeries containing [GSE37382](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE37382) (RNA data) and [GSE37384](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE37384) (copy number data).

GSE37385's SOFT file is 18GB because it contains GSE37384, which is also 18GB.

Because of GSE37384's large size, we do not read GSE37384 and GSE37385.

We only read the information values and the sample values from GSE37382's SOFT file.

In [None]:
# Accession of the SuperSeries; consumed by the get_geo.get_gse call in the
# next cell.
# NOTE(review): the section text says this series is not read because of its
# size -- confirm whether this cell and the next are meant to be run.
gse_id = "GSE37385"

In [None]:
# Fetch the series named by gse_id through get_geo.get_gse and unpack its two
# return values. The same two names are rebound by the other get_gse cells in
# this section, and rna_gene_x_sample also replaces the frame built earlier in
# the notebook.
information_x_samples, rna_gene_x_sample = get_geo.get_gse(
    gse_id, directory_path, overwrite=overwrite
)

In [None]:
# Accession of the RNA series; consumed by the get_geo.get_gse call in the
# next cell.
gse_id = "GSE37382"

In [None]:
# Fetch the series named by gse_id through get_geo.get_gse and unpack its two
# return values. The same two names are rebound by the other get_gse cells in
# this section, so only the most recently run cell's results remain bound.
information_x_samples, rna_gene_x_sample = get_geo.get_gse(
    gse_id, directory_path, overwrite=overwrite
)

In [None]:
# Accession of the copy number series; consumed by the get_geo.get_gse call in
# the next cell.
# NOTE(review): the section text says this series is not read because of its
# size -- confirm whether this cell and the next are meant to be run.
gse_id = "GSE37384"

In [None]:
# Fetch the series named by gse_id through get_geo.get_gse and unpack its two
# return values, rebinding the names used by the previous get_gse cells.
# NOTE(review): if gse_id is still "GSE37384" (described above as copy number
# data), the second result is not RNA data despite being bound to
# rna_gene_x_sample -- consider a more accurate name.
information_x_samples, rna_gene_x_sample = get_geo.get_gse(
    gse_id, directory_path, overwrite=overwrite
)

## [Novel molecular subgroups for clinical classification and outcome prediction in childhood medulloblastoma: a cohort study](https://www.thelancet.com/journals/lanonc/article/PIIS1470-2045(17)30243-7/fulltext)

[GSE93646](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE93646) has methylation data.

Its SOFT file has the information values but not the sample values, which are in its supplementary file.

So we read the information values from its SOFT file and the sample values from its supplementary file.

In [None]:
# Accession of the methylation series; consumed by the get_geo.get_gse call in
# the next cell.
gse_id = "GSE93646"

In [None]:
# Fetch the series named by gse_id through get_geo.get_gse.
# NOTE(review): unlike the earlier get_gse cells, the return value is bound to
# a single name instead of being unpacked into two -- confirm this is intended.
_x_samples = get_geo.get_gse(gse_id, directory_path, overwrite=overwrite)

In [None]:
# Fetch GSE93646's supplementary processed-data table via kraft.download and
# parse it as tab-separated text, using the file's second column (index_col=1)
# as the row labels.
# The URL uses https://, matching the other NCBI downloads in this notebook;
# the same host serves the same path over HTTPS, and plain FTP is often blocked
# or unsupported by HTTP clients.
methylation_gene_x_sample = pd.read_csv(
    kraft.download(
        "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE93nnn/GSE93646/suppl/GSE93646_processed_data.txt.gz",
        directory_path,
        overwrite=overwrite,
    ),
    sep="\t",
    index_col=1,
)

# Keep every second column, starting from the second one.
# NOTE(review): presumably the file interleaves a non-value column (e.g. a
# detection p-value) with each sample's value column -- confirm against the
# file header that 1::2 selects the value columns.
methylation_gene_x_sample = methylation_gene_x_sample.iloc[:, 1::2]

# Translate the row labels through name_biology.ILMNID_GENE (presumably an
# Illumina probe ID -> gene name mapping); labels without a match become NaN.
methylation_gene_x_sample.index = methylation_gene_x_sample.index.map(
    name_biology.ILMNID_GENE
)

methylation_gene_x_sample.index.name = "Gene"

# Keep only the rows whose label was mapped, then hand the frame to the project
# helpers kraft.group and kraft.tidy (behaviour not shown here; presumably
# group collapses rows that share the same gene label).
methylation_gene_x_sample = kraft.tidy(
    kraft.group(methylation_gene_x_sample.loc[~methylation_gene_x_sample.index.isna()])
)

methylation_gene_x_sample

## [Multiple recurrent genetic events converge on control of histone lysine methylation in medulloblastoma](https://www.nature.com/articles/ng.336)

[GSE14437](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE14437) has copy number data from 4 platforms.

We read the information values and the sample values from its SOFT file.

In [None]:
# Accession of the copy number series; consumed by the get_geo.get_gse call in
# the next cell.
gse_id = "GSE14437"

In [None]:
# Fetch the series named by gse_id through get_geo.get_gse.
# NOTE(review): the return value is bound to a single name instead of being
# unpacked into two as in the earlier get_gse cells, and this rebinds the
# _x_samples name used for GSE93646 above -- confirm this is intended.
_x_samples = get_geo.get_gse(gse_id, directory_path, overwrite=overwrite)