In [1]:
import yaml

In [2]:
# Parse the project configuration once, then keep only the section that
# belongs to the downloadgse.py script.
with open('config/config.yaml', 'r') as config_file:
    full_config = yaml.safe_load(config_file)

conf = full_config['scripts']['downloadgse.py']

# Display the configured dataset-selection query
conf['query']

"Classes == 2 & Platform == 'GPL570'"

In [3]:
# Load infoml and configure data directory
from infoml import CONFIG
from infoml.binf.data import CuMiDa
from tqdm.auto import tqdm

# Point infoml at the raw-data directory, then register the CuMiDa scratch
# directory. The last call is left bare so the cell displays its return value.
CONFIG.datadir('data/raw/')
CONFIG.tempdir('infoml/cumida')

PosixPath('/tmp/infoml/cumida')

In [40]:
# Initialize CuMiDa, passing the infoml temporary directory as its datadir
cumida = CuMiDa(datadir=CONFIG.tempdir('infoml/cumida'))

# Select datasets from the CuMiDa index.
# The filter comes from config/config.yaml (loaded into `conf` above) rather
# than a duplicated string literal, so the selection criteria live in one place.
selected = cumida.index.query(conf['query']).index.tolist()

# Download the selected datasets
cumida.download(selected)

Downloading GSEs:   0%|          | 0/21 [00:00<?, ?it/s]

Downloading GPLs:   0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# Display the installed infoml version so the results can be tied to it
import infoml
infoml.__version__

'0.8.0'

In [43]:
# Preview the file-name stem built from a dataset key.
# Indexes `selected` explicitly instead of relying on the `dataset` loop
# variable, which is only bound by the processing loop in a later cell and is
# therefore undefined on a fresh Restart & Run All. The last entry is used
# because that is what the leaked loop variable holds once the loop finishes.
"_".join(selected[-1])

'GSE12452_Throat'

In [41]:
# Process datasets for DGE analysis in R
# Iterating over tqdm(selected) advances the progress bar once per dataset;
# a bar created with `total=` alone never moves unless update() is called.
for dataset in tqdm(selected, desc="Processing files..."):
    # Per-dataset target path; unused while the exports below stay disabled
    fpath = CONFIG.datadir() / ('_'.join(dataset) + '.tsv')

    # Load the dataset, drop the original 'samples' column and expose the
    # positional row number as a new 'sample' column
    gse = (cumida.load(dataset, probe_ids=True)
           .reset_index()
           .drop(columns=['samples'])
           .reset_index(names=['sample']))
    # Turn the 0-based row numbers into sequential IDs: S1, S2, ...
    gse['sample'] = 'S' + (gse['sample'] + 1).astype(str)
    # Collapse class labels to two groups: any label containing 'normal'
    # becomes 'normal', everything else becomes 'tumor'
    gse['type'] = gse['type'].apply(lambda x: 'normal' if 'normal' in x else 'tumor')

    # Expression matrix, transposed so that each sample is a column
    exprs = (gse.drop(columns=['type'])
             .set_index('sample')
             .rename_axis('').T)
    # exprs.to_csv('exprs.tsv', sep='\t', index=True)

    # Sample annotation table: one row per sample with its class label
    data = (gse[['sample', 'type']]
            .set_index('sample')
            .rename_axis(''))
    # data.to_csv('data.tsv', sep='\t', index=True)

    # Sanity check: per-sample mean/std/min/max, averaged across all samples
    print(exprs.describe().loc[['mean', 'std', 'min', 'max']].mean(axis=1))

Processing files...:   0%|          | 0/21 [00:00<?, ?it/s]

mean     4.056165
std      1.333683
min      1.889254
max     14.768116
dtype: float64
mean     4.747963
std      2.067425
min      1.541930
max     14.456886
dtype: float64
mean     4.851929
std      1.954816
min      2.015681
max     14.370782
dtype: float64
mean     4.812192
std      2.254571
min      1.828447
max     14.056843
dtype: float64
mean     5.414222
std      2.094265
min      2.500440
max     14.504771
dtype: float64
mean     5.919155
std      2.178486
min      2.800013
max     14.777527
dtype: float64
mean     5.468892
std      2.265790
min      1.926583
max     14.329605
dtype: float64
mean     4.390342
std      2.119111
min      1.288677
max     13.563845
dtype: float64
mean     5.674218
std      2.223300
min      2.221231
max     14.665155
dtype: float64
mean     5.294868
std      2.100098
min      1.958571
max     14.248572
dtype: float64
mean     5.950935
std      2.245573
min      2.422611
max     14.816525
dtype: float64
mean     6.429329
std      2.152544
min    

In [7]:
# Clean up: remove the directory tree that CONFIG.tempdir() returns
import shutil

temp_root = CONFIG.tempdir()
shutil.rmtree(temp_root)