In [1]:
# Load infoml and configure data directory
import shutil

from infoml import CONFIG
from infoml.binf.data import CuMiDa
from tqdm.auto import tqdm

# Set the directory that the processed .tsv tables are later written into
# (the processing cell reads it back with a bare CONFIG.datadir()).
# The path is relative, so it presumably resolves against the kernel's
# working directory — TODO confirm. The trailing ';' hides the cell output.
CONFIG.datadir('data/raw/');

In [3]:
# Display the scratch directory for the CuMiDa downloads; the value shown
# below the cell is this call's return value. The same call is passed to
# CuMiDa(datadir=...) in the next cell.
CONFIG.tempdir('infoml/cumida')

PosixPath('/tmp/infoml/cumida')

In [4]:
# Create the CuMiDa handle, storing its files under the infoml temp directory
cumida = CuMiDa(datadir=CONFIG.tempdir('infoml/cumida'))

# From the CuMiDa index, keep the rows with two classes on platform GPL570,
# then take the index labels of the first two matches
two_class_gpl570 = cumida.index.query("Classes == 2 & Platform == 'GPL570'")
selected = two_class_gpl570.index.tolist()[:2]

# Fetch the selected datasets
cumida.download(selected)

Downloading GSEs:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading GPLs:   0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# Process datasets for DGE analysis in R
out_dir = CONFIG.datadir()  # loop-invariant, so look it up once

# Wrap the iterable in tqdm so the bar advances once per dataset; a bare
# `with tqdm(total=...)` only moves when pbar.update() is called explicitly.
for dataset in tqdm(selected, desc="Processing files..."):
    # One output file per dataset, named from the joined parts of its index label
    fpath = out_dir / ('_'.join(dataset) + '.tsv')

    gse = (cumida.load(dataset)
           .reset_index()
           .drop(columns=['samples']))
    # Collapse the labels to two groups: any label containing 'normal'
    # becomes 'normal', everything else becomes 'tumor'
    gse['type'] = gse['type'].apply(lambda x: 'normal' if 'normal' in x else 'tumor')
    # Transpose so each former column becomes a row (its name goes into 'Gene')
    # and each column header is a sample's group label
    gse = (gse.set_index('type').T
           .reset_index(names=['Gene']))
    # Keep only the part of each identifier before the first '.'
    gse['Gene'] = gse['Gene'].str.split('.').str[0]

    # Tab-separated, without the positional index
    gse.to_csv(fpath, sep='\t', index=False)

Processing files...:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Delete temporary files
# Name the download directory explicitly (the same path handed to CuMiDa
# when it was initialized) so only that directory is removed, whatever a
# bare CONFIG.tempdir() call happens to resolve to.
shutil.rmtree(CONFIG.tempdir('infoml/cumida'))