In [6]:
import pandas as pd
import numpy as np

from joblib import Parallel, delayed
from api_helpers import get_tcga_projects

In [7]:
def preprocess(df: pd.DataFrame):
    """ Preprocess the data by removing non-tumor samples, log transforming and standardizing."""
    
    # tumor types range from 01 - 09, normal types from 10 - 19 and control samples from 20 - 29
    df = df.loc[[int(x.split('-')[-1][:2]) < 10 for x in df.index]]

    # remove the part indicating tumor type from sample id
    df.index = df.index.str.split('-').str[:-1].str.join('-')
    
    # remove lowly expressed genes
    to_keep = np.percentile(df, 75, axis=0) >= 10
    df = df.loc[:, to_keep]

    # log transform
    df = np.log(df + 1)

    # standardize
    df = (df - df.mean(axis=0)) / df.std(axis=0)

    return df


def handle(project: str):

    file_path = f'data/raw/{project}.csv'

    # read expression data
    df = pd.read_csv(file_path, index_col=0)

    # read survival data
    df_survival = pd.read_csv(f'data/raw/{project}-survival.csv', index_col=0)
    # drop samples without survival data
    df_survival = df_survival.dropna()

    df = preprocess(df)
    df_merged = pd.merge(df_survival, df, left_index=True, right_index=True)

    # save preprocessed data
    df_merged.to_csv(f'data/{project}.csv')



# handle('TCGA-ACC')
tcga_projects = get_tcga_projects()
Parallel(n_jobs=len(tcga_projects))(delayed(handle)(project) for project in tcga_projects)


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [8]:
import pandas as pd

pd.read_csv('data/TCGA-HNSC.csv')['time'].sort_values(ascending=False)

303    6417.0
331    5480.0
271    5252.0
330    5152.0
332    4856.0
        ...  
423      11.0
357      11.0
310       2.0
435       2.0
88        1.0
Name: time, Length: 519, dtype: float64