In [1]:
import os
import pandas as pd
import numpy as np
from dotenv import load_dotenv
load_dotenv('.env')
load_dotenv('../.env')
from sksurv.util import Surv

# def helper_get_training_genes():
#     # get a list of 996 genes used for training RNA-Seq models
#     df = pd.read_csv(os.environ.get("GENEEXPRESSIONFILE"),sep='\t')
#     gene_ids = [s.split('Feature_exp_')[1] for s in df.filter(regex='Feature_exp').columns]
#     return gene_ids

# gene_ids = helper_get_training_genes()

def parse_global_clinsurv_df():
    """Load the external clinical/survival table, indexed by patient.

    Reads the comma-separated file whose path is stored in the
    ``EXTERNALCLINDATAFILE`` environment variable, orders the rows by the
    ``Patient`` column, and exposes that column as the ``PUBLIC_ID`` index.

    Returns
    -------
    pandas.DataFrame
        All remaining columns of the file, indexed by ``PUBLIC_ID``.
    """
    clin_path = os.environ.get("EXTERNALCLINDATAFILE")
    raw_table = pd.read_csv(clin_path, sep=',')
    ordered = raw_table.sort_values(by='Patient')
    relabelled = ordered.rename(columns={'Patient':'PUBLIC_ID'})
    return relabelled.set_index('PUBLIC_ID')

# Loaded once when this cell runs; parse_global_clindf and parse_surv_helper
# read this notebook-level frame rather than re-reading the file.
global_clinsurv_df = parse_global_clinsurv_df()

def parse_global_clindf():
    """Build the clinical feature table for all external studies.

    Starts from the notebook-level ``global_clinsurv_df`` and keeps the
    ``Study``, ``D_Age``, ``D_Gender`` and ``D_ISS`` columns. ``D_ISS`` is
    one-hot encoded, ``D_Gender`` is encoded as ``Male`` -> 1,
    ``Female`` -> -1 and anything else -> 0, and the ``D_`` columns are
    renamed for consistency with the CoMMpass clinical column names.

    Returns
    -------
    pandas.DataFrame
        Same index as ``global_clinsurv_df`` (``PUBLIC_ID``), with the
        ``Study`` column plus::

            Feature_clin_D_PT_age
            Feature_clin_D_PT_iss_<stage>   (one per ISS value present,
                                             e.g. iss_1, iss_2, iss_3)
            Feature_clin_D_PT_male
    """
    # convert_dtypes() only accepts boolean switches, not a (column, dtype)
    # pair; the previous positional arguments were merely truthy flags, so
    # calling it with its defaults is the same operation. Presumably the
    # nullable dtypes are wanted so that ISS stages yield dummy names such as
    # D_ISS_1 rather than D_ISS_1.0 -- confirm against the data.
    clin = global_clinsurv_df[['Study','D_Age','D_Gender','D_ISS']].convert_dtypes()

    gender_code = {'Male': 1, 'Female': -1}
    clin = pd.get_dummies(clin,columns=['D_ISS'],dtype=int)\
        .assign(D_male=lambda frame: frame['D_Gender'].map(gender_code).fillna(0).astype(int))\
        .drop(columns='D_Gender')

    # consistency with CoMMpass clinical column names; the patterns are plain
    # substrings, so regex=False pins the same result on every pandas version
    clin.columns = clin.columns\
        .str.replace('D_','Feature_clin_D_PT_',regex=False)\
        .str.replace('Age','age',regex=False)\
        .str.replace('ISS','iss',regex=False)
    return clin

# Built once when this cell runs; parse_clin_helper filters this frame by study.
global_clindf=parse_global_clindf()

def parse_clin_helper(studyname):
    """Return the clinical feature rows belonging to one study.

    Selects the rows of the notebook-level ``global_clindf`` whose ``Study``
    value equals ``studyname`` and drops the ``Study`` column from the result.
    """
    study_filter = f"Study==\"{studyname}\""
    study_rows = global_clindf.query(study_filter)
    return study_rows.drop(columns='Study')

def parse_surv_helper(studyname,endpoint):
    """Build the survival structured array for one study and endpoint.

    ``endpoint`` is upper-cased and used to pick the ``D_<ENDPOINT>_FLAG``
    (event indicator) and ``D_<ENDPOINT>`` (time) columns of the
    notebook-level ``global_clinsurv_df`` for rows whose ``Study`` equals
    ``studyname``. The result is the structured array required by the
    sksurv linear models' fit function.
    """
    ENDPOINT = str.upper(endpoint)
    event_col, time_col = f'D_{ENDPOINT}_FLAG', f'D_{ENDPOINT}'
    in_study = global_clinsurv_df.query(f"Study==\"{studyname}\"")
    surv_columns = in_study[[event_col, time_col]]
    return Surv.from_dataframe(event_col, time_col, surv_columns)

# def parse_exp_helper(dotenvfilename):
#     # used for calculating indices
#     try: # GSE24080UAMS or HOVON65
#         df = pd.read_csv(os.environ.get(dotenvfilename),sep=',').sort_values('Accession')
#     except: # EMTAB-4032
#         df = pd.read_csv(os.environ.get(dotenvfilename),sep='\t').sort_values('Accession')
#     df = df\
#         .rename(columns={'Accession':'PUBLIC_ID'})\
#         .set_index('PUBLIC_ID')
#     gene_id_hits = [g for g in df.columns if g in gene_ids] # 904 genes
#     gene_id_miss = [g for g in gene_ids if g not in gene_id_hits] # 92 genes
#     df_hits = df[gene_id_hits]
#     df_miss = pd.DataFrame(pd.NA,index=df.index,columns=gene_id_miss)
#     df = pd.concat([df_hits,df_miss],axis=1)[gene_ids]
#     df = df.rename(columns=lambda x: f'Feature_exp_{x}')
#     return df

# for VAE RNA-Seq model
def parse_clin_uams():
    """Clinical features for the ``GSE24080UAMS`` study (via parse_clin_helper)."""
    study = "GSE24080UAMS"
    return parse_clin_helper(study)
def parse_clin_hovon():
    """Clinical features for the ``HOVON65`` study (via parse_clin_helper)."""
    study = "HOVON65"
    return parse_clin_helper(study)
def parse_clin_emtab():
    """Clinical features for the ``EMTAB4032`` study (via parse_clin_helper)."""
    study = "EMTAB4032"
    return parse_clin_helper(study)

def parse_exp_uams(genes, level="ensembl"):
    """Expression frame for the UAMS cohort.

    Forwards to ``parse_exp_helper`` with the ``UAMSDATAFILE`` environment
    key. ``parse_exp_helper`` requires the gene list, so it is taken here and
    passed through; the previous zero-argument form always raised TypeError.

    Parameters
    ----------
    genes : list of str
        Gene IDs to request from ``parse_exp_helper``.
    level : str, default "ensembl"
        ID system forwarded to ``parse_exp_helper``.
    """
    return parse_exp_helper("UAMSDATAFILE", genes, level=level)
def parse_exp_hovon(genes, level="ensembl"):
    """Expression frame for the HOVON cohort.

    Forwards to ``parse_exp_helper`` with the ``HOVONDATAFILE`` environment
    key. ``parse_exp_helper`` requires the gene list, so it is taken here and
    passed through; the previous zero-argument form always raised TypeError.

    Parameters
    ----------
    genes : list of str
        Gene IDs to request from ``parse_exp_helper``.
    level : str, default "ensembl"
        ID system forwarded to ``parse_exp_helper``.
    """
    return parse_exp_helper("HOVONDATAFILE", genes, level=level)
def parse_exp_emtab(genes, level="ensembl"):
    """Expression frame for the EMTAB cohort.

    Forwards to ``parse_exp_helper`` with the ``EMTABDATAFILE`` environment
    key. ``parse_exp_helper`` requires the gene list, so it is taken here and
    passed through; the previous zero-argument form always raised TypeError.

    Parameters
    ----------
    genes : list of str
        Gene IDs to request from ``parse_exp_helper``.
    level : str, default "ensembl"
        ID system forwarded to ``parse_exp_helper``.
    """
    return parse_exp_helper("EMTABDATAFILE", genes, level=level)

def parse_surv_uams(endpoint):
    """Survival data for ``GSE24080UAMS``, transposed into per-field tuples.

    Unpacks the records returned by ``parse_surv_helper`` and zips them, so
    the result is a list with one tuple per record field (presumably event
    flags first, then times, following the column order given to
    ``Surv.from_dataframe`` -- confirm).
    """
    records = parse_surv_helper("GSE24080UAMS",endpoint)
    per_field = zip(*records)
    return list(per_field)
def parse_surv_hovon(endpoint):
    """Survival data for ``HOVON65``, transposed into per-field tuples.

    Unpacks the records returned by ``parse_surv_helper`` and zips them, so
    the result is a list with one tuple per record field (presumably event
    flags first, then times, following the column order given to
    ``Surv.from_dataframe`` -- confirm).
    """
    records = parse_surv_helper("HOVON65",endpoint)
    per_field = zip(*records)
    return list(per_field)
def parse_surv_emtab(endpoint):
    """Survival data for ``EMTAB4032``, transposed into per-field tuples.

    Unpacks the records returned by ``parse_surv_helper`` and zips them, so
    the result is a list with one tuple per record field (presumably event
    flags first, then times, following the column order given to
    ``Surv.from_dataframe`` -- confirm).
    """
    records = parse_surv_helper("EMTAB4032",endpoint)
    per_field = zip(*records)
    return list(per_field)

# for PCA RNA-Seq model
def parse_exp_pc_uams():
    """Load the UAMS expression table used by the PCA RNA-Seq model.

    Reads the tab-separated file named by the ``UAMSPCGENEEXPRESSIONFILE``
    environment variable, using its first column as the index.
    """
    pc_path = os.environ.get("UAMSPCGENEEXPRESSIONFILE")
    pc_table = pd.read_csv(pc_path, sep='\t', index_col=0)
    return pc_table
    
def parse_exp_pc_hovon():
    """Load the HOVON expression table used by the PCA RNA-Seq model.

    Reads the tab-separated file named by the ``HOVONPCGENEEXPRESSIONFILE``
    environment variable, using its first column as the index.
    """
    pc_path = os.environ.get("HOVONPCGENEEXPRESSIONFILE")
    pc_table = pd.read_csv(pc_path, sep='\t', index_col=0)
    return pc_table
    
def parse_exp_pc_emtab():
    """Load the EMTAB expression table used by the PCA RNA-Seq model.

    Reads the tab-separated file named by the ``EMTABPCGENEEXPRESSIONFILE``
    environment variable, using its first column as the index.
    """
    pc_path = os.environ.get("EMTABPCGENEEXPRESSIONFILE")
    pc_table = pd.read_csv(pc_path, sep='\t', index_col=0)
    return pc_table

In [2]:
def helper_get_training_genes(endpoint,shuffle,fold,
                              scratchdir="/scratch/users/nus/e1083772/cancer-survival-ml/data/splits"):
    """Return the Ensembl gene IDs present in one split's feature table.

    Reads ``<scratchdir>/<shuffle>/<fold>/valid_features_processed.parquet``
    and collects the ``ENSG...`` suffix of every column whose name contains
    ``Feature_exp``.

    Parameters
    ----------
    endpoint : str
        Currently unused; kept so existing calls keep working.
    shuffle, fold : int or str
        Sub-directory names identifying the split.
    scratchdir : str, optional
        Root directory of the splits. Defaults to the location that was
        previously hard-coded, so existing calls behave as before.

    Returns
    -------
    list of str
        Gene IDs in column order. ``Feature_exp`` columns that do not carry
        an ``ENSG`` identifier are skipped instead of appearing as NaN.
    """
    features_file=f'{scratchdir}/{shuffle}/{fold}/valid_features_processed.parquet'
    features=pd.read_parquet(features_file)
    exp_columns = features.filter(regex='Feature_exp').columns
    # str.extract yields NaN where the pattern does not match; drop those so
    # the gene list only ever contains real identifiers
    extracted = exp_columns.str.extract('.*Feature_exp_(ENSG.*)$').iloc[:,0]
    return extracted.dropna().tolist()

In [3]:
def geo_mean(iterable):
    """Geometric mean of the values in ``iterable``.

    The mean is computed in log space, ``exp(mean(log(x)))``, because the
    direct product silently wraps around for integer input and overflows to
    ``inf`` for long or large float input.

    Parameters
    ----------
    iterable : array-like of numbers
        Values to average; flattened before use.

    Returns
    -------
    float
        The geometric mean. ``nan`` for empty input (previously a
        ZeroDivisionError). ``0.0`` if any value is zero and none is negative.
    """
    values = np.asarray(iterable, dtype=float).ravel()
    if values.size == 0:
        return np.nan
    if (values < 0).any():
        # log is undefined for negatives; keep the direct product/root form
        # so such inputs give the same result as before
        return values.prod() ** (1.0 / values.size)
    # log(0) = -inf is intended here (result 0.0), so silence that warning
    with np.errstate(divide='ignore'):
        return np.exp(np.log(values).mean())

In [4]:
def parse_exp_helper(dotenvfilename,genes,level="ensembl"):
    """Build a sample-by-gene expression frame restricted to ``genes``.

    Parameters
    ----------
    dotenvfilename : str
        Name of the environment variable that holds the expression file path.
    genes : list of str
        Ensembl gene IDs wanted in the output.
    level : {"ensembl", "entrez", "affy"}, default "ensembl"
        ID system used by the expression file:

        * ``"ensembl"`` -- file has an ``Accession`` column (samples) and one
          column per Ensembl gene; comma-separated, with a tab-separated
          fallback.
        * ``"entrez"`` -- file's first column is treated as Entrez gene IDs,
          mapped to Ensembl through the notebook-level ``ref`` table.
        * ``"affy"`` -- file's first column is treated as
          ``affy_hg_u133_plus_2`` probe IDs, mapped through ``ref``; probes of
          the same Ensembl gene are combined with the notebook-level
          ``geo_mean``.

    Returns
    -------
    pandas.DataFrame
        Rows are samples. For ``"ensembl"`` the columns follow ``genes`` and
        are prefixed ``Feature_exp_``; for ``"entrez"`` they follow ``genes``
        without prefix; for ``"affy"`` they are the mapped genes followed by
        the unmapped ones (not reordered to ``genes``). Genes absent from the
        file are all-NA columns.

    Raises
    ------
    KeyError
        If ``ref`` (entrez/affy) or ``geo_mean`` (affy) is not defined at
        notebook scope.
    Exception
        If ``level`` is not one of the supported ID systems.
    """
    if level=="ensembl":
        # ensembl gene IDs; used for calculating indices
        path = os.environ.get(dotenvfilename)
        try: # comma-separated files: GSE24080UAMS or HOVON65
            df = pd.read_csv(path,sep=',').sort_values('Accession')
        except (KeyError, pd.errors.ParserError): # tab-separated file: EMTAB-4032
            # a tab-separated file read with sep=',' has no 'Accession' column
            # (or fails to parse), which is the only case worth retrying
            df = pd.read_csv(path,sep='\t').sort_values('Accession')
        df = df.rename(columns={'Accession':'PUBLIC_ID'}).set_index('PUBLIC_ID')
        wanted = set(genes)
        gene_id_hits = [g for g in df.columns if g in wanted]
        found = set(gene_id_hits)
        gene_id_miss = [g for g in genes if g not in found]
        df_hits = df[gene_id_hits]
        df_miss = pd.DataFrame(pd.NA,index=df.index,columns=gene_id_miss)
        df = pd.concat([df_hits,df_miss],axis=1)[genes]
        # guard the columns[0] probe so an empty gene list does not raise
        if len(df.columns) > 0 and not df.columns[0].startswith('Feature_exp'):
            df = df.rename(columns=lambda x: f'Feature_exp_{x}')
        return df
    elif level=="entrez":
        # the ID conversion table is only needed for the mapped branches
        ref = globals()['ref']
        assert ref is not None
        train_ref = ref[ref.ensembl_gene_id.isin(genes)][['ensembl_gene_id','entrezgene_id']].drop_duplicates()
        df = pd.read_csv(os.environ.get(dotenvfilename),index_col=0)
        df.index.name = 'entrezgene_id'
        df = df.reset_index()
        df_ensg = df.merge(train_ref,on='entrezgene_id').drop_duplicates().drop(columns=['entrezgene_id']).set_index('ensembl_gene_id').transpose()
        df_ensg_missing = pd.DataFrame(pd.NA, index=df_ensg.index, columns=train_ref.ensembl_gene_id[~train_ref.ensembl_gene_id.isin(df_ensg.columns)])
        df_ensg_full = pd.concat([df_ensg, df_ensg_missing],axis=1)[genes]
        return df_ensg_full
    elif level=="affy":
        ref = globals()['ref']
        geo_mean = globals()['geo_mean']
        assert ref is not None
        assert geo_mean is not None
        # read the file and genes that were passed in, not the notebook-level
        # "UAMSDATAFILE" / train_genes the branch used to be wired to
        df = pd.read_csv(os.environ.get(dotenvfilename),index_col=0)
        df.index.name='affy_hg_u133_plus_2'
        train_ref = ref[ref.ensembl_gene_id.isin(genes)][['ensembl_gene_id','affy_hg_u133_plus_2']].drop_duplicates()
        df_ensg = df.merge(train_ref,on='affy_hg_u133_plus_2').drop_duplicates().drop(columns=['affy_hg_u133_plus_2'])
        public_ids = df_ensg.filter(regex='^(?!ensembl_gene_id)').columns
        # several probes can map to one gene: collapse them per sample
        df_ensg = df_ensg.groupby('ensembl_gene_id')[public_ids].agg(geo_mean)
        df_ensg = df_ensg.transpose()
        df_ensg.index.name='PUBLIC_ID'
        df_ensg_missing = pd.DataFrame(pd.NA, index=df_ensg.index, columns=train_ref.ensembl_gene_id[~train_ref.ensembl_gene_id.isin(df_ensg.columns)])
        df_ensg_full = pd.concat([df_ensg, df_ensg_missing],axis=1)
        return df_ensg_full
    else:
        raise Exception(f"{level} is not a supported ID system")

In [5]:
# conversion between ensgid, entrez gene id, and affy probe id
# parse_exp_helper looks this table up by the notebook-level name `ref`.
# NOTE(review): absolute, user-specific path -- the notebook will not run
# elsewhere; consider reading the location from the environment like the
# other input files.
ref = pd.read_csv('/home/users/nus/e1083772/cancer-survival-ml/data/reference/ensembl_entrez_affy.csv',index_col=0).convert_dtypes()

In [6]:
# Gene list taken from shuffle 0 / fold 0; the 'os' endpoint argument is not
# used by helper_get_training_genes.
train_genes = helper_get_training_genes('os',0,0)

In [7]:
# UAMS expression matched directly on Ensembl gene ID columns.
uams_exp = parse_exp_helper("UAMSDATAFILE",level="ensembl",genes=train_genes)

In [8]:
# Tally gene columns by whether every sample is missing (True = entirely NA).
uams_exp.isna().all().value_counts()

True     550
False    163
Name: count, dtype: int64

In [6]:
# UAMS expression mapped from Entrez gene IDs to Ensembl through `ref`.
uams_exp_entrez = parse_exp_helper("UAMSDATAFILE",level="entrez",genes=train_genes)

In [9]:
# Tally gene columns by whether every sample is missing (True = entirely NA).
uams_exp_entrez.isna().all().value_counts()

True     541
False    173
Name: count, dtype: int64

In [9]:
# UAMS expression mapped from Affymetrix probe IDs to Ensembl through `ref`,
# with probes of the same gene combined by geo_mean.
uams_exp_affy = parse_exp_helper("UAMSDATAFILE",level="affy",genes=train_genes)

In [10]:
# Tally gene columns by whether every sample is missing (True = entirely NA).
# NOTE(review): the recorded output shows every column entirely NA, i.e. no
# probe ID matched -- check that the input file is actually keyed by probe ID.
uams_exp_affy.isna().all().value_counts()

True    1001
Name: count, dtype: int64

## Todo

1. Compare which ID system recovers the most training genes: Entrez gene level, Affymetrix probe level, or my previous Ensembl gene-level matching.

2. Write a function that returns a PUBLIC_ID-by-ENSG data frame for the training genes, with NA columns replaced by 0.

3. Either train new gene-expression VAEs on only those 100-200 genes, or use the full data frame with the existing VAE.

In [None]:
# NOTE(review): df_ensg is only assigned as a local variable inside
# parse_exp_helper; no cell shown here defines it at notebook scope, so this
# cell likely raises NameError on a fresh kernel -- confirm or remove.
df_ensg