In [41]:
import numpy as np
import pandas as pd
import xenaPython as xena

In [42]:
# Load the TARGET-NBL phenotype table (tab-separated file, read as ISO-8859-1).
df = pd.read_csv('../data/TARGET-NBL-Phenotype.tsv', sep='\t', encoding = "ISO-8859-1")

In [43]:
# Index the phenotype rows by 'sample_id' so that samples can be selected by label.
# NOTE: this cell cannot be re-run on its own: once 'sample_id' has become the
# index it is no longer a column, so a second set_index('sample_id') raises KeyError.
df = df.set_index('sample_id')

In [44]:
# Both survival groups share the same clinical profile; they differ only in
# vital status and in the overall-survival threshold.
COMMON_FILTER = ('diagnostic_category=="Neuroblastoma" & '
                 'inss_stage=="Stage 4" & '
                 'cog_risk_group=="High Risk"')

# Thresholds on overall_survival_time (presumably days, like the other
# "... in Days" phenotype columns -- confirm against the data dictionary).
SHORT_SURVIVAL_MAX = 730     # 2 * 365
LONG_SURVIVAL_MIN = 2555     # 7 * 365

# Short survivors: deceased, with survival time strictly below the short threshold.
df_short = df.query('{} & vital_status=="Dead" & overall_survival_time<{}'
                    .format(COMMON_FILTER, SHORT_SURVIVAL_MAX))

# Long survivors: alive, with survival time strictly above the long threshold.
df_long = df.query('{} & vital_status=="Alive" & overall_survival_time>{}'
                   .format(COMMON_FILTER, LONG_SURVIVAL_MIN))

In [45]:
# Xena hub URL passed to the xena.dataset_* calls in the cells below.
hub = "https://gdc.xenahubs.net"
# Cohort label; not referenced by any other cell visible in this notebook.
cohort = "GDC TARGET-NBL"
# Dataset identifier passed, together with `hub`, to the xena.dataset_* calls.
dataset = "TARGET-NBL.htseq_counts.tsv"

In [46]:
# Ask the hub for the sample names of the dataset (third argument left as None)
# and report how many came back.
samples = xena.dataset_samples(hub, dataset, None)
sample_count = len(samples)
print("len(samples): {}".format(sample_count))

len(samples): 151


In [47]:
# Keep, in dataset order, only the samples whose id appears in the index of
# the short-survival (ss) or long-survival (sl) phenotype selection.
short_ids = df_short.index.values
long_ids = df_long.index.values
ss = list(filter(lambda sample: sample in short_ids, samples))
sl = list(filter(lambda sample: sample in long_ids, samples))

In [48]:
# Report how many dataset samples fall into each survival group
# (ss: matched against df_short, sl: matched against df_long).
print("len(ss): {}, len(sl): {}".format(len(ss),len(sl)))

len(ss): 20, len(sl): 12


In [32]:
# Create phenotype file: the selected clinical fields for the short-survival
# samples followed by the long-survival samples.
columns = [
    'MKI',
    'diagnostic_category',
    'inss_stage',
    'cog_risk_group',
    'Event Free Survival Time in Days',
    'overall_survival_time',
    'Age at Diagnosis in Days',
    'Year of Diagnosis',
    'Year of Last Follow Up',
    'vital_status',
]
pheno = pd.concat([df.loc[sample_ids, columns] for sample_ids in (ss, sl)])

pheno.to_csv("../outputs/phenotype.tsv", sep='\t')

In [13]:
# List the fields of the dataset on the hub. The result is used below as the
# probe list: its length drives the batching and its slices select each batch.
probes = xena.dataset_field(hub, dataset)

In [14]:
# Because of connection timeout, fetch the values of probes by batch
def divide_probes_to_batches(num_probes, limit=10000):
    """Split ``num_probes`` items into consecutive batch sizes of at most ``limit``.

    Parameters
    ----------
    num_probes : int
        Total number of probes to fetch.
    limit : int, optional
        Maximum number of probes per batch (default 10000).

    Returns
    -------
    list of int
        Batch sizes that sum to ``num_probes``: as many full batches of
        ``limit`` as fit, followed by one smaller batch for any remainder.
        An empty list is returned when there is nothing to fetch, so callers
        never issue a request for zero probes.

    Raises
    ------
    ValueError
        If ``limit`` is not a positive integer.
    """
    if limit < 1:
        raise ValueError("limit must be a positive integer, got {!r}".format(limit))
    if num_probes <= 0:
        return []

    full_batches, remainder = divmod(num_probes, limit)
    batches = [limit] * full_batches
    if remainder:
        batches.append(remainder)
    return batches

In [15]:
# Fetch counts from xena db
def dataset_fetch(probes, samples, prefix=''):
    """Fetch the values of ``probes`` for ``samples`` and return them as a DataFrame.

    The request goes to the module-level ``hub`` / ``dataset`` through
    ``xena.dataset_fetch``. The result is laid out with one row per probe and
    one column per sample, which is how the index and columns are assigned
    below (pandas raises if the returned shape does not match).

    Parameters
    ----------
    probes : sequence of str
        Probe identifiers to fetch; they become the row index, named 'Probes'.
    samples : sequence of str
        Sample identifiers to fetch.
    prefix : str, optional
        When non-empty, columns are labelled ``prefix + "1"``, ``prefix + "2"``,
        ... in the order of ``samples`` instead of by the sample identifiers.

    Returns
    -------
    pandas.DataFrame
        Values indexed by probe, one column per requested sample.
    """
    counts = xena.dataset_fetch(hub, dataset, samples, probes)
    df = pd.DataFrame.from_records(counts)

    # Build the index together with its name. Naming the old index first and
    # replacing it afterwards would silently drop the name.
    df.index = pd.Index(probes, name='Probes')

    if prefix != '':
        column_labels = [prefix + str(i + 1) for i in range(len(samples))]
    else:
        column_labels = samples
    df.columns = column_labels

    return df

In [16]:
# Download the values batch by batch for both sample groups.
# The per-batch frames are collected in lists and concatenated once at the end:
# calling pd.concat inside the loop re-copies every previously fetched row on
# each iteration, and concatenating onto an empty DataFrame is deprecated in
# recent pandas versions.
batches = divide_probes_to_batches(len(probes),limit=2000)
ss_frames = []
sl_frames = []
i = 0
for batch in batches:
    batch_probes = probes[i:i+batch]
    # fetch ss batch probes counts
    ss_frames.append(dataset_fetch(batch_probes,ss,prefix='ss'))
    # fetch sl batch probes counts
    sl_frames.append(dataset_fetch(batch_probes,sl,prefix='sl'))
    i += batch

# pd.concat raises on an empty list, so fall back to an empty frame when no
# batch was fetched.
df_ss_counts = pd.concat(ss_frames) if ss_frames else pd.DataFrame()
df_sl_counts = pd.concat(sl_frames) if sl_frames else pd.DataFrame()

In [53]:
# Place the ss and sl columns side by side, keeping only the rows present in
# both tables (inner join on the index), and name the index 'ensembl_Id'.
df_ss_sl = (
    pd.concat([df_ss_counts, df_sl_counts], axis=1, join='inner')
    .rename_axis('ensembl_Id')
)
# Drop the last 6 rows (they are not counts). This is positional, so it
# assumes those rows always come last in the fetched probe order.
df_ss_sl = df_ss_sl.drop(index=df_ss_sl.index[-6:])
df_ss_sl.to_csv("../outputs/log-counts.tsv", sep='\t')

In [61]:
def valuation_formula(x):
    """Map every value ``v`` to ``round(2**v) - 1``.

    This inverts ``v = log2(count + 1)`` for integer counts.

    Parameters
    ----------
    x : pandas.Series or mutable sequence of numbers
        The values to convert, e.g. one row handed over by ``DataFrame.apply``.

    Returns
    -------
    pandas.Series or the input sequence
        For a Series, a new Series with the same index and name; the input is
        left untouched, because mutating the object passed in by
        ``DataFrame.apply`` is not supported by pandas. NaN values stay NaN.
        For any other mutable sequence (e.g. a list), the sequence is updated
        in place and returned.
    """
    if isinstance(x, pd.Series):
        # Work on the underlying values: integer keys on a label-indexed
        # Series (x[i] = ...) rely on a deprecated positional fallback.
        # np.rint rounds half to even, the same rule as the built-in round().
        values = np.rint(np.exp2(x.to_numpy(dtype=float))) - 1
        return pd.Series(values, index=x.index, name=x.name)

    for i, v in enumerate(x):
        x[i] = round(2**v) - 1
    return x

In [62]:
def norm_counts_to_raw(df):
    """Convert a frame of log2(count + 1) values back to counts.

    Every cell ``v`` becomes ``round(2**v) - 1`` (rounding half to even). The
    whole frame is converted in one vectorised step rather than row by row.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric values to convert; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame of floats with the same index and columns as ``df``.
    """
    return np.rint(np.exp2(df)) - 1

In [63]:
# Reload the exported log-count table, indexed by 'ensembl_Id'.
# NOTE: from here on `df` holds this table, not the phenotype table loaded at
# the top of the notebook.
df = pd.read_csv(
    '../outputs/log-counts.tsv',
    sep='\t',
    encoding="ISO-8859-1",
    index_col='ensembl_Id',
)

In [64]:
# Preview the first rows of the reloaded table before conversion.
df.head()

Unnamed: 0_level_0,ss1,ss2,ss3,ss4,ss5,ss6,ss7,ss8,ss9,ss10,...,sl3,sl4,sl5,sl6,sl7,sl8,sl9,sl10,sl11,sl12
ensembl_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003.13,11.92,10.51,11.16,10.56,10.1,10.13,10.66,10.44,10.13,8.185,...,11.11,10.01,10.61,11.53,9.783,10.68,9.028,10.2,10.2,10.68
ENSG00000000005.5,2.585,0.0,0.0,3.459,4.17,2.0,4.087,1.585,3.907,2.322,...,2.322,1.585,3.322,4.17,0.0,3.322,2.585,1.585,3.0,7.476
ENSG00000000419.11,10.33,12.05,10.42,11.29,10.27,10.62,10.29,10.77,11.54,11.32,...,11.04,10.4,10.51,10.69,9.087,10.54,9.889,10.21,10.73,10.66
ENSG00000000457.12,10.67,9.39,9.931,10.89,8.986,10.08,9.801,9.674,10.18,9.901,...,8.907,9.449,10.5,10.83,9.581,10.51,8.96,9.653,9.925,9.633
ENSG00000000460.15,8.555,8.248,9.382,10.61,8.629,9.132,9.276,9.017,9.353,9.917,...,8.524,6.895,9.986,9.276,9.098,8.957,6.781,8.762,8.755,8.644


In [65]:
# Convert the reloaded values with norm_counts_to_raw.
# NOTE: this cell rebinds `df` to its own output, so running it a second time
# applies the conversion twice; re-run the read_csv cell first.
df = norm_counts_to_raw(df)

In [66]:
# Save the converted table. The file is tab-separated despite its .csv
# extension, so readers must pass a tab separator.
df.to_csv("../outputs/raw-counts.csv", sep='\t')

In [67]:
# Preview the first rows of the converted table.
df.head()

Unnamed: 0_level_0,ss1,ss2,ss3,ss4,ss5,ss6,ss7,ss8,ss9,ss10,...,sl3,sl4,sl5,sl6,sl7,sl8,sl9,sl10,sl11,sl12
ensembl_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003.13,3874.0,1457.0,2287.0,1509.0,1096.0,1120.0,1617.0,1388.0,1120.0,290.0,...,2209.0,1030.0,1562.0,2956.0,880.0,1640.0,521.0,1175.0,1175.0,1640.0
ENSG00000000005.5,5.0,0.0,0.0,10.0,17.0,3.0,16.0,2.0,14.0,4.0,...,4.0,2.0,9.0,17.0,0.0,9.0,5.0,2.0,7.0,177.0
ENSG00000000419.11,1286.0,4239.0,1369.0,2503.0,1234.0,1573.0,1251.0,1745.0,2977.0,2556.0,...,2105.0,1350.0,1457.0,1651.0,543.0,1488.0,947.0,1183.0,1697.0,1617.0
ENSG00000000457.12,1628.0,670.0,975.0,1897.0,506.0,1081.0,891.0,816.0,1159.0,955.0,...,479.0,698.0,1447.0,1819.0,765.0,1457.0,497.0,804.0,971.0,793.0
ENSG00000000460.15,375.0,303.0,666.0,1562.0,395.0,560.0,619.0,517.0,653.0,966.0,...,367.0,118.0,1013.0,619.0,547.0,496.0,109.0,433.0,431.0,399.0


In [72]:
# Start the sample-annotation table with one row per column of `df`.
t_df = pd.DataFrame({'id': df.columns})

In [73]:
# Give every column id its condition label and original sample id.
# Ids starting with "ss" get condition 'S' and take the next entry of `ss`;
# all other ids get condition 'L' and take the next entry of `sl`.
# NOTE: `samples` is rebound here and no longer holds the dataset sample list.
group_ids = {'S': ss, 'L': sl}
cursor = {'S': 0, 'L': 0}
condition = []
samples = []
for rep in t_df['id']:
    group = 'S' if rep.startswith("ss") else 'L'
    condition.append(group)
    samples.append(group_ids[group][cursor[group]])
    cursor[group] += 1
i, j = cursor['S'], cursor['L']

In [74]:
# Attach the condition labels and the original sample ids as new columns.
t_df = t_df.assign(conditionName=condition, sampleId=samples)

In [75]:
# This file contains the conditionName variable needed for the DGE analysis.
# It is written tab-separated, ISO-8859-1 encoded, without the row index.
t_df.to_csv('../outputs/pData.txt', sep='\t', encoding="ISO-8859-1", index=False)