In [7]:
import pandas as pd
import numpy as np

In [9]:
# Counts file: register and download it from http://camda2017.bioinf.jku.at/doku.php/
# The original file name is "GSE49711_SEQC_NB_MAV_G_log2.20121127.txt";
# it is read here from a local copy as a tab-separated table.
expression_path = '../data/camda.txt'
data = pd.read_csv(expression_path, sep='\t')

In [10]:
# Strip the annotation columns that are not needed downstream and
# index the remaining columns by gene symbol ('#Gene').
annotation_columns = [
    '#NCBI GeneId',
    '#RefSeq transcript Id',
    '#Chromosome',
    '#Strand',
    '#from base',
    '#to base',
    '#Title',
    '#Measured object',
]
data = data.drop(columns=annotation_columns).set_index('#Gene')

In [11]:
# Phenotype file (certain row names were renamed by hand).
# Read it as comma-separated and transpose it straight away.
gs = pd.read_csv("../data/GSE49711_series_matrix.txt", sep=',').T

In [12]:
# Move the current index into a regular column so that row 0 holds every
# column label, promote that row to the header, then discard it.
gs = gs.reset_index(drop=False)
gs.columns = gs.iloc[0]
gs = (
    gs.drop(0)
      .reset_index(drop=True)
      .set_index('Sample_title')
)
gs.shape

(498, 14)

In [13]:
# Query for high risk samples: every clause below must hold for a row to be kept.
high_risk_clauses = [
    'Tissue=="tissue: neuroblastoma"',
    'Inss_stage=="inss stage: 4"',
    'High_risk=="high risk: 1"',
]
gs = gs.query(' & '.join(high_risk_clauses))

In [14]:
def _value_after_colon(field):
    """Return the stripped text that follows the first ':' in a 'key: value' string."""
    return field.split(':')[1].strip()


# ss: samples with Os_day < 730 (2 x 365) and Death_from_disease == '1'.
# ls: samples with Os_day > 2555 (7 x 365) and Death_from_disease == '0'.
# Os_day is presumably overall survival in days, judging by the column name.
ss, ls = [], []
for sample_id, row in gs.iterrows():
    survival = int(_value_after_colon(row['Os_day']))
    if survival < 730:
        # The death flag is only parsed for rows that pass a survival threshold.
        if _value_after_colon(row['Death_from_disease']) == '1':
            ss.append(sample_id)
    elif survival > 2555 and _value_after_colon(row['Death_from_disease']) == '0':
        ls.append(sample_id)

In [15]:
# Report how many samples landed in each group.
n_ss, n_ls = len(ss), len(ls)
print('len(ss): {}, len(ls): {}'.format(n_ss, n_ls))

len(ss): 42, len(ls): 19


In [16]:
# Create test-set
# Load the two gene tables from the DGE output folder (presumably the top
# up- and down-regulated genes, judging by the file names).
tp, dw = (
    pd.read_csv(csv_path, sep=',', encoding="ISO-8859-1")
    for csv_path in ('../outputs/dge/top_up.csv', '../outputs/dge/top_down.csv')
)

In [17]:
# Empty frame indexed by sample id: the `ss` samples first, then the `ls` samples.
df = pd.DataFrame({'sample_id': ss + ls}).set_index('sample_id')

In [18]:
def add_gene_columns(frame, expression, symbols):
    """Add one column per known gene symbol to ``frame`` (in place).

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to extend; its index selects which columns of ``expression``
        are copied.
    expression : pandas.DataFrame
        Table whose index holds gene symbols and whose columns are looked up
        by the labels in ``frame.index``.
    symbols : pandas.Series
        Candidate gene symbols. Null entries are skipped, and so are symbols
        that are absent from ``expression.index``.
    """
    for gene in symbols.dropna():
        if gene in expression.index:
            frame[gene] = expression.loc[gene, frame.index]


# Same procedure for both gene tables; `tp` genes are added first, so a gene
# present in both keeps the column position it got from `tp`.
add_gene_columns(df, data, tp['symbol'])
add_gene_columns(df, data, dw['symbol'])

In [19]:
# Labels follow the row order of `df` (built from ss + ls):
# 0 for every sample in `ss`, 1 for every sample in `ls`.
target = [0] * len(ss) + [1] * len(ls)

df['target'] = target

In [20]:
# Preview the first five rows of the assembled test set.
df.head(n=5)

Unnamed: 0_level_0,EVX2,NHLH2,PRSS12,POU6F2,HOXD10,MAPK15,RTL1,LGR5,DPY19L2P4,STRA6,...,NBAS,HIST1H1E,CRYAB,NXPH3,MYL3,CMYA5,AMIGO2,EDIL3,UBC,target
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SEQC_NB005,11.96,13.4,17.92,0.0,16.16,9.62,10.8,13.39,12.39,15.54,...,14.6,7.14,16.46,7.71,11.44,14.34,9.21,15.45,14.41,0
SEQC_NB013,0.0,0.0,9.99,8.97,11.17,9.08,14.82,11.0,9.7,10.78,...,14.97,7.94,15.47,14.87,9.88,10.03,15.35,13.32,15.2,0
SEQC_NB242,13.75,15.14,17.53,5.26,13.35,5.1,10.92,8.42,7.3,14.78,...,15.2,10.8,11.89,10.88,3.73,7.85,8.96,9.04,14.06,0
SEQC_NB244,7.6,11.03,10.24,6.29,8.91,9.23,16.9,7.94,10.08,11.76,...,14.51,2.93,17.14,13.84,10.59,11.09,15.92,16.97,15.37,0
SEQC_NB255,0.0,5.63,9.8,6.95,3.23,10.19,12.65,9.07,10.56,10.89,...,15.81,9.16,13.54,11.19,9.06,9.62,12.86,14.68,14.84,0


In [21]:
# Write the test set (sample ids as the first column) to disk.
test_set_path = "../outputs/camda-test-set.csv"
df.to_csv(test_set_path)