In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN
from sklearn.feature_selection import VarianceThreshold

In [2]:
# Load the clinical table and index it by the 'id' column.
clinical_df = pd.read_csv('../data/clinical_data_skcm_MDB to joma.csv').set_index('id')

# Remove 1 entry for Solid Tissue Normal
clinical_df = clinical_df.drop(index='MDB-GN-A4U8-11')

nan_value = float("NaN")

# Turn empty strings into NaN so that dropna() below treats them as missing.
clinical_df = clinical_df.replace("", nan_value)

# Keep only rows that have a value in every one of these columns.
clinical_df = clinical_df.dropna(
    subset=[
        "age_at_initial_pathologic_diagnosis",
        "gender",
        "pathologic_M",
        "pathologic_N",
        "pathologic_T",
        "person_neoplasm_cancer_status",
        "vital_status",
        "pathologic_stage",
        "radiation_therapy",
    ]
)

# Drop columns by keyword: passing `axis` positionally (`drop(labels, 1)`) was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
clinical_df = clinical_df.drop(
    columns=[
        "days_to_death",
        "melanoma_clark_level_value",
        "interferon_90_day_prior_excision_admin_indicator",
    ]
)

print(clinical_df.shape)

clinical_df.head()

(390, 100)


Unnamed: 0_level_0,_EVENT,_INTEGRATION,_OS,_OS_IND,_PATIENT,_RFS,_RFS_IND,_TIME_TO_EVENT,_anatomical_origin,_cohort,...,_GENOMIC_ID_MDB_SKCM_GSNP6noCNV,_GENOMIC_ID_MDB_SKCM_mutation_ucsc_vcf_gene,_GENOMIC_ID_MDB_SKCM_exp_HiSeqV2,_GENOMIC_ID_MDB_SKCM_mutation_broad_gene,_GENOMIC_ID_MDB_SKCM_GSNP6raw,_GENOMIC_ID_MDB_SKCM_exp_HiSeqV2_PANCAN,_GENOMIC_ID_MDB_SKCM_hMethyl450,_GENOMIC_ID_MDB_SKCM_gistic2,_GENOMIC_ID_MDB_SKCM_exp_HiSeqV2_exon,_GENOMIC_ID_MDB_SKCM_miRNA_HiSeq
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MDB-EE-A2GE-06,0.0,MDB-EE-A2GE-06,4898.0,0.0,MDB-EE-A2GE,4898.0,0.0,4898.0,Skin,MDB Melanoma,...,09081350-d32c-45d1-a049-7cef323c4828,MDB-EE-A2GE-06A-11D-A196-08,fec63465-f926-4fce-a88a-898cac637305,MDB-EE-A2GE-06A-11D-A196-08,09081350-d32c-45d1-a049-7cef323c4828,fec63465-f926-4fce-a88a-898cac637305,MDB-EE-A2GE-06A-11D-A19B-05,MDB-EE-A2GE-06A-11D-A194-01,fec63465-f926-4fce-a88a-898cac637305,MDB-EE-A2GE-06A-11R-A18W-13
MDB-ER-A193-06,1.0,MDB-ER-A193-06,955.0,1.0,MDB-ER-A193,,,955.0,Skin,MDB Melanoma,...,28a7d729-7555-4545-924b-3dec49b54230,MDB-ER-A193-06A-12D-A197-08,33f43961-b32d-46fc-ba11-264f1101e78d,MDB-ER-A193-06A-12D-A197-08,28a7d729-7555-4545-924b-3dec49b54230,33f43961-b32d-46fc-ba11-264f1101e78d,MDB-ER-A193-06A-12D-A19C-05,MDB-ER-A193-06A-12D-A191-01,33f43961-b32d-46fc-ba11-264f1101e78d,MDB-ER-A193-06A-12R-A18V-13
MDB-EB-A3Y6-01,0.0,MDB-EB-A3Y6-01,126.0,0.0,MDB-EB-A3Y6,126.0,0.0,126.0,Skin,MDB Melanoma,...,d4de0385-6e04-4a44-9351-d0d8fc85085f,MDB-EB-A3Y6-01A-21D-A23B-08,263de7ad-d70e-4e10-b406-1d2e368a99b2,MDB-EB-A3Y6-01A-21D-A23B-08,d4de0385-6e04-4a44-9351-d0d8fc85085f,263de7ad-d70e-4e10-b406-1d2e368a99b2,MDB-EB-A3Y6-01A-21D-A23D-05,,263de7ad-d70e-4e10-b406-1d2e368a99b2,MDB-EB-A3Y6-01A-21R-A23A-13
MDB-D3-A5GO-06,0.0,MDB-D3-A5GO-06,4195.0,0.0,MDB-D3-A5GO,,,4195.0,Skin,MDB Melanoma,...,49c61e05-73b4-4c22-afb9-895a931e2d07,MDB-D3-A5GO-06A-12D-A27K-08,1d1a1d5e-fdfe-4251-93ec-c89b65bb341e,MDB-D3-A5GO-06A-12D-A27K-08,49c61e05-73b4-4c22-afb9-895a931e2d07,1d1a1d5e-fdfe-4251-93ec-c89b65bb341e,MDB-D3-A5GO-06A-12D-A27L-05,MDB-D3-A5GO-06A-12D-A27J-01,1d1a1d5e-fdfe-4251-93ec-c89b65bb341e,
MDB-BF-A3DN-01,0.0,MDB-BF-A3DN-01,717.0,0.0,MDB-BF-A3DN,717.0,0.0,717.0,Skin,MDB Melanoma,...,a06fecd1-567c-4c18-bfa2-5d058967cf04,MDB-BF-A3DN-01A-11D-A20D-08,bbdc1eec-bfa5-4be6-ac9b-80426e1f7fdd,MDB-BF-A3DN-01A-11D-A20D-08,a06fecd1-567c-4c18-bfa2-5d058967cf04,bbdc1eec-bfa5-4be6-ac9b-80426e1f7fdd,MDB-BF-A3DN-01A-11D-A211-05,,bbdc1eec-bfa5-4be6-ac9b-80426e1f7fdd,MDB-BF-A3DN-01A-11R-A20E-13


In [3]:
# Columns of clinical_df that are copied into Y.
selected_clinical_columns = [
    "sample_type",
    "age_at_initial_pathologic_diagnosis",
    "gender",
    "pathologic_M",
    "pathologic_N",
    "pathologic_T",
    "pathologic_stage",
    "percent_lymphocyte_infiltration_TOP",
    "percent_monocyte_infiltration_TOP",
    "percent_neutrophil_infiltration_TOP",
    "person_neoplasm_cancer_status",
    "radiation_therapy",
    "vital_status",
]
Y = clinical_df[selected_clinical_columns]
Y.head()

Unnamed: 0_level_0,sample_type,age_at_initial_pathologic_diagnosis,gender,pathologic_M,pathologic_N,pathologic_T,pathologic_stage,percent_lymphocyte_infiltration_TOP,percent_monocyte_infiltration_TOP,percent_neutrophil_infiltration_TOP,person_neoplasm_cancer_status,radiation_therapy,vital_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
MDB-EE-A2GE-06,Metastatic,44.0,MALE,M0,N0,T2,Stage I,0,0,0,TUMOR FREE,YES,LIVING
MDB-ER-A193-06,Metastatic,62.0,MALE,M0,N0,T3b,Stage IIB,0,0,0,WITH TUMOR,NO,DECEASED
MDB-EB-A3Y6-01,Primary Tumor,56.0,FEMALE,M0,N0,T4b,Stage IIC,0,0,0,TUMOR FREE,NO,LIVING
MDB-D3-A5GO-06,Metastatic,61.0,MALE,M0,N0,T4,Stage II,0,0,0,TUMOR FREE,NO,LIVING
MDB-BF-A3DN-01,Primary Tumor,81.0,FEMALE,M0,N3,T3b,Stage IIIC,0,0,0,TUMOR FREE,NO,LIVING


In [4]:
# Percentage of null entries in each column of Y.
percent_missing = Y.isnull().sum().mul(100).div(len(Y))

# One-column summary table, ordered from least to most missing.
missing_value_df = percent_missing.to_frame(name='percent_missing')
missing_value_df = missing_value_df.sort_values('percent_missing')
missing_value_df

Unnamed: 0,percent_missing
sample_type,0.0
age_at_initial_pathologic_diagnosis,0.0
gender,0.0
pathologic_M,0.0
pathologic_N,0.0
pathologic_T,0.0
pathologic_stage,0.0
percent_lymphocyte_infiltration_TOP,0.0
percent_monocyte_infiltration_TOP,0.0
percent_neutrophil_infiltration_TOP,0.0


In [5]:
# Load the RNA-seq table and index it by the 'id' column.
rnaseq_df = pd.read_csv('../data/MDB_SKCM_RNASeq to joma.csv').set_index('id')

# Remove 1 entry for Solid Tissue Normal
# (keyword `columns=` replaces the positional `axis=1`, which pandas 2.0 no
# longer accepts for DataFrame.drop).
rnaseq_df = rnaseq_df.drop(columns='MDB-GN-A4U8-11')

# Get Gene Expression of Patients in clinical_df
# (selects the columns labelled by clinical_df's index, in that order;
# raises KeyError if any of those labels is missing from rnaseq_df).
rnaseq_df = rnaseq_df[clinical_df.index]

print(rnaseq_df.shape)
rnaseq_df.head()

(20530, 390)


Unnamed: 0_level_0,MDB-EE-A2GE-06,MDB-ER-A193-06,MDB-EB-A3Y6-01,MDB-D3-A5GO-06,MDB-BF-A3DN-01,MDB-GF-A2C7-01,MDB-D3-A3C3-06,MDB-WE-A8ZN-06,MDB-D3-A8GI-06,MDB-D9-A4Z6-06,...,MDB-GN-A4U4-06,MDB-EE-A29B-06,MDB-FR-A44A-06,MDB-FR-A2OS-01,MDB-EB-A5VV-06,MDB-GF-A4EO-06,MDB-EE-A29V-06,MDB-EB-A44R-06,MDB-D3-A51E-06,MDB-D3-A3MR-06
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ARHGEF10L,9.4,10.42,9.44,10.6,9.42,8.29,8.49,10.35,8.35,8.36,...,7.54,9.51,10.1,8.16,7.43,10.07,9.05,9.57,9.1,9.88
HIF3A,3.02,2.91,2.62,0.886,2.81,2.05,8.69,2.48,1.87,3.66,...,3.79,3.26,0.9703,2.75,5.61,6.39,1.41,10.3,2.31,2.93
RNF17,0.0,0.0,0.0,0.0,0.0,1.19,0.8303,0.0,0.0,0.8891,...,1.65,0.0,2.63,0.0,0.0,0.4204,0.0,0.0,1.48,0.0
RNF10,11.96,12.26,11.19,11.63,11.57,11.76,12.33,11.72,11.38,12.04,...,11.73,11.98,11.36,11.3,11.97,11.75,11.76,12.12,11.57,12.21
RNF11,10.01,11.37,10.59,11.45,10.99,10.82,10.16,9.21,9.42,10.15,...,10.84,9.28,10.78,11.1,10.26,10.62,9.75,9.89,11.61,10.12


In [6]:
# Transpose so that the columns of rnaseq_df become the rows of X.
X = rnaseq_df.transpose()
X.head()

id,ARHGEF10L,HIF3A,RNF17,RNF10,RNF11,RNF13,GTF2IP1,REM1,MTVR2,RTN4RL2,...,TULP2,NPY5R,GNGT2,GNGT1,TULP3,PTRF,BCL6B,GSTK1,SELP,SELS
MDB-EE-A2GE-06,9.4,3.02,0.0,11.96,10.01,10.56,12.94,9.24,0.6512,5.07,...,0.0,0.1482,5.39,1.87,9.5,13.4,9.58,11.68,7.0,10.73
MDB-ER-A193-06,10.42,2.91,0.0,12.26,11.37,9.89,12.48,4.71,0.0,6.07,...,0.3278,1.71,4.41,0.0,9.58,13.9,6.95,11.83,7.51,11.22
MDB-EB-A3Y6-01,9.44,2.62,0.0,11.19,10.59,10.77,12.26,4.32,0.0,6.32,...,0.3533,0.0,3.49,1.66,9.07,11.4,6.73,11.73,1.66,10.71
MDB-D3-A5GO-06,10.6,0.886,0.0,11.63,11.45,10.81,12.2,3.05,0.0,6.99,...,0.0,0.0,5.01,4.04,10.06,13.63,6.25,11.36,5.4,10.43
MDB-BF-A3DN-01,9.42,2.81,0.0,11.57,10.99,11.02,12.63,5.2,0.3427,7.21,...,1.16,0.0,3.44,0.0,9.51,13.03,7.95,11.45,7.07,10.07


In [12]:
# Absolute pairwise correlation between the columns of X, stored as float16.
corr = X.corr().abs().astype('float16')

# Boolean mask that is True strictly above the main diagonal, so every pair of
# columns is represented exactly once. The built-in `bool` is used because the
# `np.bool` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)

# Entries outside the mask become NaN.
upper_tri = corr.where(upper_mask).astype('float16')

In [13]:
# Collect every column of upper_tri that has at least one entry above 0.95.
to_drop = []
for column in upper_tri.columns:
    if any(upper_tri[column] > 0.95):
        to_drop.append(column)
to_drop

['SNORD114-31',
 'SNORD114-30',
 'SNORD115-6',
 'SNORD115-4',
 'SNORD104',
 'KRTAP10-11',
 'KRTAP10-10',
 'KRTAP2-4',
 'SNORD115-9',
 'SNORD38A',
 'KRTAP19-2',
 'KRTAP19-4',
 'SNORD114-3',
 '?|441362',
 'OR5R1',
 'HBII-52-27',
 'SNORD11',
 'SNORD12',
 'SNORD21',
 'SNORD20',
 'SNORD23',
 'SNORD25',
 'SNORD127',
 'SNORD124',
 'TTTY17A',
 'SNORD115-5',
 'SNORA14A',
 'SNORD80',
 'IL2RG',
 'SNORD56B',
 'SNORD78',
 'SNORD113-7',
 'HSFX1',
 'SNORD96A',
 'KRT71',
 'SASH3',
 'SNORD119',
 'HBII-52-46',
 'HBII-52-45',
 'SNORD47',
 'SNORD4B',
 'SNORD4A',
 'CYorf15A',
 'SNORD116-24',
 'SNORD116-25',
 'SNORD116-23',
 'SNORD116-29',
 'KRTDAP',
 'DEFB121',
 'SNORD115-8',
 'SNORD67',
 'SNORD66',
 'HBII-52-28',
 'SNORD69',
 'SNORD68',
 'SNORD38B',
 'SNORD113-4',
 'SNORD116-26',
 'OR5H15',
 'OR5H14',
 'SNORD115-1',
 'SNORD58A',
 'SNORD58C',
 'OR5B17',
 'PRR20A',
 'PRR20D',
 'OR6X1',
 'OR5H1',
 'SNORD96B',
 'SNORD63',
 'SNORD116-22',
 'SNORD81',
 'MRGPRG',
 'SERPINB4',
 'IVL',
 'SPRR1A',
 'PRAMEF3',
 'SIR

In [14]:
# Number of column names collected in to_drop.
len(to_drop)

468

In [15]:
# Remove the columns listed in to_drop from X.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the columns are already gone no longer
# raises KeyError.
X = X.drop(columns=to_drop, errors='ignore')

In [18]:
# Variance cutoff computed as p * (1 - p) with p = 0.95.
threshold_n = 0.95
variance_cutoff = threshold_n * (1 - threshold_n)

# Fit the selector on X; sel_var is the transformed array it returns.
sel = VarianceThreshold(threshold=variance_cutoff)
sel_var = sel.fit_transform(X)

# Columns of X that the fitted selector keeps.
# NOTE(review): despite the name, X_low_variance appears to hold the columns
# that passed the variance threshold, not the low-variance ones — confirm naming.
kept_positions = sel.get_support(indices=True)
X_low_variance = X[X.columns[kept_positions]]

In [24]:
# Column names of X that the fitted selector did not keep.
retained_columns = X.columns[sel.get_support(indices=True)]
set(X.columns).difference(retained_columns)

{'12-Sep',
 '?|100130426',
 '?|404770',
 '?|442388',
 '?|728603',
 'AAA1',
 'ACMSD',
 'ACSM4',
 'ACTL7A',
 'ACTL9',
 'ACTRT1',
 'ACTRT2',
 'ADAM18',
 'ADAM2',
 'ADAM5P',
 'ADAM7',
 'AFM',
 'AKAP4',
 'AKR1CL1',
 'AMELY',
 'AOX2P',
 'APOBEC1',
 'ASAP1IT1',
 'ASB18',
 'ATOH1',
 'ATXN8OS',
 'BANF2',
 'BARHL1',
 'BASE',
 'BCYRN1',
 'BEYLA',
 'BHLHE23',
 'BIRC8',
 'BMP10',
 'BMP15',
 'BPIL3',
 'BSX',
 'C10orf113',
 'C10orf120',
 'C10orf40',
 'C10orf53',
 'C10orf96',
 'C11orf40',
 'C11orf94',
 'C13orf28',
 'C14orf165',
 'C14orf177',
 'C14orf183',
 'C14orf53',
 'C15orf43',
 'C16orf78',
 'C16orf92',
 'C17orf73',
 'C18orf20',
 'C18orf62',
 'C1orf100',
 'C1orf141',
 'C1orf146',
 'C1orf185',
 'C20orf173',
 'C20orf185',
 'C20orf71',
 'C20orf79',
 'C21orf131',
 'C21orf54',
 'C22orf33',
 'C22orf42',
 'C2orf27B',
 'C2orf83',
 'C3orf22',
 'C3orf24',
 'C3orf77',
 'C4orf11',
 'C4orf17',
 'C4orf35',
 'C4orf45',
 'C5orf48',
 'C6orf146',
 'C6orf221',
 'C7orf66',
 'C8orf71',
 'C8orf74',
 'C9orf144',
 'C9orf2

In [25]:
# Count the column names of X that the fitted selector did not keep.
n_removed = len(set(X.columns).difference(X.columns[sel.get_support(indices=True)]))
print(n_removed)

872


In [26]:
# Inner-join Y with X_low_variance on their row indexes.
df = pd.merge(
    Y,
    X_low_variance,
    how='inner',
    left_index=True,
    right_index=True,
)
df.head()

Unnamed: 0,sample_type,age_at_initial_pathologic_diagnosis,gender,pathologic_M,pathologic_N,pathologic_T,pathologic_stage,percent_lymphocyte_infiltration_TOP,percent_monocyte_infiltration_TOP,percent_neutrophil_infiltration_TOP,...,TULP2,NPY5R,GNGT2,GNGT1,TULP3,PTRF,BCL6B,GSTK1,SELP,SELS
MDB-EE-A2GE-06,Metastatic,44.0,MALE,M0,N0,T2,Stage I,0,0,0,...,0.0,0.1482,5.39,1.87,9.5,13.4,9.58,11.68,7.0,10.73
MDB-ER-A193-06,Metastatic,62.0,MALE,M0,N0,T3b,Stage IIB,0,0,0,...,0.3278,1.71,4.41,0.0,9.58,13.9,6.95,11.83,7.51,11.22
MDB-EB-A3Y6-01,Primary Tumor,56.0,FEMALE,M0,N0,T4b,Stage IIC,0,0,0,...,0.3533,0.0,3.49,1.66,9.07,11.4,6.73,11.73,1.66,10.71
MDB-D3-A5GO-06,Metastatic,61.0,MALE,M0,N0,T4,Stage II,0,0,0,...,0.0,0.0,5.01,4.04,10.06,13.63,6.25,11.36,5.4,10.43
MDB-BF-A3DN-01,Primary Tumor,81.0,FEMALE,M0,N3,T3b,Stage IIIC,0,0,0,...,1.16,0.0,3.44,0.0,9.51,13.03,7.95,11.45,7.07,10.07


In [27]:
# Frequency of each sample_type value in df, counting NaN as its own category.
df.loc[:, 'sample_type'].value_counts(dropna=False)

Metastatic               311
Primary Tumor             78
Additional Metastatic      1
Name: sample_type, dtype: int64

In [29]:
# Columns of df to one-hot encode.
categorical_columns = [
    "gender",
    "pathologic_M",
    "pathologic_N",
    "pathologic_T",
    "pathologic_stage",
    "person_neoplasm_cancer_status",
    "radiation_therapy",
    "vital_status",
]

# drop_first=True omits the first level of each column from the indicators.
dummy_clinical_df = pd.get_dummies(
    df[categorical_columns],
    columns=categorical_columns,
    drop_first=True,
)
dummy_clinical_df.head(10)

Unnamed: 0,gender_MALE,pathologic_M_M1,pathologic_M_M1a,pathologic_M_M1b,pathologic_M_M1c,pathologic_N_N1,pathologic_N_N1a,pathologic_N_N1b,pathologic_N_N2,pathologic_N_N2a,...,pathologic_stage_Stage IIB,pathologic_stage_Stage IIC,pathologic_stage_Stage III,pathologic_stage_Stage IIIA,pathologic_stage_Stage IIIB,pathologic_stage_Stage IIIC,pathologic_stage_Stage IV,person_neoplasm_cancer_status_WITH TUMOR,radiation_therapy_YES,vital_status_LIVING
MDB-EE-A2GE-06,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
MDB-ER-A193-06,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
MDB-EB-A3Y6-01,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
MDB-D3-A5GO-06,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
MDB-BF-A3DN-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
MDB-GF-A2C7-01,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
MDB-D3-A3C3-06,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
MDB-WE-A8ZN-06,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1
MDB-D3-A8GI-06,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
MDB-D9-A4Z6-06,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0


In [30]:
# Show the column labels of dummy_clinical_df.
dummy_clinical_df.columns

Index(['gender_MALE', 'pathologic_M_M1', 'pathologic_M_M1a',
       'pathologic_M_M1b', 'pathologic_M_M1c', 'pathologic_N_N1',
       'pathologic_N_N1a', 'pathologic_N_N1b', 'pathologic_N_N2',
       'pathologic_N_N2a', 'pathologic_N_N2b', 'pathologic_N_N2c',
       'pathologic_N_N3', 'pathologic_N_NX', 'pathologic_T_T1',
       'pathologic_T_T1a', 'pathologic_T_T1b', 'pathologic_T_T2',
       'pathologic_T_T2a', 'pathologic_T_T2b', 'pathologic_T_T3',
       'pathologic_T_T3a', 'pathologic_T_T3b', 'pathologic_T_T4',
       'pathologic_T_T4a', 'pathologic_T_T4b', 'pathologic_T_TX',
       'pathologic_T_Tis', 'pathologic_stage_Stage 0',
       'pathologic_stage_Stage I', 'pathologic_stage_Stage IA',
       'pathologic_stage_Stage IB', 'pathologic_stage_Stage II',
       'pathologic_stage_Stage IIA', 'pathologic_stage_Stage IIB',
       'pathologic_stage_Stage IIC', 'pathologic_stage_Stage III',
       'pathologic_stage_Stage IIIA', 'pathologic_stage_Stage IIIB',
       'pathologic_stage_

In [32]:
# Binary target built from sample_type: 'Primary Tumor' -> 0, both metastatic
# labels -> 1.
sample_type_codes = {'Primary Tumor': 0, 'Metastatic': 1, 'Additional Metastatic': 1}

# Series.map gives a numeric result directly, instead of relying on replace()
# silently downcasting an object column (deprecated in recent pandas).
Y = df['sample_type'].map(sample_type_codes)

# map() yields NaN for any label not in the mapping (including missing values);
# fail loudly rather than carry an incomplete target forward.
if Y.isna().any():
    unexpected_labels = sorted(df.loc[Y.isna(), 'sample_type'].astype(str).unique())
    raise ValueError(
        "Unmapped sample_type values: {}".format(unexpected_labels)
    )
Y = Y.astype('int64').to_frame()

# Feature matrix: everything except the target column. `columns=` replaces the
# positional axis argument, which pandas 2.0 no longer accepts.
X = df.drop(columns='sample_type')

# One-hot encode the categorical clinical columns; drop_first=True omits the
# first level of each column.
X = pd.get_dummies(
    X,
    columns=[
        "gender",
        "pathologic_M",
        "pathologic_N",
        "pathologic_T",
        "pathologic_stage",
        "person_neoplasm_cancer_status",
        "radiation_therapy",
        "vital_status",
    ],
    drop_first=True,
)

In [33]:
# Preview the first rows of Y.
Y.head()

Unnamed: 0,sample_type
MDB-EE-A2GE-06,1
MDB-ER-A193-06,1
MDB-EB-A3Y6-01,0
MDB-D3-A5GO-06,1
MDB-BF-A3DN-01,0


In [34]:
# Preview the first rows of X.
X.head()

Unnamed: 0,age_at_initial_pathologic_diagnosis,percent_lymphocyte_infiltration_TOP,percent_monocyte_infiltration_TOP,percent_neutrophil_infiltration_TOP,ARHGEF10L,HIF3A,RNF17,RNF10,RNF11,RNF13,...,pathologic_stage_Stage IIB,pathologic_stage_Stage IIC,pathologic_stage_Stage III,pathologic_stage_Stage IIIA,pathologic_stage_Stage IIIB,pathologic_stage_Stage IIIC,pathologic_stage_Stage IV,person_neoplasm_cancer_status_WITH TUMOR,radiation_therapy_YES,vital_status_LIVING
MDB-EE-A2GE-06,44.0,0,0,0,9.4,3.02,0.0,11.96,10.01,10.56,...,0,0,0,0,0,0,0,0,1,1
MDB-ER-A193-06,62.0,0,0,0,10.42,2.91,0.0,12.26,11.37,9.89,...,1,0,0,0,0,0,0,1,0,0
MDB-EB-A3Y6-01,56.0,0,0,0,9.44,2.62,0.0,11.19,10.59,10.77,...,0,1,0,0,0,0,0,0,0,1
MDB-D3-A5GO-06,61.0,0,0,0,10.6,0.886,0.0,11.63,11.45,10.81,...,0,0,0,0,0,0,0,0,0,1
MDB-BF-A3DN-01,81.0,0,0,0,9.42,2.81,0.0,11.57,10.99,11.02,...,0,0,0,0,0,1,0,0,0,1


In [35]:
# Hold out 30% of the rows, stratified on Y, with a fixed random_state.
split_parts = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=10,
    stratify=Y,
)
X_train, X_test, y_train, y_test = split_parts

In [36]:
# Frequency of each sample_type value in y_train, counting NaN as its own category.
y_train.loc[:, 'sample_type'].value_counts(dropna=False)

1    218
0     55
Name: sample_type, dtype: int64

In [37]:
# Frequency of each sample_type value in y_test, counting NaN as its own category.
y_test.loc[:, 'sample_type'].value_counts(dropna=False)

1    94
0    23
Name: sample_type, dtype: int64

In [38]:
# Print the shapes of X_train and y_train, one per line.
for split_frame in (X_train, y_train):
    print(split_frame.shape)

(273, 19238)
(273, 1)


In [39]:
# Print the shapes of X_test and y_test, one per line.
for split_frame in (X_test, y_test):
    print(split_frame.shape)

(117, 19238)
(117, 1)


In [40]:
# Inner-join y_train with X_train on their row indexes.
train_df = pd.merge(
    y_train,
    X_train,
    how='inner',
    left_index=True,
    right_index=True,
)
train_df.head()

Unnamed: 0,sample_type,age_at_initial_pathologic_diagnosis,percent_lymphocyte_infiltration_TOP,percent_monocyte_infiltration_TOP,percent_neutrophil_infiltration_TOP,ARHGEF10L,HIF3A,RNF17,RNF10,RNF11,...,pathologic_stage_Stage IIB,pathologic_stage_Stage IIC,pathologic_stage_Stage III,pathologic_stage_Stage IIIA,pathologic_stage_Stage IIIB,pathologic_stage_Stage IIIC,pathologic_stage_Stage IV,person_neoplasm_cancer_status_WITH TUMOR,radiation_therapy_YES,vital_status_LIVING
MDB-ER-A19T-01,0,51.0,0,0,0,11.22,7.11,0.0,11.28,11.26,...,0,0,0,0,0,0,1,1,1,0
MDB-ER-A196-01,0,64.0,0,0,0,9.17,2.87,0.0,11.76,10.81,...,0,1,0,0,0,0,0,0,0,1
MDB-GN-A26A-06,1,63.0,2,1,0,8.86,4.64,0.0,11.5,10.57,...,0,0,0,1,0,0,0,1,0,0
MDB-D3-A8GI-06,1,68.0,0,0,0,8.35,1.87,0.0,11.38,9.42,...,0,0,0,0,0,0,0,1,0,0
MDB-EE-A2A6-06,1,43.0,35,1,0,9.44,3.19,0.0,11.81,10.5,...,0,0,0,0,0,0,0,0,0,1


In [41]:
# Inner-join y_test with X_test on their row indexes.
test_df = pd.merge(
    y_test,
    X_test,
    how='inner',
    left_index=True,
    right_index=True,
)
test_df.head()

Unnamed: 0,sample_type,age_at_initial_pathologic_diagnosis,percent_lymphocyte_infiltration_TOP,percent_monocyte_infiltration_TOP,percent_neutrophil_infiltration_TOP,ARHGEF10L,HIF3A,RNF17,RNF10,RNF11,...,pathologic_stage_Stage IIB,pathologic_stage_Stage IIC,pathologic_stage_Stage III,pathologic_stage_Stage IIIA,pathologic_stage_Stage IIIB,pathologic_stage_Stage IIIC,pathologic_stage_Stage IV,person_neoplasm_cancer_status_WITH TUMOR,radiation_therapy_YES,vital_status_LIVING
MDB-ER-A2NG-06,1,43.0,3,2,0,9.71,1.05,0.0,11.83,10.02,...,0,0,0,0,0,1,0,1,0,1
MDB-ER-A3ET-06,1,64.0,0,0,0,10.13,2.78,0.0,11.53,11.08,...,0,0,0,1,0,0,0,1,0,1
MDB-WE-A8ZX-06,1,45.0,0,0,0,8.89,8.04,0.0,11.0,9.82,...,0,0,0,0,1,0,0,0,1,1
MDB-EB-A6QZ-01,0,76.0,3,0,0,10.24,1.52,0.3677,11.71,10.47,...,0,0,0,0,0,0,0,1,1,0
MDB-EE-A29B-06,1,67.0,2,1,0,9.51,3.26,0.0,11.98,9.28,...,1,0,0,0,0,0,0,1,0,0


In [44]:
# Write train_df and then test_df to CSV, index included.
for frame, csv_path in (
    (train_df, '../data/train_unresampled.csv'),
    (test_df, '../data/test_unresampled.csv'),
):
    frame.to_csv(csv_path)