In [1]:
import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

#Visualization Tools

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from scipy.stats import ttest_ind

In [2]:
# Load the RNA-Seq expression table; the first CSV column becomes the row index.
rnaseq_df = pd.read_csv('../../data/RNASeq/SKCM_DATA_RNASeq.csv', index_col=0)

# Preprocess genomic data: flip the table so the original column labels
# become the row index, then keep only the first 12 characters of each label
# and name the index so it lines up with the clinical table's key.
X = rnaseq_df.transpose()
X.index = pd.Index([barcode[:12] for barcode in X.index], name='submitter_id')
print(X.shape)

# Remove Duplicates: after truncation several rows can share one id;
# only the first occurrence of each id is retained.
# NOTE(review): which sample survives depends purely on column order in the CSV — confirm this is intended.
is_repeated_id = X.index.duplicated(keep='first')
X = X.loc[~is_repeated_id]
print(X.shape)
X.head()

(473, 19947)
(469, 19947)


Unnamed: 0_level_0,A1BG,A2M,NAT1,NAT2,RP11-986E7.7,AADAC,AAMP,AANAT,AARS,ABAT,...,ULK4P1,WASH2P,FRG2C,ZNF605,RP11-55K22.5,RASAL2-AS1,LINC00882,FTX,TICAM2,SLC25A5-AS1
submitter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-W3-AA21,504.5047,3941.9447,39.1032,0.0,1131.3869,0.5214,3113.1387,0.5214,5009.9062,8.8634,...,0.0,950.0782,0.0,171.0115,0.0,22.4192,6.2565,11.4703,93.7696,22.4192
TCGA-ER-A19F,244.8919,16349.6542,84.4794,0.0,160.3143,0.0,2684.4794,0.0,4754.4204,11.002,...,0.3929,280.7937,1.5717,251.0806,0.0,19.0177,0.7859,19.6464,126.9548,27.112
TCGA-BF-AAP4,336.4169,17552.2154,56.5111,0.0,90.4996,0.0,3449.2219,2.457,4372.6454,147.8296,...,0.0,396.0033,1.638,158.8862,0.0,161.1548,0.819,13.104,74.009,36.036
TCGA-EE-A2MF,331.2836,47838.7504,45.9054,0.0,2724.2845,0.0,2550.0142,0.0,5274.2987,277.9824,...,0.0,296.0924,0.2834,415.4151,0.0,16.7186,7.9343,10.2012,387.7274,33.7206
TCGA-EE-A2MJ,56.9041,22344.0675,92.219,0.4117,112.3919,0.4117,2033.3471,0.8234,2368.4644,190.2017,...,0.0,659.9876,0.4117,481.6797,0.0,13.9975,9.0572,50.2264,111.7867,43.6394


In [3]:
# Load the clinical table and key it by `submitter_id` in a single chained
# expression (no in-place mutation of an already-bound frame).
clinical_df = (
    pd.read_csv('../../data/Clinical/SKCM_DATA_Clinical.csv', index_col=0)
    .set_index('submitter_id')
)
print(clinical_df.shape)
clinical_df.head()

(470, 73)


Unnamed: 0_level_0,synchronous_malignancy,ajcc_pathologic_stage,tumor_stage,days_to_diagnosis,created_datetime,last_known_disease_status,tissue_or_organ_of_origin,days_to_last_follow_up,primary_diagnosis,age_at_diagnosis,...,treatments_radiation_days_to_treatment_end,treatments_radiation_days_to_treatment_start,treatments_radiation_regimen_or_line_of_therapy,treatments_radiation_treatment_effect,treatments_radiation_initial_disease_status,treatments_radiation_treatment_anatomic_site,treatments_radiation_treatment_outcome,bcr_patient_barcode,disease,sample_type
submitter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-BF-A1PZ,No,Stage IIB,stage iib,0.0,,not reported,"Skin, NOS",853.0,"Malignant melanoma, NOS",26240.0,...,,,,,,,,TCGA-BF-A1PZ,SKCM,Primary Tumor
TCGA-HR-A5NC,No,,not reported,0.0,,not reported,"Skin, NOS",0.0,"Malignant melanoma, NOS",32872.0,...,,,,,,,,TCGA-HR-A5NC,SKCM,Primary Tumor
TCGA-EE-A183,No,Stage 0,stage 0,0.0,,not reported,"Skin, NOS",,"Malignant melanoma, NOS",17788.0,...,,,,,,,,TCGA-EE-A183,SKCM,Metastatic
TCGA-D3-A8GV,No,Not Reported,i/ii nos,0.0,,not reported,"Skin, NOS",,"Malignant melanoma, NOS",9344.0,...,,,,,,,,TCGA-D3-A8GV,SKCM,Metastatic
TCGA-W3-AA1R,No,Stage II,stage ii,0.0,,not reported,"Skin, NOS",,"Malignant melanoma, NOS",26145.0,...,,,,,,,,TCGA-W3-AA1R,SKCM,Metastatic


In [4]:
# Load the per-gene weights file; the first CSV column (gene symbol) becomes
# the index and the remaining column holds the weight.
# NOTE(review): this path is rooted at '../data' while the other inputs use '../../data' — confirm.
rf_features_df = pd.read_csv(
    '../data/Melanoma_RF_weights_all_genomic_data.csv',
    index_col=0,
)

# Preview the first ten rows of the weights table.
rf_features_df.iloc[:10]

Unnamed: 0,weights
C7,0.159056
KRT17,0.102874
CLEC2A,0.091171
S100A7A,0.064637
KRTDAP,0.060437
WFDC5,0.041753
KRT6B,0.038876
S100A7,0.024198
KRT14,0.019587
PVRL4,0.017585


In [5]:
# Binary label: primary tumour -> 0, any metastatic sample -> 1.
sample_type_codes = {'Primary Tumor': 0, 'Metastatic': 1, 'Additional Metastatic': 1}
Y = clinical_df[['sample_type']].replace(sample_type_codes)

# Rows still carrying the un-mapped 'Solid Tissue Normal' label are discarded.
Y = Y.loc[Y['sample_type'] != 'Solid Tissue Normal']

# Restrict the expression matrix to the first 139 genes of the weights table.
top_rf_genes = rf_features_df.index.values[:139]
X = X[top_rf_genes].copy()

print(X.shape)
print(Y.shape)

(469, 139)
(469, 1)


In [6]:
# Combine labels and expression values on their shared index. This is an
# inner join, so ids present in only one of the two frames are dropped.
genes_df = pd.merge(Y, X, how='inner', left_index=True, right_index=True)
print(genes_df.shape)
genes_df.head()

(468, 140)


Unnamed: 0_level_0,sample_type,C7,KRT17,CLEC2A,S100A7A,KRTDAP,WFDC5,KRT6B,S100A7,KRT14,...,AS3MT,NUDT7,AQP1,MRPL23,LGALS8,LINC00094,ADPRH,FAAH,CASK,RDH12
submitter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-BF-A1PZ,0,0.3056,12.2221,0.0,0.3056,0.0,0.0,18.9443,0.0,42.1664,...,223.9707,65.0829,632.4956,1553.7392,1288.7908,615.5649,366.053,18.0277,445.1914,33.9164
TCGA-HR-A5NC,0,51.7289,29855.5145,0.0,663.9485,2657.4995,241.0224,32441.9578,7999.2155,35849.2416,...,30.6963,86.4043,3228.7908,302.4149,427.4737,272.4235,493.4138,42.0652,1302.3169,468.9705
TCGA-EE-A183,1,2858.3243,3.2362,0.0,1.4383,0.3596,0.0,8.63,0.0,18.3387,...,3.5958,25.5304,4736.4509,1316.7925,264.5919,603.3585,645.4513,23.3729,342.3229,14.0237
TCGA-D3-A8GV,1,13.0572,33.0271,0.0,0.0,0.0,0.0,0.0,0.0,0.384,...,139.789,10.369,1110.6311,1708.1905,1049.1662,803.7136,423.5913,10.369,674.7506,1.9202
TCGA-W3-AA1R,1,27.0607,14.2425,0.0,0.0,0.0,0.0,4.9849,0.3561,4.2727,...,103.258,17.8031,255.2964,1113.0497,970.9489,380.2848,144.2051,6.0531,575.0401,9.6137


In [7]:
# Empty results table: one row per selected gene, to be filled with the
# t statistic and p-value of the per-gene test.
tested_genes = rf_features_df.index.values[:139]
result_columns = ["t_stat", "p_value"]
genes_level_exp_significance_df = pd.DataFrame(index=tested_genes, columns=result_columns)
genes_level_exp_significance_df

Unnamed: 0,t_stat,p_value
C7,,
KRT17,,
CLEC2A,,
S100A7A,,
KRTDAP,,
...,...,...
LINC00094,,
ADPRH,,
FAAH,,
CASK,,


In [8]:
# Per-gene Welch t-test (equal_var=False) comparing expression in rows with
# sample_type == 0 against rows with sample_type == 1.
# The group masks and the gene list do not change between iterations, so they
# are computed once up front instead of on every pass through the loop.
is_primary = genes_df['sample_type'] == 0
is_metastatic = genes_df['sample_type'] == 1
genes_to_test = rf_features_df.head(139).index.values

for gene in genes_to_test:
    # Single-step .loc selection avoids chained indexing (df[mask][col]).
    primary_expr = genes_df.loc[is_primary, gene]
    metastatic_expr = genes_df.loc[is_metastatic, gene]
    t_stat, p_value = ttest_ind(primary_expr, metastatic_expr, equal_var=False)
    genes_level_exp_significance_df.loc[gene] = [t_stat, p_value]

# The results frame was created empty and therefore has object dtype; convert
# to float so later comparisons, sorting and CSV export operate on real numbers.
genes_level_exp_significance_df = genes_level_exp_significance_df.astype(float)

In [9]:
# Show the first 30 rows of the per-gene test results.
genes_level_exp_significance_df.iloc[:30]

Unnamed: 0,t_stat,p_value
C7,-8.19217,3.55926e-15
KRT17,2.79105,0.00627134
CLEC2A,3.44691,0.000800437
S100A7A,3.37602,0.00104157
KRTDAP,3.73229,0.000311908
WFDC5,3.93989,0.000149162
KRT6B,2.83104,0.00558951
S100A7,4.03363,0.000106194
KRT14,2.74054,0.00724276
PVRL4,4.78387,5.71065e-06


In [10]:
# Gene symbols whose p-value is below 0.05.
# NOTE(review): no multiple-testing correction is applied in the visible cells — confirm that raw p-values are intended.
below_threshold = genes_level_exp_significance_df['p_value'] < 0.05
genes_level_exp_significance_df.loc[below_threshold].index.values

array(['C7', 'KRT17', 'CLEC2A', 'S100A7A', 'KRTDAP', 'WFDC5', 'KRT6B',
       'S100A7', 'KRT14', 'PVRL4', 'SERPINB4', 'IL20RB', 'FKBP1B',
       'ZSWIM7', 'PRG2', 'PAX1', 'ZNF653', 'MMP3', 'ZNF593', 'VDAC1',
       'ADAMTSL3', 'RGS4', 'MRPL44', 'LYSMD2', 'TDRKH', 'PLA2G2F',
       'G6PC3', 'DOCK11', 'IGF1R', 'TSPAN14', 'RARRES2', 'GSR', 'FCER1A',
       'PSMD9', 'PDK4', 'PRKRIP1', 'HMG20B', 'TAF5L', 'RAX', 'FAM98C',
       'SLC40A1', 'COQ4', 'PSTPIP2', 'SSNA1', 'MRRF', 'PITHD1',
       'ARHGAP22', 'SCN4A', 'CLIC5', 'CICP27', 'SLC9A8', 'SMTNL2', 'XKRX',
       'FAM109B', 'C1orf159', 'SNAP23', 'MIEN1', 'RNF135', 'GTF2H2C',
       'AKR1B15', 'DDX3X', 'PABPC4L', 'TBC1D13', 'SMARCAL1', 'RPS28',
       'SPSB3', 'SAMD8', 'SWI5', 'CCPG1', 'ATP12A', 'CIB2', 'LCE1F',
       'ALAD', 'MRPL23', 'FAAH', 'CASK', 'RDH12'], dtype=object)

In [11]:
# Persist the per-gene t statistics and p-values (index = gene symbol) as CSV.
significance_csv_path = "../data/genes_level_exp_significance.csv"
genes_level_exp_significance_df.to_csv(significance_csv_path)