In [3]:
##importing packages 
import numpy as np
import csv
import plotly.plotly as py
import plotly.graph_objs as go
import plotly
from plotly.graph_objs import Scatter, Layout


In [4]:
##functions 
#function for reading data
def readFile(entity, strip=None):
    '''Reads a tab-separated text file into a NumPy array.

    Each line of the file is stripped with ``str.strip(strip)`` and then split
    on tab characters; the resulting list of rows is converted with
    ``np.asarray``.

    Parameters
    ----------
    entity : path of the file to open (opened in text read mode).
    strip : characters passed to ``str.strip``; ``None`` strips whitespace.

    Returns
    -------
    The array built from the per-line lists of fields.
    '''
    rows = []
    with open(entity, 'r') as handle:
        for line in handle:
            # strip first, then split, so trailing newlines never end up in a field
            fields = line.strip(strip).split('\t')
            rows.append(fields)
    return np.asarray(rows)


In [5]:
#function for matching data
def match(seq1, seq2):
    '''Finds the index locations of seq1 in seq2.

    For every element of ``seq1`` that occurs in ``seq2``, the first position
    reported by ``np.nonzero(seq2 == element)`` is collected, in ``seq1``
    order. Elements of ``seq1`` that do not occur in ``seq2`` are skipped, so
    the result can be shorter than ``seq1``.
    '''
    positions = []
    for item in seq1:
        if item in seq2:
            # first hit along the leading axis of seq2
            first_hit = np.nonzero(seq2 == item)[0][0]
            positions.append(first_hit)
    return positions


In [6]:
#function for analyzing data
def analyze(testLabels,trainLabels,data,survival,repetingTimes):
    
    #printout test label
    print('Test sets :\n')
    print(testLabels)
    print('\n')

    #printout train label
    print('Train sets :\n')
    print(trainLabels)
    print('\n')

    #printout input data of target set
    print('Input sets :\n')
    print(data)
    print('\n')

    #printout output data of target set
    print('Output sets :\n')
    print(survival)
    print('\n')

    #filtering the data
    samples=data[1:,0]
    data=data[1:,1:].astype(np.float).T
    survTime = survival[1:,1].astype(np.int)
    survStatus = survival[1:,2].astype(np.int)
 
    ##Invoking the R-language by rpy2 interphase
    %load_ext rpy2.ipython
    
    #importing necessary R-packages
    %R require(survival); require(randomForestSRC); require(survcomp)
    
    #building a list space for all models
    predictionSets = []
    
    #building a list space for all concordances
    concordanceSet = []
    
    #building models and their C-indices   
    print('\n')    
    print('Results :')
    print('\n')
    for bootstrapIdx in range(repetingTimes):

        #building a list space for a model
        predictions = []

        #searching and memorizing the indices of each train- or test-set in each sample name
        trainIdx = match(trainLabels[:,bootstrapIdx], samples)   
        testIdx = match(testLabels[:,bootstrapIdx], samples)

        #verifying that all the memorized indices corresponds to the right sample name data
        assert (np.all(trainLabels[:,bootstrapIdx]==samples[trainIdx]) and 
                np.all(testLabels[:,bootstrapIdx]==samples[testIdx]))

        #Exctracting train and test data sets
        trainData = data[:, trainIdx].T
        trainSurvStatus = survStatus[trainIdx]
        trainSurvTime = survTime[trainIdx]
        testData = data[:, testIdx].T
        testSurvStatus = survStatus[testIdx]
        testSurvTime = survTime[testIdx]
              
        #pushing train and test data sets to R-language for modeling and predicting
        %Rpush trainData trainSurvStatus trainSurvTime testData testSurvStatus testSurvTime

        #modeling by random survival forest
        %R rsf.model.fit <- rfsrc(Surv(time,status)~., data=data.frame(time=trainSurvTime, status=trainSurvStatus, trainData), ntree=1000, na.action='na.impute', splitrule='logrank', nsplit=1, seed=1, outcome = 'test')
    
        #predictions made by built model above
        %R -o predictedResponse predictedResponse <- predict(rsf.model.fit, data.frame(testData), na.action='na.impute')$predicted
        
        #evaluating the model by the made predictions above through the C-index
        %R -o concordance concordance <- concordance.index(predictedResponse, testSurvTime, testSurvStatus)$c.index
        
        #adjusting the type of data for printing
        concordanceSet.append(np.asarray(concordance).T)
        predictions.append(predictedResponse)
        predictions = np.asarray(predictions).T
        predictionSets.append(predictions)
        
    #printing the predicting results from built-models    
    print ('Predictions from Models :') 
    print('\n')
    predictionSets = np.asarray(predictionSets).T
    print (predictionSets[0])
    print('\n')
    
    #printing the C-index results from built-models   
    print ('C-index set :') 
    print('\n')
    concordanceSet = np.asarray(concordanceSet).T
    print(concordanceSet[0])
    print('\n')
    #returing the C-index sets from repeting
    return concordanceSet[0]


In [7]:
##importing and analyzing files
# Arguments in order: test sample lists, train sample lists, CNV input table,
# overall-survival table, number of repetitions.
y1 = analyze(
    readFile('Data/gbm_test_sample_list.txt'),
    readFile('Data/gbm_train_sample_list.txt'),
    readFile('Data/Core_set/GBM/gbm_cnv_core.txt'),
    readFile('Data/Core_set/GBM/gbm_os_core.txt'),
    100,
)


Test sets :

[['TCGA-41-2571' 'TCGA-32-4719' 'TCGA-27-2519' ..., 'TCGA-19-2623'
  'TCGA-14-1459' 'TCGA-32-2495']
 ['TCGA-02-2483' 'TCGA-41-4097' 'TCGA-12-0821' ..., 'TCGA-14-2554'
  'TCGA-14-0783' 'TCGA-06-2570']
 ['TCGA-14-1453' 'TCGA-14-1453' 'TCGA-06-0882' ..., 'TCGA-32-2491'
  'TCGA-14-1825' 'TCGA-14-1454']
 ..., 
 ['TCGA-16-1045' 'TCGA-14-0786' 'TCGA-19-1388' ..., 'TCGA-16-1060'
  'TCGA-12-0820' 'TCGA-06-0875']
 ['TCGA-15-1449' 'TCGA-19-1385' 'TCGA-06-2566' ..., 'TCGA-14-1452'
  'TCGA-41-4097' 'TCGA-41-2572']
 ['TCGA-32-1977' 'TCGA-06-2565' 'TCGA-41-3393' ..., 'TCGA-12-0826'
  'TCGA-06-0881' 'TCGA-41-4097']]


Train sets :

[['TCGA-14-0865' 'TCGA-14-1794' 'TCGA-32-1982' ..., 'TCGA-27-1833'
  'TCGA-14-0867' 'TCGA-28-1749']
 ['TCGA-19-2629' 'TCGA-27-1833' 'TCGA-15-1447' ..., 'TCGA-41-2575'
  'TCGA-28-1753' 'TCGA-19-0962']
 ['TCGA-12-1091' 'TCGA-28-1755' 'TCGA-28-2502' ..., 'TCGA-27-2527'
  'TCGA-02-2486' 'TCGA-19-2624']
 ..., 
 ['TCGA-12-1098' 'TCGA-19-2620' 'TCGA-14-0783' ..., 'TCG


Loading required package: randomForestSRC



Loading required package: survcomp



Error in rfsrc(Surv(time, status) ~ ., data = data.frame(time = trainSurvTime,  : 
  沒有這個函數 "rfsrc"



此外: 








Error in predict(rsf.model.fit, data.frame(testData), na.action = "na.impute") : 
  找不到物件 'rsf.model.fit'



Error in concordance.index(predictedResponse, testSurvTime, testSurvStatus) : 
  沒有這個函數 "concordance.index"




NameError: name 'concordance' is not defined

In [None]:
##plotting BOXPLOT by Plotly
#selecting the style of labels
# x_data, y_data and colors are zipped together later: one box per position.
x_data = ['cnv']
y_data = [y1]
colors = [
    'rgba(93, 164, 214, 0.5)',
    'rgba(255, 144, 14, 0.5)',
    'rgba(44, 160, 101, 0.5)',
    'rgba(255, 65, 54, 0.5)',
    'rgba(207, 114, 255, 0.5)',
]
traces = []


In [None]:
#selecting the style of layout
layout = go.Layout(
    # title typo fixed: "Raindom" -> "Random"
    title='C-index in GBM Cancer Types from Different Training Data Sets by Random Survival Forest',
    yaxis=dict(
        autorange=True,
        showgrid=True,
        zeroline=True,
        # NOTE(review): the plotted values are C-indices, presumably within
        # [0, 1]; a tick step of 5 would then draw no intermediate grid
        # lines -- confirm whether a smaller step (e.g. 0.05) was intended.
        dtick=5,
        gridcolor='rgb(255, 255, 255)',
        gridwidth=1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=dict(
        l=40,
        r=30,
        b=80,
        t=100,
    ),
    paper_bgcolor='rgb(243, 243, 243)',
    plot_bgcolor='rgb(243, 243, 243)',
    showlegend=False
)


In [None]:
#plotting the BOXPLOT for data
# The list is rebuilt from scratch (instead of appending to an existing one)
# so that re-running this cell does not add duplicate boxes to the figure.
traces = [
    go.Box(
        y=yd,
        name=xd,
        boxpoints='all',
        jitter=0.5,
        whiskerwidth=0.2,
        fillcolor=cls,
        marker=dict(
            size=2,
        ),
        line=dict(width=1),
    )
    # one box per (label, values, fill colour) triple
    for xd, yd, cls in zip(x_data, y_data, colors)
]


In [None]:

#demonstrating the plot
# Notebook mode is initialised first; it does not depend on the figure.
plotly.offline.init_notebook_mode(connected=True)
fig = go.Figure(
    data=traces,
    layout=layout,
)
plotly.offline.plot(fig)

In [2]:
# Diagnostic cell: print the home directory of the R installation that rpy2
# is using, by calling R's base::R.home() through importr.
from rpy2.robjects.packages import importr
base = importr('base')
print(base.R_home())


[1] "/Users/pepe/anaconda3/envs/aia/lib/R"



In [1]:
# Load the rpy2 IPython extension; the %R / %%R / %Rpush magics used in this
# notebook come from it.
%load_ext rpy2.ipython


In [None]:
# Point the R_HOME environment variable at a specific R installation.
# NOTE(review): this is a hard-coded, machine-specific path, and it differs
# from the R home printed by base.R_home() elsewhere in this notebook.
# NOTE(review): presumably this has to run before rpy2 starts R for it to
# have any effect -- confirm, and move it ahead of the rpy2 cells if so.
import os
os.environ['R_HOME'] = '/Library/Frameworks/R.framework/Resources'


In [6]:
%%R
# Diagnostic cell: try to attach randomForestSRC in the embedded R session.
# The commented-out lines below are leftover probes for the package.
require(randomForestSRC)
# randomForestSRC::rfsrc()
# randomForestSRC


Error in rfsrc() : 沒有這個函數 "rfsrc"



