In [46]:
import pandas as pd
from sklearn import preprocessing, cross_validation
from sklearn import tree, metrics, neighbors, linear_model, svm, ensemble
from matplotlib import pyplot as plt

In [47]:
# Load the labelled BRCA RNA-seq table. sample_barcode is unique per row, so it
# serves as the index; the short_barcode column is dropped.
df = pd.read_csv(
    "data/by_tissue_RNASeq/RNASeq_BRCA_labled.tsv",
    sep="\t",
    index_col="sample_barcode",
).drop("short_barcode", axis=1)
print(df.head(2))

                              ?|100130426  ?|100133144  ?|100134869  ?|10357  \
sample_barcode                                                                 
TCGA-3C-AAAU-01A-11R-A41B-07            0      16.3644      12.9316  52.1503   
TCGA-3C-AALI-01A-11R-A41B-07            0       9.2659      17.3790  69.7553   

                              ?|10431  ?|136542  ?|155060  ?|26823  ?|280660  \
sample_barcode                                                                 
TCGA-3C-AAAU-01A-11R-A41B-07  408.076         0  1187.010   0.0000    0.0000   
TCGA-3C-AALI-01A-11R-A41B-07  563.893         0   516.041   1.0875    0.5438   

                              ?|317712  ...   ZWINT|11130  ZXDA|7789  \
sample_barcode                          ...                            
TCGA-3C-AAAU-01A-11R-A41B-07         0  ...       931.957   129.5920   
TCGA-3C-AALI-01A-11R-A41B-07         0  ...       965.198    59.8151   

                              ZXDB|158586  ZXDC|79364  ZYG11A|440590 

In [52]:
def data_label_split(df, label_col="TP53"):
    """Split a table into its feature columns and its label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the features and the label column. It is not
        modified.
    label_col : str, optional
        Name of the column to use as the label. Defaults to ``"TP53"``.

    Returns
    -------
    (data, label)
        ``data`` is ``df`` without ``label_col``; ``label`` is the
        ``label_col`` column as a Series.

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of ``df``.
    """
    label = df[label_col]
    data = df.drop(label_col, axis=1)
    return data, label

def folds_to_split(data,targets,train,test):
    """Slice features and labels into train/test pieces by row position.

    ``data`` and ``targets`` are each wrapped in a DataFrame, then indexed
    positionally with the ``train`` and ``test`` row indices.

    Returns a list ``[data_train, data_test, labels_train, labels_test]``;
    all four elements are DataFrames.
    """
    features = pd.DataFrame(data)
    labels = pd.DataFrame(targets)
    # Outer loop over tables, inner loop over index sets, which yields
    # features[train], features[test], labels[train], labels[test].
    return [table.iloc[rows] for table in (features, labels) for rows in (train, test)]

In [53]:
# Separate the label column from the remaining feature columns.
data, label = data_label_split(df)

# Hold out 10% of the samples as a test set; the fixed seed makes the split
# repeatable across runs.
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    data,
    label,
    test_size=0.1,
    random_state=0,
)

In [54]:
def tenfold_cross_validation(x_train, y_train, classifiers):
    """Score each classifier with shuffled 10-fold cross-validation.

    Parameters
    ----------
    x_train : feature table; rows are selected by position for each fold.
    y_train : labels aligned row-for-row with ``x_train``.
    classifiers : dict mapping a display name to an estimator that exposes
        ``fit`` and ``predict``.

    Returns
    -------
    pandas.DataFrame
        One row per fold (indexed from 1) and one column per classifier
        name, holding that classifier's accuracy on the fold's validation
        rows.
    """
    result_df = pd.DataFrame()
    # Fixed seed so every run iterates over the same folds.
    folds = cross_validation.KFold(len(x_train), shuffle=True, n_folds=10, random_state=0)
    for foldnum, (train, val) in enumerate(folds, start=1):
        tr_data, val_data, tr_targets, val_targets = folds_to_split(x_train, y_train, train, val)
        # folds_to_split wraps the labels in DataFrames; flatten them to 1-D
        # arrays. `.values` replaces the deprecated `as_matrix()`.
        tr_targets = tr_targets.values.ravel()
        val_targets = val_targets.values.ravel()

        # `.items()` works on both Python 2 and Python 3 dicts.
        for classifier_name, clf in classifiers.items():
            clf.fit(tr_data, tr_targets)
            prediction = clf.predict(val_data)
            # accuracy_score's documented argument order is (y_true, y_pred).
            accuracy = metrics.accuracy_score(val_targets, prediction)
            result_df.loc[foldnum, classifier_name] = accuracy
    return result_df

In [55]:
# Estimators to compare, keyed by the name used as the column header in the
# cross-validation results. Only the decision tree is enabled; uncomment an
# entry below to include it in the comparison.
Classifiers = {
    # Seeded like the train/test and K-fold splits so that the reported fold
    # accuracies are reproducible from run to run.
    "Decision tree": tree.DecisionTreeClassifier(random_state=0),
    # "Random Forest": ensemble.RandomForestClassifier(),
    # "KNN": neighbors.KNeighborsClassifier(),
    # "Logistic regression": linear_model.LogisticRegression(),
    # "SVM Linear": svm.SVC(kernel="linear"),
    # "SVM RBF": svm.SVC(kernel="rbf"),
    # "Ada Boost": ensemble.AdaBoostClassifier(),
}


In [56]:
# Per-fold validation accuracy for every enabled classifier.
print(tenfold_cross_validation(x_train, y_train, Classifiers))

    Decision tree
1        0.709091
2        0.672727
3        0.800000
4        0.745455
5        0.709091
6        0.745455
7        0.715596
8        0.779817
9        0.660550
10       0.752294
