In [None]:
# Load the numeric data matrix from the pipe-delimited CSV file.
import numpy as np
DATA_FILENAME = 'data/GCM_Total.csv'
# skip_header=1 skips the header row (it is read separately in the class-label cell);
# fields that cannot be parsed as float come back as NaN.
arr = np.genfromtxt(fname=DATA_FILENAME, delimiter='|', skip_header=1, dtype=float)
# Drop every column that contains at least one NaN.
arr = arr[:, ~np.isnan(arr).any(axis=0)]
# Transpose so the file's columns become rows.
# Presumably this yields one row per sample, since the header row carries the class labels -- TODO confirm.
arr = np.transpose(arr)

In [None]:
# import classes
# Read only the header row of the data file; it holds one '|'-separated label per column.
with open(DATA_FILENAME, 'r') as f:
    first_line = f.readline().strip().split('|')

# For every non-empty header field keep (up to) the first three non-empty
# '_'-separated parts as a tuple, e.g. 'A_B_C_D' -> ('A', 'B', 'C').
# Assumes the remaining labels line up one-to-one with the rows of `arr` -- TODO confirm.
classes = [tuple(filter(None, cls.split('_')[0:3])) for cls in filter(None, first_line)]

# Normal is one class: every label whose first part is not "Tumor" collapses
# to the single tuple ("Normal", '').
classes = [cls if cls[0] == "Tumor" else ("Normal",'') for cls in classes]

In [None]:
def integerize(array):
    """Map each item of ``array`` to an integer code.

    Codes are handed out in order of first appearance, starting at 0, so
    equal items always receive the same code. Items must be hashable.

    Returns a list of codes with the same length and order as ``array``.
    """
    codes = {}
    # setdefault() evaluates len(codes) before inserting, so an unseen item
    # gets the next free code and a known item returns its existing one.
    return [codes.setdefault(element, len(codes)) for element in array]


In [None]:
# checking class counts
# Print each distinct class with its number of occurrences; `counter`
# accumulates the counts and therefore ends up equal to len(classes).
counter = 0
for cls in set(classes):
    occurrences = classes.count(cls)
    print(cls, occurrences)
    counter = counter + occurrences

In [None]:
from sklearn.decomposition import PCA
#import matplotlib
#matplotlib.use('Gtk3Agg')
from matplotlib import pyplot as plt
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D as plt3d

# Project `arr` onto 50 whitened principal components and scatter-plot the
# first three of them in 3D, coloured by integer class id.
# pca = PCA(n_components=2)
pca = PCA(n_components=50 , whiten=True)
X = pca.fit_transform(arr)

#fig = plt.figure(dpi=1000, figsize=(15,10))
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# Only components 0, 1 and 2 are drawn; integerize() turns each class tuple
# into an int that is passed as the colour value.
ax.scatter(X[:, 0], X[:, 1], X[:,2], c=integerize(classes))
# Last expression of the cell: display the shape of the transformed matrix.
X.shape

In [None]:
from sklearn.feature_selection import f_classif as funct
from sklearn.feature_selection import SelectKBest as selector

# Univariate feature selection: SelectKBest (imported as `selector`) with the
# f_classif scoring function (imported as `funct`), keeping NUM features.
# PERCENTILE is only referenced by the commented-out variant below.
PERCENTILE = .001
NUM = 250

# NOTE: this rebinds X, replacing the PCA projection computed in the previous cell.
X = selector(funct,k=NUM).fit_transform(arr, integerize(classes))
#X_new = selector(funct,alpha=PERCENTILE).fit_transform(arr, integerize(classes))
#X_new = selector(funct).fit_transform(arr, integerize(classes))
# Last expression of the cell: display the shape of the reduced matrix.
X.shape

In [None]:
# divide data into training and testing
from sklearn import cross_validation

# Integer labels are computed once instead of on every split attempt.
labels = integerize(classes)
n_classes = len(set(classes))

# The test set must contain every class. The split is retried with a new seed
# on each attempt: retrying with the same fixed random_state reproduces the
# same split every time, so a first failure would loop forever. Seed 1 is
# tried first, which keeps the result unchanged whenever that split is valid.
MAX_SPLIT_ATTEMPTS = 100
for seed in range(1, MAX_SPLIT_ATTEMPTS + 1):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, labels, test_size=0.2, random_state=seed)
    if len(set(y_test)) == n_classes:
        break
else:
    raise RuntimeError(
        "no train/test split containing all %d classes in the test set was "
        "found in %d attempts" % (n_classes, MAX_SPLIT_ATTEMPTS))


In [5]:
from load_data import load_data
import numpy as np
# load_data() (from the project-local load_data module) returns nine values,
# unpacked here in order. Judging by the names they are train/test feature
# values, RFE-selected variants, multi-class and binary labels, and class
# descriptions -- TODO confirm against load_data.
train_values, train_values_rfe, train_classes, train_classes_binary, test_values, test_values_rfe, test_classes, test_classes_binary, class_desc\
    = load_data()
    
# Rebind the names used by the model cells below; this replaces the split
# produced by the earlier train_test_split cell.
X_train = train_values
y_train = train_classes
X_test = test_values
y_test = test_classes

18
0
32
1
33
2
34
3
35
4
38
5
39
6
40
7
41
8
42
9
43
10
44
11
45
12
46
13
50
14
51
15
52
16
53
17
59
18
61
19
67
20
72
21
73
22
113
23
160
24
163
25
164
26
190
27
225
28
255
29
262
30
282
31
290
32
310
33
326
34
368
35
426
36
441
37
531
38
562
39
567
40
611
41
622
42
750
43
788
44
797
45
804
46
805
47
811
48
822
49
833
50
853
51
873
52
878
53
886
54
895
55
905
56
910
57
911
58
920
59
929
60
930
61
931
62
941
63
964
64
977
65
995
66
1023
67
1045
68
1048
69
1053
70
1056
71
1069
72
1081
73
1100
74
1108
75
1126
76
1156
77
1166
78
1220
79
1221
80
1222
81
1288
82
1303
83
1335
84
1340
85
1342
86
1358
87
1364
88
1369
89
1372
90
1375
91
1513
92
1549
93
1587
94
1655
95
1666
96
1667
97
1673
98
1676
99
1690
100
1693
101
1706
102
1707
103
1710
104
1734
105
1745
106
1762
107
1764
108
1765
109
1766
110
1770
111
1773
112
1774
113
1789
114
1794
115
1795
116
1798
117
1804
118
1828
119
1840
120
1844
121
1845
122
1846
123
1847
124
1851
125
1871
126
1879
127
1881
128
1891
129
1903
130
1929
131
1932
132
194

In [None]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.multiclass import OneVsRestClassifier as ova
from sklearn.neighbors import KNeighborsClassifier as knn

# Sweep one hyper-parameter over the values in Cs and print the test-set
# score for each value. With the k-NN classifier active below, each value
# is used as n_neighbors.
#Cs = [0.0001,0.001,0.01,0.1,1,2,5,10,15,20,50,100,1000]
#Cs = [50,100,200,500,1000]
Cs = [2,3,4,5,10,20,50,100]
#Cs = np.arange(140,160,2)
for c in Cs:
    # Alternative inner classifiers that were tried:
    # inclf = svm.SVC(kernel='linear', C=c)
    # inclf = svm.LinearSVC(C=c)
    inclf = knn(n_neighbors=c,n_jobs=-1)
    # clf = rfc(criterion='gini', max_features=None, n_estimators=c ,n_jobs=7)
    clf = ova(inclf)
    # Fit once and score once; previously the test set was scored twice and
    # the first result was thrown away.
    clf.fit(X_train, y_train)
    print(c,clf.score(X_test,y_test))

In [None]:
# adding feature selection
import time
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVR as SVR

#t_start = time.time()
# Recursive feature elimination down to 30 features, dropping 10 per step,
# driven by a linear support-vector regressor (LinearSVR, imported as SVR).
# NOTE(review): the estimator is a regressor, so the score shown below is
# presumably a regression score rather than classification accuracy -- confirm
# this is intended.
estimator = SVR(C=10,max_iter=1000)
# n_features_to_select is passed by keyword: newer scikit-learn releases
# reject it as a positional argument, and the keyword form also works on
# older ones.
# NOTE: this rebinds `selector`, which the feature-selection cell imports as
# an alias for SelectKBest; re-run that import before re-running that cell.
selector = RFE(estimator, n_features_to_select=30, step=10, verbose=1)
# Last expression of the cell: fit on the training data and display the test score.
selector.fit(X_train, y_train).score(X_test, y_test)
#t_end = time.time()
#t_end - t_start

In [None]:
# tuning PCA + SVM
# For each number of PCA components, split the projected data, sweep the SVM
# penalty C over 1..20 and remember the best (C, test score). `maxes` ends up
# holding one (n_components, best C, best score) tuple per component count.
from operator import itemgetter
# NOTE: Cs is assigned here but the sweep below iterates np.arange(1,21,1).
Cs = [0.0001,0.001,0.01,0.1,1,2,5,10,15,20,50,100,1000]

# Labels do not depend on the projection, so they are computed once.
labels = integerize(classes)
n_classes = len(set(classes))
MAX_SPLIT_ATTEMPTS = 100

maxes = []
for nComp in np.arange(1,100,5):
    pca = PCA(n_components=nComp , whiten=True)
    X = pca.fit_transform(arr)
    #print("Number of components: ", nComp)
    a = []
    # The test set must contain every class. Each retry uses a new seed,
    # because retrying with one fixed random_state repeats the same split and
    # would never terminate after a first failure. Seed 1 is tried first, so
    # results are unchanged whenever that split is valid.
    for seed in range(1, MAX_SPLIT_ATTEMPTS + 1):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            X, labels, test_size=0.2, random_state=seed)
        if len(set(y_test)) == n_classes:
            break
    else:
        raise RuntimeError(
            "no train/test split containing all %d classes in the test set was "
            "found in %d attempts" % (n_classes, MAX_SPLIT_ATTEMPTS))
    for c in np.arange(1,21,1):
        inclf = svm.SVC(kernel='linear', C=c)
        clf = ova(inclf)
        a.append((c,clf.fit(X_train,y_train).score(X_test,y_test)))
    # Pick the best (C, score) pair once; max() returns the first maximum.
    best_c, best_score = max(a, key=itemgetter(1))
    maxes.append((nComp, best_c, best_score))
# Last expression of the cell: display the overall best configuration.
max(maxes,key=itemgetter(2))

In [None]:
# linear SVM (one-vs-rest) + leave one out
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier as ova
from sklearn.cross_validation import LeaveOneOut

# Leave-one-out evaluation on the training data: for every C, fit a
# one-vs-rest linear SVC on all samples but one, score it on the held-out
# sample, and print C with the mean score over all folds.
X = np.array(X_train)
y = np.array(y_train)
loo = LeaveOneOut(len(X))
# print() is called with a single argument so the output is the same on
# Python 2 and Python 3 (the bare `print x` statement is a SyntaxError on 3).
print(len(loo))
for c in [0.0001,0.001,0.01,0.1,1,2,5,10,15,20,50]:
    result = []
    for train_index, test_index in loo:
        x_tr, x_t, y_tr, y_t = X[train_index], X[test_index], y[train_index], y[test_index]
        result.append(ova(svm.SVC(kernel='linear', C=c)).fit(x_tr,y_tr).score(x_t,y_t))
    print("%s %s" % (c, sum(result)/len(loo)))

In [None]:
# good stuff
# Notes on the best RBF-kernel SVM settings found so far. Each line only
# constructs an unfitted classifier and nothing is trained in this cell; the
# trailing comments record the score the author observed for that setting.
svm.SVC(kernel='rbf', C=6, decision_function_shape='ovr') # on PCA to 3D gave 50%  
svm.SVC(kernel='rbf', C=14, decision_function_shape='ovr') # on PCA to 5D gave 57%
svm.SVC(kernel='rbf', C=16, decision_function_shape='ovr') # on PCA to 35D gave 76.78%

In [None]:
# Display the (n_components, best C, best score) tuples collected by the PCA + SVM tuning cell.
maxes