In [1]:
import numpy as np
import csv
from sklearn import svm
from sklearn.svm import LinearSVC
import itertools
import time
from multiprocessing.dummy import Pool as ThreadPool 

In [2]:
# Read the comma-separated training table; every row becomes a list of strings.
filename = "indexed_learning_data.csv"
# newline='' is what the csv module asks for, so that line breaks embedded in
# quoted fields reach the parser untouched.
with open(filename, newline='') as f:
    data = list(csv.reader(f, delimiter=','))
# Drop the first row (presumably the header - TODO confirm) and keep the rest
# as an array of strings; numeric conversion happens where the columns are used.
data = np.array(data[1:])

In [3]:
def _features_and_labels(rows):
    """Turn a 2-D block of raw rows into a (float features, int labels) pair."""
    # Columns 5 up to (not including) the second-to-last are the candidate features.
    X = rows[:, 5:-2].astype(float)
    # Feature column 11 is left out of the model inputs
    # (NOTE(review): the reason is not recorded in this notebook - confirm).
    X = np.delete(X, 11, 1)
    # The last column is the integer class label.
    Y = rows[:, -1].astype(int)
    return X, Y


def validation_split(data, ratio, rng = None):
    """Shuffle the rows of ``data`` and split them into training and validation parts.

    Parameters
    ----------
    data : 2-D array
        One sample per row. Columns ``5:-2`` must be convertible to float and
        the last column to int. The array itself is not modified; a copy is
        shuffled.
    ratio : float
        Fraction of the rows placed in the training part; the first
        ``int(ratio * len(data))`` shuffled rows are used for training and the
        remainder for validation.
    rng : object with a ``shuffle`` method, optional
        Source of randomness, e.g. ``np.random.RandomState(seed)``. Passing one
        makes the split reproducible. When omitted, the global ``np.random``
        state is used, exactly as before this parameter existed.

    Returns
    -------
    Xt, Yt, Xv, Yv
        Training features, training labels, validation features and
        validation labels, in that order.
    """
    shuffleData = np.copy(data)
    shuffler = np.random if rng is None else rng
    shuffler.shuffle(shuffleData)
    border = int(ratio*len(shuffleData))
    Xt, Yt = _features_and_labels(shuffleData[:border])
    Xv, Yv = _features_and_labels(shuffleData[border:])
    return Xt, Yt, Xv, Yv

# Draw a single 80/20 split and print the four resulting arrays in order:
# training features, training labels, validation features, validation labels.
Xt, Yt, Xv, Yv = validation_split(data, 0.8)
for part in (Xt, Yt, Xv, Yv):
    print (part)

[[  4.86460000e+05   2.56500000e+01   5.28900000e+01 ...,   4.91500000e+01
    5.11900000e+01   4.90800000e+03]
 [  4.58697000e+05   3.31900000e+01   4.83000000e+01 ...,   4.37300000e+01
    4.65400000e+01   1.19000000e+02]
 [  4.52080000e+05   3.51800000e+01   6.27800000e+01 ...,   5.84300000e+01
    6.04300000e+01   1.09400000e+03]
 ..., 
 [  4.58486000e+05   3.03500000e+01   3.86200000e+01 ...,   3.46100000e+01
    3.60800000e+01   1.32000000e+02]
 [  4.51165000e+05   4.01000000e+01   5.18900000e+01 ...,   4.98000000e+01
    5.10700000e+01   5.60000000e+01]
 [  4.50080000e+05   4.27800000e+01   6.88000000e+01 ...,   6.17500000e+01
    6.42900000e+01   5.43000000e+02]]
[0 1 0 ..., 1 6 1]
[[  4.59704000e+05   2.89900000e+01   4.97900000e+01 ...,   4.78500000e+01
    4.91900000e+01   5.80000000e+01]
 [  4.59690000e+05   2.62700000e+01   4.41700000e+01 ...,   4.14800000e+01
    4.24000000e+01   1.60200000e+03]
 [  4.61350000e+05   6.25100000e+01   8.83500000e+01 ...,   7.71100000e+01
  

In [4]:
def average_accuracy(data, loops = 1000, c = 64, kernelType = 'rbf'):
    """Average the validation score of an SVC over repeated random splits.

    For each of ``loops`` rounds a fresh 80/20 split of ``data`` is drawn with
    ``validation_split``, an ``svm.SVC`` with kernel ``kernelType`` and penalty
    ``C = c`` is fitted on the training part, and its ``score`` on the
    validation part is recorded. The mean of the recorded scores is returned.
    """
    def _one_round():
        X_train, y_train, X_val, y_val = validation_split(data, 0.8)
        model = svm.SVC(kernel = kernelType, C = c).fit(X_train, y_train)
        return model.score(X_val, y_val)

    scores = np.array([_one_round() for _ in range(loops)])
    return np.mean(scores)

In [58]:
# Sweep the SVC penalty C over powers of two (2**-5 ... 2**14), printing the
# mean accuracy for every exponent and remembering the best one. On a tie the
# earliest exponent wins, because only strictly better results replace it.
bestAcc, bestI = 0.0, -6  # -6 lies just below the searched exponent range
for i in range(-5, 15):
    acc = average_accuracy(data, c = (2**i), loops = 100)
    print (acc, i)
    if acc > bestAcc:
        bestAcc, bestI = acc, i
print (bestAcc, bestI)

0.323880208333 -5
0.322135416667 -4
0.319622395833 -3
0.322708333333 -2
0.336328125 -1
0.3564453125 0
0.372083333333 1
0.369388020833 2
0.372669270833 3
0.374283854167 4
0.374453125 5
0.375208333333 6
0.3726171875 7
0.374635416667 8
0.373684895833 9
0.374583333333 10
0.374296875 11
0.3735546875 12
0.373346354167 13
0.371731770833 14
0.375208333333 6


In [60]:
# Mean validation accuracy over 100 random splits, using average_accuracy's
# default penalty (c = 64) and default kernel ('rbf').
baseline_accuracy = average_accuracy(data, loops = 100)
print (baseline_accuracy)

0.373307291667


In [43]:
# Read the held-out table. This file is parsed with ';' as the delimiter,
# unlike the training file, which is parsed with ','.
filename = "indexed_learning_val.csv"
# newline='' is what the csv module asks for, so that line breaks embedded in
# quoted fields reach the parser untouched.
with open(filename, newline='') as f:
    new_data = list(csv.reader(f, delimiter=';'))
# Drop the first row (presumably the header - TODO confirm) and keep the rest
# as an array of strings.
new_data = np.array(new_data[1:])

In [56]:
# Fit one SVC (C = 32, default kernel) on the complete training table and
# score it on the held-out table. Both tables get the same column treatment:
# features are columns 5:-2 with feature column 11 removed, labels are the
# last column.
X = np.delete(data[:,5:-2].astype(float), 11, 1)
Y = data[:,-1].astype(int)
new_X = np.delete(new_data[:,5:-2].astype(float), 11, 1)
new_Y = new_data[:,-1].astype(int)
svc = svm.SVC(C = 32).fit(X, Y)
# Bare last expression so the notebook displays the score.
svc.score(new_X, new_Y)

0.17801047120418848

In [4]:
def my_function(subset, data, loops = 100, c = 64, kernelType = 'rbf'):
    """Rate a feature subset by accuracy per second.

    Parameters
    ----------
    subset : list of int
        Column indices (into the feature matrices returned by
        ``validation_split``) to train and evaluate on.
    data : 2-D array
        Raw table handed to ``validation_split``.
    loops : int
        Number of random 80/20 splits to evaluate.
    c : float
        Penalty parameter ``C`` of the SVC.
    kernelType : str
        Kernel name passed to ``svm.SVC``.

    Returns
    -------
    (score, subset)
        ``score`` is the mean validation score divided by the mean elapsed
        time of one round in seconds; ``subset`` is returned unchanged so
        the caller can tell which candidate the score belongs to.
    """
    times = []
    accuracy = []
    for i in range(loops):
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump or tick coarsely, which
        # matters because the mean interval is used as a divisor below.
        start = time.perf_counter()
        # The timed region covers the split, the fit and the scoring.
        Xt, Yt, Xv, Yv = validation_split(data, 0.8)
        varXt = Xt[:, subset]
        varXv = Xv[:, subset]
        svc = svm.SVC(kernel = kernelType, C = c)
        svc = svc.fit(varXt, Yt)
        accuracy.append(svc.score(varXv, Yv))
        stop = time.perf_counter()
        times.append(stop-start)
    score = np.mean(np.array(accuracy))/(np.mean(times))
    return (score, subset)

def optimal_feature_finder(data, loops = 100, c = 64, kernelType = 'rbf', n_threads = 8):
    """Greedy forward selection of feature columns, scored by ``my_function``.

    Starting from an empty selection, every round tries adding each feature
    that is not yet selected to the previous round's winner, scores all
    candidates in a thread pool, and keeps the best-scoring one. This repeats
    until every feature has been added.

    Parameters
    ----------
    data : 2-D array
        Raw table handed on to ``validation_split`` / ``my_function``.
    loops, c, kernelType
        Forwarded unchanged to ``my_function`` for every candidate.
    n_threads : int
        Number of worker threads used to score candidates (default 8).

    Returns
    -------
    list of (score, subset)
        The winning ``(score, subset)`` pair of each round, in round order, so
        entry ``k`` holds a subset of ``k + 1`` feature indices.
    """
    # A throw-away split is drawn only to learn how many feature columns the
    # model sees; .shape[1] also works when the training part has no rows.
    Xt = validation_split(data, 0.8)[0]
    indices = range(Xt.shape[1])
    end = []
    previous_best = (0, [])
    # One pool serves every round; the finally clause releases its threads
    # even when a worker raises.
    pool = ThreadPool(n_threads)
    try:
        for L in range(1, len(indices)+1):
            print ("Progress: ", L)
            subsets = [previous_best[1]+[i] for i in indices if i not in previous_best[1]]
            jobs = [(subset, data, loops, c, kernelType) for subset in subsets]
            results = pool.starmap(my_function, jobs)
            end.append(max(results,key=lambda item:item[0]))
            previous_best = end[-1]
    finally:
        pool.close()
        pool.join()
    return end

In [5]:
# Run the greedy feature search with 500 random splits per candidate subset,
# then show every round's winner followed by the best-scoring round overall.
feature_scores = optimal_feature_finder(data, loops = 500)
print (feature_scores)
best_round = max(feature_scores, key = lambda entry: entry[0])
print (best_round)

Progress:  1
Progress:  2
Progress:  3
Progress:  4
Progress:  5
Progress:  6
Progress:  7
Progress:  8
Progress:  9
Progress:  10
Progress:  11
Progress:  12
Progress:  13
Progress:  14
[(0.4837377408931805, [1]), (0.7089594394280162, [1, 9]), (0.84617444999562041, [1, 9, 5]), (0.88017112291544986, [1, 9, 5, 6]), (0.91377201784258133, [1, 9, 5, 6, 12]), (0.58883001526239009, [1, 9, 5, 6, 12, 7]), (0.58487851867473883, [1, 9, 5, 6, 12, 7, 8]), (0.57239530525329019, [1, 9, 5, 6, 12, 7, 8, 10]), (0.58620613020391132, [1, 9, 5, 6, 12, 7, 8, 10, 3]), (0.58784942804896123, [1, 9, 5, 6, 12, 7, 8, 10, 3, 2]), (0.60905141052953582, [1, 9, 5, 6, 12, 7, 8, 10, 3, 2, 11]), (0.28324263952314044, [1, 9, 5, 6, 12, 7, 8, 10, 3, 2, 11, 0]), (0.17763683738756716, [1, 9, 5, 6, 12, 7, 8, 10, 3, 2, 11, 0, 13]), (0.10529244206247786, [1, 9, 5, 6, 12, 7, 8, 10, 3, 2, 11, 0, 13, 4])]
(0.91377201784258133, [1, 9, 5, 6, 12])
