In [1]:
import numpy as np
import math
from liblinear.liblinearutil import *
from libsvm.svmutil import *
from scipy import *
import time
import matplotlib.pylab as plt
from sklearn.decomposition import PCA
import pandas as pd
from scipy.sparse import csr_matrix

#I used the subset.py file found at https://github.com/cjlin1/libsvm/blob/master/tools/subset.py
#to randomly sampled 10000 data and saved to new file new.libsvm.binary
#the script used at terminal:
#python subset.py -1 covtype.libsvm.binary 10000 new.libsvm.binary

y, x = svm_read_problem('new.libsvm.binary', return_scipy = True)



In [2]:
#scale data to [-1,1]
scale_param = csr_find_scale_param(x, lower=-1, upper=1)
x = csr_scale(x, scale_param)
x = csr_matrix.toarray(x)



If feature values are non-negative and sparse, set lower=0 rather than the default lower=-1.
       > new      #nonzeros 529994
If feature values are non-negative and sparse, get scale_param by setting lower=0 rather than the default lower=-1.


In [3]:
prob = problem(y,x)
param0 = parameter('-s 0')
param1 = parameter('-s 1')
param2 = parameter('-s 2')


In [4]:
%%time

#linear 
m0 = train(prob, param0)
pl, pa, pv = predict(y,x,m0)



Accuracy = 75.71% (7571/10000) (classification)
Wall time: 1.3 s


In [5]:
%%time

#primal
m1 = train(prob, param1)
pl1, pa1, pv1 = predict(y,x,m1)

Accuracy = 75.88% (7588/10000) (classification)
Wall time: 2.13 s


In [6]:
%%time

#dual
m2 = train(prob, param2)
pl2, pa2, pv2 = predict(y,x,m2)

Accuracy = 75.91% (7591/10000) (classification)
Wall time: 1.3 s


In [7]:
%%time

#kernel method, gaussian
prob3 = svm_problem(y,x)
param3 = '-t 2'
m3 = svm_train(prob3, param3)
pl3, pa3, pv3 = svm_predict(y,x,m3)


Accuracy = 73.95% (7395/10000) (classification)
Wall time: 41.1 s


In [8]:
#applied pca to 2d for visualization
pca = PCA(n_components=2)          
pca = pca.fit(x)                    
X_dr = pca.transform(x)   

In [9]:
colors = ['green', 'red', 'blue']

for i in range(len(colors)):
    plt.scatter(X_dr[y == i, 0] ,X_dr[y == i, 1], alpha=0.5, c=colors[i])
plt.show()


In [10]:
#applied pca reduce to 5d
#and trained linear, primal, dual, kernal again
pca = PCA(n_components=5)          
pca = pca.fit(x)                    
X_dr2 = pca.transform(x)   


In [11]:
%%time
prob4 = train(y, X_dr2, '-s 0')
pl4, pa4, pv4 = predict(y, X_dr, prob4)

Accuracy = 55.32% (5532/10000) (classification)
Wall time: 658 ms


In [12]:
%%time
prob4 = train(y, X_dr2, '-s 1')
pl4, pa4, pv4 = predict(y, X_dr, prob4)

Accuracy = 55.38% (5538/10000) (classification)
Wall time: 684 ms


In [13]:
%%time
prob4 = train(y, X_dr2, '-s 2')
pl4, pa4, pv4 = predict(y, X_dr, prob4)

Accuracy = 55.38% (5538/10000) (classification)
Wall time: 672 ms


In [14]:
%%time
prob4 = svm_train(y, X_dr2, '-t 2')
pl4, pa4, pv4 = svm_predict(y, X_dr, prob4)

Accuracy = 61.05% (6105/10000) (classification)
Wall time: 6.35 s
