In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import svm
from sklearn.metrics import accuracy_score
#from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier

# SVM tuning

In [2]:
# Read the encoded train/test tables and the protein identifiers from disk.
train, test, protein_id = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train_after_encode.csv',
        '../data/test_after_encode.csv',
        '../data/protein_id.csv',
    )
)
# Separate the target column 'y' from the remaining feature columns.
# No hold-out split is made here; the full training table is used.
y_train = train['y']
X_train = train.drop(columns='y')

In [3]:
# Dimensions of the training feature matrix as (rows, columns).
X_train.shape

(862, 537)

In [4]:
# Shape of the training label vector (one entry per training row).
y_train.shape

(862,)

In [5]:
# Convert both the feature matrix and the label vector to 64-bit floats.
X_train, y_train = (
    data.astype(np.float64) for data in (X_train, y_train)
)

## linear kernel

In [6]:
# Search grid for the linear kernel: C takes every power of two
# from 2**-5 up to and including 2**8.
params_linear = {'C': [2**exponent for exponent in range(-5, 9)]}

In [7]:
# Base estimator: a linear-kernel SVC. C is not set here because the
# grid search supplies each candidate value from params_linear.
linear = svm.SVC(kernel='linear')
# 10-fold cross-validated grid search, fitted on the full training set.
lin_clf = GridSearchCV(estimator=linear, param_grid=params_linear, cv=10).fit(X_train, y_train)
# Show the fitted search object as the cell output.
lin_clf



GridSearchCV(cv=10, estimator=SVC(kernel='linear'),
             param_grid={'C': [0.03125, 0.0625, 0.125, 0.25, 0.5, 1, 2, 4, 8,
                               16, 32, 64, 128, 256]})

In [8]:
# Value of C chosen by the linear-kernel grid search.
lin_clf.best_params_

{'C': 0.5}

In [9]:
# Keep a handle on the estimator the grid search refit with the best C.
best_lin = lin_clf.best_estimator_
# (disabled) alternative that fits a linear SVC with a hand-picked C=180:
#best_lin = svm.SVC(kernel='linear', C=180).fit(X_train,y_train)

In [10]:
# Mean cross-validated score of the best linear-kernel candidate
# (no `scoring` was passed, so this is the estimator's default score: accuracy).
lin_clf.best_score_

0.6705960973001872

In [11]:
# Pickle the fitted linear-kernel grid-search object (the whole GridSearchCV,
# not only its best estimator) to disk.
pd.to_pickle(lin_clf, '../best models/SVM_linear.pickle')

## Gaussian (RBF) kernel

In [12]:
# Search grid for the RBF kernel.
#   C     : powers of two from 2**-5 through 2**8
#   gamma : powers of two at the listed exponents (coarser steps for the small values)
params_gau = {
    'C': [2**exponent for exponent in range(-5, 9)],
    'gamma': [2**exponent for exponent in (-10, -8, -6, -4, -2, -1, 0, 1, 2, 3)],
}

In [13]:
# Base estimator: an RBF-kernel SVC. C and gamma are not set here because
# the grid search supplies every (C, gamma) combination from params_gau.
gaussian = svm.SVC(kernel='rbf')
# 10-fold cross-validated grid search, fitted on the full training set.
gau_clf = GridSearchCV(estimator=gaussian, param_grid=params_gau, cv=10).fit(X_train, y_train)
# Show the fitted search object as the cell output.
gau_clf



GridSearchCV(cv=10, estimator=SVC(),
             param_grid={'C': [0.03125, 0.0625, 0.125, 0.25, 0.5, 1, 2, 4, 8,
                               16, 32, 64, 128, 256],
                         'gamma': [0.0009765625, 0.00390625, 0.015625, 0.0625,
                                   0.25, 0.5, 1, 2, 4, 8]})

In [14]:
# Mean cross-validated score of the best RBF-kernel candidate
# (no `scoring` was passed, so this is the estimator's default score: accuracy).
gau_clf.best_score_

0.6660384923817162

In [15]:
# Keep a handle on the estimator the grid search refit with the best (C, gamma).
best_gau = gau_clf.best_estimator_

In [16]:
# (C, gamma) combination chosen by the RBF-kernel grid search.
gau_clf.best_params_

{'C': 128, 'gamma': 0.0009765625}

## polynomial kernel

In [17]:
# Search grid for the polynomial kernel.
#   C      : powers of two from 2**-5 through 2**10
#   degree : polynomial degrees 1 to 4
params_poly = {
    'C': [2**exponent for exponent in range(-5, 11)],
    'degree': list(range(1, 5)),
}

In [18]:
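# NOTE: disabled code kept for reference only. X_val / y_val are never defined
# in this notebook, so the validation check below cannot run as written.
# fit poly model
#poly = svm.SVC(kernel='poly', degree=1, C=180).fit(X_train,y_train)
# poly accuracy on validation sets
#accuracy_score(y_val, poly.predict(X_val))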

In [19]:
# Base estimator: a polynomial-kernel SVC. C and degree are not set here
# because the grid search supplies every combination from params_poly.
poly = svm.SVC(kernel='poly')
# 10-fold cross-validated grid search, fitted on the full training set.
poly_clf = GridSearchCV(estimator=poly, param_grid=params_poly, cv=10).fit(X_train, y_train)
# Show the fitted search object as the cell output.
poly_clf



GridSearchCV(cv=10, estimator=SVC(kernel='poly'),
             param_grid={'C': [0.03125, 0.0625, 0.125, 0.25, 0.5, 1, 2, 4, 8,
                               16, 32, 64, 128, 256, 512, 1024],
                         'degree': [1, 2, 3, 4]})

In [20]:
# Keep a handle on the estimator the grid search refit with the best (C, degree).
best_poly = poly_clf.best_estimator_

In [21]:
# (C, degree) combination chosen by the polynomial-kernel grid search.
poly_clf.best_params_

{'C': 64, 'degree': 1}

In [22]:
# Mean cross-validated score of the best polynomial-kernel candidate
# (no `scoring` was passed, so this is the estimator's default score: accuracy).
poly_clf.best_score_

0.6717455225875435

The following cells build the submission file from the test-set predictions.

In [23]:
# Predict a class for every test row with the tuned RBF-kernel SVM.
# NOTE(review): in the recorded outputs best_gau has the lowest cross-validated
# accuracy of the three tuned kernels (about 0.666 vs 0.671 linear and
# 0.672 poly) -- confirm that submitting this model is intentional.
y_pred_test = best_gau.predict(test)
# The labels were cast to float64 before fitting, so convert the predictions
# back to integer class ids.
y_pred_test = y_pred_test.astype(int)

In [24]:
# Inspect the predicted integer labels for the test set.
y_pred_test

array([ 5,  0,  0,  3,  0,  1,  0,  0,  5,  1,  1,  1,  0,  1,  0,  0,  1,
        1,  0,  0,  0,  3,  0,  0,  1,  0,  1,  0,  1,  0,  0,  0,  2,  0,
        0,  1,  2,  0, 10,  1,  0,  0,  0,  0,  0,  1,  1,  7,  1,  0,  7,
        0,  5,  0,  1,  0,  1,  4,  1,  1,  1,  0,  0,  3,  1,  1,  1,  5,
        0,  1,  0,  5,  0,  1,  0,  3,  1,  1,  0,  0,  3,  0,  1,  0,  0,
        0,  2,  0,  3,  2,  0,  3,  5,  0,  1,  3,  1,  3,  3,  1,  0,  0,
        1,  0,  3,  0,  4,  0,  1,  1,  6,  0,  3,  4,  0,  0,  1,  0,  0,
        0,  0,  0,  1,  0,  0,  0,  0,  1,  0,  1,  0,  1,  0,  5,  4,  1,
        0,  2,  1,  1,  0,  8,  0,  0,  2,  8,  1,  3,  1,  1,  1,  1,  1,
        2,  0,  1,  1,  0,  0,  0,  0,  1,  0,  1,  0,  0,  0,  6,  0,  0,
        3,  0,  5,  3,  6,  0,  1,  3,  0,  1,  0,  1,  4,  1,  0,  3,  0,
        1,  0,  6,  0,  0,  0,  1,  4,  3,  1,  0,  7,  2,  1,  0,  2,  0,
        2,  0,  0,  1,  3,  1,  0,  0,  1,  0,  6,  0,  5,  0,  0,  0,  1,
        7,  1,  1,  8,  1

In [25]:
# Build the submission table in one pass: pair each protein id (column '0'
# of protein_id) with its predicted label, index by id, then sort by id.
res = (
    pd.DataFrame({'key': protein_id['0'], 'label': y_pred_test})
    .set_index('key')
    .sort_index()
)

In [26]:
# Display the id-indexed, id-sorted submission table.
res

Unnamed: 0_level_0,label
key,Unnamed: 1_level_1
P234062,2
P234081,1
P234086,2
P234087,0
P234094,0
...,...
P240380,0
P240407,2
P240440,1
P240462,1


In [27]:
# Write the submission table to CSV; the 'key' index is written as the first column.
res.to_csv('../result/submission_svm.csv')