In [1]:
!pip install cvxopt -q
!pip install optuna -q

In [2]:
import sklearn
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
import optuna
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, precision_score
from numpy import linalg
import cvxopt
import cvxopt.solvers
import sklearn

import os
cvxopt.solvers.options['show_progress'] = False

In [3]:

X_test_ = pd.read_csv('../data/Xte.csv',sep=',',index_col=0)
X_train_ = pd.read_csv('../data/Xtr.csv',sep=',',index_col=0)

X_test_mat100 = pd.read_csv('../data/Xte_mat100.csv',sep=' ',header=None).values
X_train_mat100 = pd.read_csv('../data/Xtr_mat100.csv',sep=' ',header=None).values

y = pd.read_csv('../data/Ytr.csv',sep=',',index_col=0)

In [4]:
y['Bound'] = y.Bound.apply(lambda x: -1 if x == 0 else 1)
y.head()
y = y.Bound.values
y

array([ 1, -1,  1, ...,  1,  1,  1])

In [5]:
def get_train_test(X,y,p):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=p, random_state=42)
    print(X_train.shape,X_test.shape,y_train.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

def getKmers(sequence, size=6):
    return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)]

def base2int(c):
    return {'a':0,'c':1,'g':2,'t':3}.get(c,0)

def index(kmer):
    base_idx = np.array([base2int(base) for base in kmer])
    multiplier = 4** np.arange(len(kmer))
    kmer_idx = multiplier.dot(base_idx)
    return kmer_idx
    
    
def spectral_embedding(sequence,kmer_size=3):
    kmers = getKmers(sequence,kmer_size)
    kmer_idxs = [index(kmer) for kmer in kmers]
    one_hot_vector = np.zeros(4**kmer_size)
    
    for kmer_idx in kmer_idxs:
        one_hot_vector[kmer_idx] += 1
    return one_hot_vector


def get_data(kmer_size):
    data = pd.DataFrame(pd.concat([X_train_.seq,X_test_.seq],axis=0))
    train_text = data.seq.values
    # X_train_['kmers'] = X_train_.seq.apply(lambda x:list(spectral_embedding(x,kmer_size=3)))
    kmer_data = []
    for i in train_text:
        kmer_data.append(spectral_embedding(i,kmer_size=kmer_size))

    return np.array(kmer_data)

In [6]:
def rbf_kernel_element_wise(x, y, sigma=1):
    K =  np.exp(-np.sum((x-y)**2)/(2*sigma**2))
    return K

def rbf_kernel(X1, X2, sigma=10):
    X2_norm = np.sum(X2 ** 2, axis = -1)
    X1_norm = np.sum(X1 ** 2, axis = -1)
    gamma = 1 / (2 * sigma ** 2)
    K = np.exp(- gamma * (X1_norm[:, None] + X2_norm[None, :] - 2 * np.dot(X1, X2.T)))
    return K

def sigma_from_median(X):
    pairwise_diff = X[:, :, None] - X[:, :, None].T
    pairwise_diff *= pairwise_diff
    euclidean_dist = np.sqrt(pairwise_diff.sum(axis=1))
    return np.median(euclidean_dist)

def gaussian_kernel(x, y, sigma=5.0):
    return np.exp(-np.linalg.norm(x-y)**2 / (2 * (sigma ** 2)))

def linear_kernel(x1, x2):
    return np.dot(x1, x2)

def polynomial_kernel(X1, X2, power=2):
    return np.power((1 + linear_kernel(X1, X2)),power)


def rbf_kernel(X1, X2, sigma=10):
    X2_norm = np.sum(X2 ** 2, axis = -1)
    X1_norm = np.sum(X1 ** 2, axis = -1)
    gamma = 1 / (2 * sigma ** 2)
    K = np.exp(- gamma * (X1_norm[:, None] + X2_norm[None, :] - 2 * np.dot(X1, X2.T)))
    return K

def sigma_from_median(X):
    pairwise_diff = X[:, :, None] - X[:, :, None].T
    pairwise_diff *= pairwise_diff
    euclidean_dist = np.sqrt(pairwise_diff.sum(axis=1))
    return np.median(euclidean_dist)

def linear_kernel(X1, X2):
    return X1.dot(X2.T)

In [7]:
class KernelMethodBase(object):
    '''
    Base class for kernel methods models
    
    Methods
    ----
    fit
    predict
    '''
    kernels_ = {
        'linear': linear_kernel,
        'polynomial': polynomial_kernel,
        'rbf': rbf_kernel,
        'gaussian':gaussian_kernel
    }
    def __init__(self, kernel='linear', **kwargs):
        self.kernel_name = kernel
        self.kernel_function_ = self.kernels_[kernel]
        self.kernel_parameters = self.get_kernel_parameters(**kwargs)
        
    def get_kernel_parameters(self, **kwargs):
        params = {}
        if self.kernel_name == 'rbf' or self.kernel_name == 'gaussian':
            params['sigma'] = kwargs.get('sigma', None)
        if self.kernel_name == 'polynomial':
            params['power'] = kwargs.get('power', None)
            
        
        return params

    def fit(self, X, y, **kwargs):
        return self
        
    def decision_function(self, X):
        pass

    def predict(self, X):
        pass

In [17]:



class KernelSVM(KernelMethodBase):
    def __init__(self, C=0.1, **kwargs):
        self.C = C
        # Python 3: replace the following line by
        # super().__init__(**kwargs)
        super(KernelSVM, self).__init__(**kwargs)
        
    def cvxopt_qp(self,P, q, G, h, A, b):
        P = .5 * (P + P.T)
        cvx_matrices = [
            cvxopt.matrix(M) if M is not None else None for M in [P, q, G, h, A, b] 
        ]
        #cvxopt.solvers.options['show_progress'] = False
        solution = cvxopt.solvers.qp(*cvx_matrices, options={'show_progress': False})
        return np.array(solution['x']).flatten()
    
    def svm_dual_soft_to_qp_kernel(self,K, y, C=1):
        n = K.shape[0]
        assert (len(y) == n)

        # Dual formulation, soft margin
        # P = np.diag(y) @ K @ np.diag(y)
        P = np.diag(y).dot(K).dot(np.diag(y))
        # As a regularization, we add epsilon * identity to P
        eps = 1e-12
        P += eps * np.eye(n)
        q = - np.ones(n)
        G = np.vstack([-np.eye(n), np.eye(n)])
        h = np.hstack([np.zeros(n), C * np.ones(n)])
        A = y[np.newaxis, :]
        b = np.array([0.])
        return P, q, G, h, A.astype(float), b


    def fit(self, X, y, tol=1e-8):
        n, p = X.shape
        assert (n == len(y))
    
        self.X_train = X
        self.y_train = y
        
        # Kernel matrix
        K = self.kernel_function_(
            self.X_train, self.X_train, **self.kernel_parameters)
        
        # Solve dual problem
        self.alpha = self.cvxopt_qp(*self.svm_dual_soft_to_qp_kernel(K, y, C=self.C))
        
        # Compute support vectors and bias b
        sv = np.logical_and((self.alpha > tol), (self.C - self.alpha > tol))
        self.bias = y[sv] - K[sv].dot(self.alpha * y)
        self.bias = self.bias.mean()

        self.support_vector_indices = np.nonzero(sv)[0]

        return self
        
    def decision_function(self, X):
        K_x = self.kernel_function_(X, self.X_train, **self.kernel_parameters)
        return K_x.dot(self.alpha * self.y_train) + self.bias

    def predict(self, X):
        return np.sign(self.decision_function(X))
    
    


(1400, 4096) (600, 4096) (1400,) (600,)


0.6583333333333333

In [19]:
def cross_validate(x_data,y_data,lr=None,kernel=None,lambd=0.2,C=3,sigma=0.5,k=5,power=2):
    if len(x_data)%k != 0:
        print('cant vsplit',len(x_data),' by ',k)
        return
    
    x_data_splitted = np.vsplit(x_data,k)
    y_data_splitted = np.vsplit(y_data.reshape(-1,1),k)
    
    aggrigate_result = []
    for i in range(len(x_data_splitted)):
        train = []
        test = []
        items = [j for j in range(len(x_data_splitted)) if j !=i ]
        x_test = x_data_splitted[i]
        y_test = y_data_splitted[i]
        for item in items:
            if len(train) == 0:
                x_train = x_data_splitted[item]
                y_train = y_data_splitted[item]
            else:
                x_train = np.concatenate((x_train,x_data_splitted[item]), axis=0)
                y_train = np.concatenate((y_train,y_data_splitted[item]), axis=0)
        
        model = KernelSVM(C=C,
                          kernel=kernel,
                          lambd=lambd,
                          sigma=sigma,
                          power=power)
        model.fit(x_train, y_train.flatten())
        y_pred = model.predict(x_test)

        result = sum((y_pred.flatten()==y_test.flatten()))/len(y_test)
            
        aggrigate_result.append(result)
        
        value = sum(aggrigate_result)/len(aggrigate_result)
    return value

In [29]:
def objective(trial):
    lambd = trial.suggest_float('lambd', 1e-5, 100.0)
    sigma = trial.suggest_float('sigma', 1e-5, 150)
    k =  trial.suggest_categorical('k', [4,5,8])
    C =  trial.suggest_float('C', 0.1,50)
    power =  trial.suggest_int('power', 2,5)
#     kmer_size =  trial.suggest_int('kmer_size', 3,8)
#     kernel =  trial.suggest_categorical('kernel', ['linear','rbf','gaussian_kernel','polynomial'])
#     model_name
    
    return cross_validate(get_data(6)[:2000,:],y,kernel='polynomial',k=4,sigma=sigma,power=power,C=C)

# cross_validate(X_train_mat100, y,lamda=0.01,k=4)
import optuna

sampler = optuna.samplers.TPESampler()
study = optuna.create_study(sampler=sampler, direction='maximize')
df = study.optimize(func=objective, n_trials=1000,show_progress_bar=True)


Progress bar is experimental (supported from v1.2.0). The interface can change in the future.



HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

[32m[I 2020-05-31 13:01:19,418][0m Finished trial#0 with value: 0.612 with parameters: {'lambd': 80.11675843459803, 'sigma': 98.30353640078559, 'k': 5, 'C': 0.6039284433351343, 'power': 3}. Best is trial#0 with value: 0.612.[0m
[32m[I 2020-05-31 13:01:23,428][0m Finished trial#1 with value: 0.612 with parameters: {'lambd': 79.82900921988283, 'sigma': 59.84800596432479, 'k': 8, 'C': 32.32172447722335, 'power': 3}. Best is trial#0 with value: 0.612.[0m
[32m[I 2020-05-31 13:01:27,378][0m Finished trial#2 with value: 0.6015 with parameters: {'lambd': 1.310158489808897, 'sigma': 0.7730575451278214, 'k': 8, 'C': 20.360529955188415, 'power': 4}. Best is trial#0 with value: 0.612.[0m
[32m[I 2020-05-31 13:01:31,123][0m Finished trial#3 with value: 0.6455 with parameters: {'lambd': 65.59188543846157, 'sigma': 146.7465401451798, 'k': 4, 'C': 3.0521146298665878, 'power': 2}. Best is trial#3 with value: 0.6455.[0m
[32m[I 2020-05-31 13:01:34,950][0m Finished trial#4 with value: 0.6455 


Mean of empty slice.


invalid value encountered in double_scalars



[32m[I 2020-05-31 13:01:38,810][0m Finished trial#5 with value: 0.0 with parameters: {'lambd': 52.96236174864882, 'sigma': 137.95187790436904, 'k': 4, 'C': 46.18292294045505, 'power': 5}. Best is trial#3 with value: 0.6455.[0m
[32m[I 2020-05-31 13:01:42,610][0m Finished trial#6 with value: 0.6455 with parameters: {'lambd': 63.90058621209748, 'sigma': 14.159516065576364, 'k': 4, 'C': 18.210042860372322, 'power': 2}. Best is trial#3 with value: 0.6455.[0m
[32m[I 2020-05-31 13:01:46,752][0m Finished trial#7 with value: 0.612 with parameters: {'lambd': 91.26636336671538, 'sigma': 105.65377541198846, 'k': 5, 'C': 44.643516074508085, 'power': 3}. Best is trial#3 with value: 0.6455.[0m
[32m[I 2020-05-31 13:01:50,645][0m Finished trial#8 with value: 0.612 with parameters: {'lambd': 89.16008637769038, 'sigma': 76.22028667816654, 'k': 8, 'C': 27.282775607818092, 'power': 3}. Best is trial#3 with value: 0.6455.[0m
[32m[I 2020-05-31 13:01:54,546][0m Finished trial#9 with value: 0.0 w

In [30]:
cross_validate(get_data(6)[:2000,:],y,kernel='polynomial',k=4,sigma=50,power=2,C=1)

0.6455

In [18]:
X_train, X_test, y_train, y_test = get_train_test(get_data(6)[:2000,:],y,p=0.3)


kernel = 'polynomial'
power = 2
sigma = 50.
C = 1.
model = KernelSVM(C=C, kernel=kernel, sigma=sigma, power=power)
y_pred = model.fit(X_train, y_train).predict(X_test)

sum(y_pred==y_test)/len(y_test)


X_test_final  = model.predict(get_data(6)[2000:,:])
sumbission = []
for i in range(len(X_test_final)):
    r1 = X_test_final[i]
    if r1 == 1:
        sumbission.append([i,int(r1)])
    elif r1 == -1:
        sumbission.append([i,0])
    else:
        print('problem')
        
    
# sumbission
df = pd.DataFrame(sumbission)
df.columns = ['Id','Bound']
df.to_csv('cv_-svm.csv',index=False)

df.head(15)

Unnamed: 0,Id,Bound
0,0,1
1,1,1
2,2,0
3,3,0
4,4,0
5,5,1
6,6,1
7,7,1
8,8,1
9,9,1


0.6583333333333333

In [77]:
y_train

array([-1,  1, -1, ..., -1,  1,  1])

In [14]:
type(y_train)

numpy.ndarray