In [1]:
!pip install optuna -q

In [2]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

import optuna

In [3]:
np.random.seed(42)
random.seed(42)

In [4]:

X_test_ = pd.read_csv('../data/Xte.csv',sep=',',index_col=0)
X_train_ = pd.read_csv('../data/Xtr.csv',sep=',',index_col=0)

X_test_mat100 = pd.read_csv('../data/Xte_mat100.csv',sep=' ',header=None).values
X_train_mat100 = pd.read_csv('../data/Xtr_mat100.csv',sep=' ',header=None).values

y = pd.read_csv('../data/Ytr.csv',sep=',',index_col=0)

In [5]:
y['Bound'] = y.Bound.apply(lambda x: -1 if x == 0 else 1)
y.head()
y = y.Bound.values
y

array([ 1, -1,  1, ...,  1,  1,  1])

In [6]:
# print('x_train: {} y_train {}'.format(X_preprocess[:2000,:].shape,y.shape))
# print('test: {}'.format(X_preprocess[2000:,:].shape))

In [30]:

def get_train_test(X,y,p):
    X = scale(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=p, random_state=42)
    print(X_train.shape,X_test.shape,y_train.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

# Model Test

In [8]:
def rbf_kernel_element_wise(x, y, sigma=1):
    K =  np.exp(-np.sum((x-y)**2)/(2*sigma**2))
    return K

def rbf_kernel(X1, X2, sigma=10):
    X2_norm = np.sum(X2 ** 2, axis = -1)
    X1_norm = np.sum(X1 ** 2, axis = -1)
    gamma = 1 / (2 * sigma ** 2)
    K = np.exp(- gamma * (X1_norm[:, None] + X2_norm[None, :] - 2 * np.dot(X1, X2.T)))
    return K

def sigma_from_median(X):
    pairwise_diff = X[:, :, None] - X[:, :, None].T
    pairwise_diff *= pairwise_diff
    euclidean_dist = np.sqrt(pairwise_diff.sum(axis=1))
    return np.median(euclidean_dist)

def gaussian_kernel(x, y, sigma=5.0):
    return np.exp(-np.linalg.norm(x-y)**2 / (2 * (sigma ** 2)))


def linear_kernel(x1, x2):
    return np.dot(x1, x2)

def polynomial_kernel(x, y, p=2):
    return (1 + np.dot(x, y)) ** p


def LevenshteinDistance(str1,str2):
    '''
    Compute the edit distance between str1 and str2
    Param: @(str1): (str) string 1 for the comparison
    @(str2): (str) string 2 for the comparison
    Return (int) distance
    '''
    len_s1 = len(str1) +1
    len_s2 = len(str2) +1
    m = np.zeros((len_s1,len_s2))
    for i in range(len_s1):
        m[i,0] = i
    
    for j in range(len_s2):
        m[0,j] = j
    
    for i in range(1,len_s1):
        for j in range(1,len_s2):
            if str1[i-1]==str2[j-1]:
                m[i,j]= min(m[i-1,j]+1,m[i,j-1]+1,m[i-1,j-1])
            else:
                m[i,j] =min(m[i-1,j]+1,m[i,j-1]+1,m[i-1,j-1]+1)
    return m[-1,-1]


def rbf_kernel(X1, X2, sigma=10):
    '''
    Returns the kernel matrix K(X1_i, X2_j): size (n1, n2)
    where K is the RBF kernel with parameter sigma
    
    Input:
    ------
    X1: an (n1, p) matrix
    X2: an (n2, p) matrix
    sigma: float
    '''
    # For loop with rbf_kernel_element works but is slow in python
    # Use matrix operations!
    X2_norm = np.sum(X2 ** 2, axis = -1)
    X1_norm = np.sum(X1 ** 2, axis = -1)
    gamma = 1 / (2 * sigma ** 2)
    K = np.exp(- gamma * (X1_norm[:, None] + X2_norm[None, :] - 2 * np.dot(X1, X2.T)))
    return K

def sigma_from_median(X):
    '''
    Returns the median of ||Xi-Xj||
    
    Input
    -----
    X: (n, p) matrix
    '''
    pairwise_diff = X[:, :, None] - X[:, :, None].T
    pairwise_diff *= pairwise_diff
    euclidean_dist = np.sqrt(pairwise_diff.sum(axis=1))
    return np.median(euclidean_dist)

def linear_kernel(X1, X2):
    '''
    Returns the kernel matrix K(X1_i, X2_j): size (n1, n2)
    where K is the linear kernel
    
    Input:
    ------
    X1: an (n1, p) matrix
    X2: an (n2, p) matrix
    '''
    return X1.dot(X2.T)

def quadratic_kernel(X1, X2,power=2):
    '''
    Returns the kernel matrix K(X1_i, X2_j): size (n1, n2)
    where K is the quadratic kernel
    
    Input:
    ------
    X1: an (n1, p) matrix
    X2: an (n2, p) matrix
    '''
    return (1 + linear_kernel(X1, X2))**power

# distance = np.zeros((len(X_preprocess),len(X_preprocess)))
# for i in range(len(X_preprocess)):
#     for j in range(len(X_preprocess)):
#         distance[i,j] = LevenshteinDistance(X_preprocess[i],X_preprocess[j])

In [9]:
class KernelMethodBase(object):
    '''
    Base class for kernel methods models
    
    Methods
    ----
    fit
    predict
    '''
    kernels_ = {
        'linear': linear_kernel,
        'quadratic': quadratic_kernel,
        'rbf': rbf_kernel,
        'gaussian':gaussian_kernel
    }
    def __init__(self, kernel='linear', **kwargs):
        self.kernel_name = kernel
        self.kernel_function_ = self.kernels_[kernel]
        self.kernel_parameters = self.get_kernel_parameters(**kwargs)
        
    def get_kernel_parameters(self, **kwargs):
        params = {}
        if self.kernel_name == 'rbf':
            params['sigma'] = kwargs.get('sigma', None)
        return params

    def fit(self, X, y, **kwargs):
        return self
        
    def decision_function(self, X):
        pass

    def predict(self, X):
        pass

In [10]:
class KernelRidgeRegression(KernelMethodBase):
    '''
    Kernel Ridge Regression
    '''
    def __init__(self, lambd=0.1, **kwargs):
        self.lambd = lambd
        # Python 3: replace the following line by
        # super().__init__(**kwargs)
        super(KernelRidgeRegression, self).__init__(**kwargs)

    def fit(self, X, y, sample_weights=None):
        n, p = X.shape
        assert (n == len(y))
    
        self.X_train = X
        self.y_train = y
        
        if sample_weights is not None:
            w_sqrt = np.sqrt(sample_weights)
            self.X_train = self.X_train * w_sqrt[:, None]
            self.y_train = self.y_train * w_sqrt
        
        A = self.kernel_function_(X,X,**self.kernel_parameters)
        A[np.diag_indices_from(A)] = np.add(A[np.diag_indices_from(A)],n*self.lambd)
        # self.alpha = (K + n lambda I)^-1 y
        self.alpha = np.linalg.solve(A , self.y_train)

        return self
    
    def decision_function(self, X):
        K_x = self.kernel_function_(X,self.X_train, **self.kernel_parameters)
        return K_x.dot(self.alpha)
    
    def predict(self, X):
        return self.decision_function(X)

In [11]:
def cross_validate(x_data,y_data,kernel=None,lambd=0.2,sigma=0.5,k=5,power=2):
    if len(x_data)%k != 0:
        print('cant vsplit',len(x_data),' by ',k)
        return
    
    x_data_splitted = np.vsplit(x_data,k)
    y_data_splitted = np.vsplit(y_data.reshape(-1,1),k)
    
    aggrigate_result = []
    for i in range(len(x_data_splitted)):
        train = []
        test = []
        items = [j for j in range(len(x_data_splitted)) if j !=i ]
        x_test = x_data_splitted[i]
        y_test = y_data_splitted[i]
        for item in items:
            if len(train) == 0:
                x_train = x_data_splitted[item]
                y_train = y_data_splitted[item]
            else:
                x_train = np.concatenate((x_train,x_data_splitted[item]), axis=0)
                y_train = np.concatenate((y_train,y_data_splitted[item]), axis=0)
            
            
        model = KernelRidgeRegression(
                kernel=kernel,
                lambd=lambd,
                sigma=sigma,
                power=power
            ).fit(x_train, y_train)
        result = sum(np.sign(model.predict(x_test))==y_test)/len(y_test)
        aggrigate_result.append(result)
        
        value = sum(aggrigate_result)/len(aggrigate_result)
    return value

In [12]:
# cross_validate(X_preprocess_train, y,kernel='',lambd=0.001,k=4,sigma=0.2)

In [13]:
train_data = pd.concat([X_train_ , pd.DataFrame(y)],axis=1)

def getKmers(sequence, size=6):
    return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)]

train_data['words'] = train_data.seq.apply(lambda x: ' '.join(getKmers(x)))
X_test_['words'] = X_test_.seq.apply(lambda x: ' '.join(getKmers(x)))
train_data.head(2)

Unnamed: 0,seq,0,words
0,GAGGGGCTGGGGAGGGGGCTGGCCCAGAGGCACCAGACTCTGCAGA...,1,gagggg aggggc ggggct gggctg ggctgg gctggg ctgg...
1,CGGCCTGGGGGCCACATGTGAGTGCTTACCTGTGTGGGGATGAGGG...,-1,cggcct ggcctg gcctgg cctggg ctgggg tggggg gggg...


In [24]:
from sklearn.feature_extraction.text import CountVectorizer


data = pd.DataFrame(pd.concat([train_data.words,X_test_.words],axis=0))

train_text = data.words.values

cv = CountVectorizer(ngram_range=(2,2),max_features=1500,min_df=10,binary=True)
X = cv.fit_transform(train_text)
X = X.todense()


X.shape

(3000, 1500)

In [25]:
cross_validate(np.array(X)[:2000,:],y,kernel='linear',lambd=0.001,k=4,sigma=0.2)

array([0.6005])

In [26]:
def objective(trial):
    lambd = trial.suggest_loguniform('lambd', 1e-5, 1)
    sigma = trial.suggest_loguniform('sigma', 1e-5, 1)
    k =  trial.suggest_categorical('k', [4,5,8,10])
    p =  trial.suggest_int('p', 2,15)
    kernel =  trial.suggest_categorical('kernel', ['linear','rbf','quadratic'])
    
    return cross_validate(np.array(X)[:2000,:],y,kernel=kernel,lambd=lambd,k=4,sigma=sigma,power=p)

In [27]:
# cross_validate(X_train_mat100, y,lamda=0.01,k=4)
import optuna

sampler = optuna.samplers.TPESampler()
study = optuna.create_study(sampler=sampler, direction='maximize')
df = study.optimize(func=objective, n_trials=200,show_progress_bar=True)


Progress bar is experimental (supported from v1.2.0). The interface can change in the future.



HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))

[32m[I 2020-05-28 12:36:00,507][0m Finished trial#0 with value: 0.641 with parameters: {'lambd': 0.40813845189682924, 'sigma': 0.0014596561743742342, 'k': 4, 'p': 9, 'kernel': 'linear'}. Best is trial#0 with value: 0.641.[0m
[32m[I 2020-05-28 12:36:03,380][0m Finished trial#1 with value: 0.62 with parameters: {'lambd': 0.00646555934554108, 'sigma': 0.12798174146391406, 'k': 4, 'p': 9, 'kernel': 'linear'}. Best is trial#0 with value: 0.641.[0m
[32m[I 2020-05-28 12:36:06,263][0m Finished trial#2 with value: 0.643 with parameters: {'lambd': 0.21968056609450912, 'sigma': 0.000760249383742939, 'k': 8, 'p': 10, 'kernel': 'linear'}. Best is trial#2 with value: 0.643.[0m
[32m[I 2020-05-28 12:36:09,233][0m Finished trial#3 with value: 0.0 with parameters: {'lambd': 0.13364239946343992, 'sigma': 2.944935058784592e-05, 'k': 4, 'p': 10, 'kernel': 'rbf'}. Best is trial#2 with value: 0.643.[0m
[32m[I 2020-05-28 12:36:12,169][0m Finished trial#4 with value: 0.0205 with parameters: {'lam

In [28]:
df = study.trials_dataframe().drop(['state','datetime_start','datetime_complete'], axis=1)
df.sort_values(by=['value'])

Unnamed: 0,number,value,duration,params_k,params_kernel,params_lambd,params_p,params_sigma
69,69,0.000,00:00:02.940976,5,rbf,0.070842,11,0.000025
128,128,0.000,00:00:02.954188,8,rbf,0.115490,6,0.000695
3,3,0.000,00:00:02.964010,4,rbf,0.133642,10,0.000029
5,5,0.000,00:00:02.938261,5,rbf,0.000015,9,0.002442
98,98,0.000,00:00:07.319899,8,rbf,0.081789,6,0.000319
...,...,...,...,...,...,...,...,...
92,92,0.647,00:00:02.875500,8,quadratic,0.071926,5,0.000620
101,101,0.647,00:00:06.887361,8,quadratic,0.071123,4,0.000567
158,158,0.647,00:00:02.886835,8,quadratic,0.070415,5,0.000352
111,111,0.647,00:00:02.897981,8,quadratic,0.070282,5,0.000545


In [36]:
X_train, X_test, y_train, y_test = get_train_test(np.array(X)[:2000,:],y,0.01)

(1980, 1500) (20, 1500) (1980,) (20,)


In [37]:
model = KernelRidgeRegression(
                kernel='quadratic',
                lambd=0.071162,
                sigma=0.000267,
                power=5
            ).fit(X_train, y_train)
result = sum(np.sign(model.predict(X_test))==y_test)/len(y_test)
result

0.6

In [35]:
cross_validate(np.array(X)[:2000,:],y,kernel='quadratic',lambd=0.071162,k=4,sigma=0.000267)

array([0.647])

In [44]:
X_test_final = scale(np.array(X)[2000:,:])


sumbission = []
for i in range(len(X_test_final)):
    r1 = np.sign(model.predict(X_test_final[i]))
    
    if r1 == 1:
        sumbission.append([i,int(r1)])
    elif r1 == -1:
        sumbission.append([i,0])
    else:
        print('problem')
        
    
# sumbission
df = pd.DataFrame(sumbission)
df.columns = ['Id','Bound']
df.to_csv('cv_Kernel_ridge_quadratic_64.7.csv',index=False)

df.head(15)

Unnamed: 0,Id,Bound
0,0,1
1,1,1
2,2,0
3,3,0
4,4,1
5,5,1
6,6,0
7,7,1
8,8,1
9,9,1


In [43]:
df.shape

(1000, 2)