In [298]:
!pip install optuna -q

In [314]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

import optuna

In [315]:

X_test = pd.read_csv('../data/Xte.csv',sep=',',index_col=0).values
X_train = pd.read_csv('../data/Xtr.csv',sep=',',index_col=0).values

X_test_mat100 = pd.read_csv('../data/Xte_mat100.csv',sep=' ',header=None).values
X_train_mat100 = pd.read_csv('../data/Xtr_mat100.csv',sep=' ',header=None).values

y = pd.read_csv('../data/Ytr.csv',sep=',',index_col=0)

In [316]:
y['Bound'] = y.Bound.apply(lambda x: -1 if x == 0 else 1)
y.head()
y = y.Bound.values
y

array([ 1, -1,  1, ...,  1,  1,  1])

In [322]:
def getKmers(sequence, size):
    return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)]
def get_n_grams(data1,data2,n):
    X_train = []
    X_test = []

    cv = CountVectorizer()
    for i in data1:
        sentence = ' '.join(getKmers(i[0], size=n))
        X_train.append(sentence)

    for i in data2:
        sentence = ' '.join(getKmers(i[0], size=n))
        X_test.append(sentence)

    
    X_cocat = X_train + X_test
    X = cv.fit_transform(X_cocat).toarray()
    
    return X

In [323]:
X_preprocess = get_n_grams(X_train,X_test,5)

In [324]:
print('x_train: {} y_train {}'.format(X_preprocess[:2000,:].shape,y.shape))
print('test: {}'.format(X_preprocess[2000:,:].shape))

x_train: (2000, 1024) y_train (2000,)
test: (1000, 1024)


In [328]:
X_preprocess_train = scale(X_preprocess[:2000,:])
X_preprocess_test = scale(X_preprocess[2000:,:])

X_train, X_val, y_train, y_val = train_test_split(
    X_preprocess_train, y, test_size=0.3, random_state=42)

print(X_train.shape,X_val.shape,y_train.shape, y_val.shape)

(1400, 1024) (600, 1024) (1400,) (600,)


# Model Test

In [329]:
def rbf_kernel_element_wise(x, y, sigma=1):
    K =  np.exp(-np.sum((x-y)**2)/(2*sigma**2))
    return K

def rbf_kernel(X1, X2, sigma=10):
    X2_norm = np.sum(X2 ** 2, axis = -1)
    X1_norm = np.sum(X1 ** 2, axis = -1)
    gamma = 1 / (2 * sigma ** 2)
    K = np.exp(- gamma * (X1_norm[:, None] + X2_norm[None, :] - 2 * np.dot(X1, X2.T)))
    return K

def sigma_from_median(X):
    pairwise_diff = X[:, :, None] - X[:, :, None].T
    pairwise_diff *= pairwise_diff
    euclidean_dist = np.sqrt(pairwise_diff.sum(axis=1))
    return np.median(euclidean_dist)

def gaussian_kernel(x, y, sigma=5.0):
    return np.exp(-np.linalg.norm(x-y)**2 / (2 * (sigma ** 2)))


def linear_kernel(x1, x2):
    return np.dot(x1, x2)

def polynomial_kernel(x, y, p=2):
    return (1 + np.dot(x, y)) ** p

In [357]:
class KernelRidge():
    def __init__(self,kernel=None,sigma=None, lambd=0.1,p=None):
        kernels = {'linear':linear_kernel,
                    'rbf':rbf_kernel,
                    'polynomial':polynomial_kernel}
        self.kernel_name= kernel
        self.kernel = kernels[kernel]
        self.sigma = sigma
        self.lambd = lambd
        self.p = p
        self.n = None
        

    def fit(self, X, y):
        n, p = X.shape
        self.n = n
        assert (n == len(y))
    
        self.X_train = X
        if self.kernel_name == 'rbf':
            if self.sigma is None:
                self.sigma = sigma_from_median(X)
            A = self.kernel(X,X,sigma=self.sigma)+(n*self.lambd *np.eye(n))
        elif self.kernel_name == 'polynomial':
            A = self.kernel(X,X.T,self.p)+(n*self.lambd *np.eye(n))
        else:
            A = self.kernel(X,X.T)+(n*self.lambd *np.eye(n))
        
        # self.alpha = (K + n lambda I)^-1 y
        self.alpha = np.linalg.solve(A,y)

        return self
        
    def predict(self, X):
        # Prediction rule:
        if self.kernel_name == 'rbf':
             K_x = self.kernel(X,self.X_train,sigma=self.sigma)+self.n*self.lambd *np.eye(self.n)
        elif self.kernel_name == 'polynomial':
            K_x = self.kernel(X,X.T,self.p)+self.n*self.lambd *np.eye(self.n)
        else:
            K_x = self.kernel(X,self.X_train.T)
            
        return K_x.dot(self.alpha)

In [358]:
def cross_validate(x_data,y_data,kernel=None,lamda=0.2,k=5,p=None):
    if len(x_data)%k != 0:
        print('cant vsplit',len(x_data),' by ',k)
        return
    
    x_data_splitted = np.vsplit(x_data,k)
    y_data_splitted = np.vsplit(y_data.reshape(-1,1),k)
    
    aggrigate_result = []
    for i in range(len(x_data_splitted)):
        train = []
        test = []
        items = [j for j in range(len(x_data_splitted)) if j !=i ]
        x_test = x_data_splitted[i]
        y_test = y_data_splitted[i]
        for item in items:
            if len(train) == 0:
                x_train = x_data_splitted[item]
                y_train = y_data_splitted[item]
            else:
                x_train = np.concatenate((x_train,x_data_splitted[item]), axis=0)
                y_train = np.concatenate((y_train,y_data_splitted[item]), axis=0)
            
        model = KernelRidge(kernel=kernel,lambd=lamda, sigma=None,p=p)
        model = model.fit(x_train, y_train)
        
        #sum(np.sign(model.predict(X_train))==y_train)/len(y_train)
        result = sum(np.sign(model.predict(x_test))==y_test)/len(y_test)
        aggrigate_result.append(result)
        
        value = sum(aggrigate_result)/len(aggrigate_result)
    return value

In [359]:
cross_validate(X_preprocess_train, y,kernel='rbf',lamda=0.001,k=4,p=2)

array([0.544])

In [360]:
def objective(trial):
    lamda = trial.suggest_loguniform('lamda', 1e-3, 0.5)
    k =  trial.suggest_categorical('k', [4,5,8,10])
    p =  trial.suggest_int('p', 2,5)
    kernel =  trial.suggest_categorical('kernel', ['linear','rbf','polynomial'])
    
    return cross_validate(X_preprocess_train, y,kernel=kernel,lamda=lamda,k=k,p=p)

In [342]:
# cross_validate(X_train_mat100, y,lamda=0.01,k=4)
import optuna

sampler = optuna.samplers.TPESampler()
study = optuna.create_study(sampler=sampler, direction='maximize')
study.optimize(func=objective, n_trials=200,show_progress_bar=True)


Progress bar is experimental (supported from v1.2.0). The interface can change in the future.



HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))

[32m[I 2020-05-26 21:48:35,937][0m Finished trial#0 with value: 0.5810000000000001 with parameters: {'lamda': 0.0022408964743672113, 'k': 8, 'p': 4, 'kernel': 'linear'}. Best is trial#0 with value: 0.5810000000000001.[0m
[32m[I 2020-05-26 21:48:36,130][0m Finished trial#1 with value: 0.5705 with parameters: {'lamda': 0.0016897241140727808, 'k': 10, 'p': 5, 'kernel': 'linear'}. Best is trial#0 with value: 0.5810000000000001.[0m
[32m[I 2020-05-26 21:48:36,327][0m Finished trial#2 with value: 0.578 with parameters: {'lamda': 0.0882197576099006, 'k': 5, 'p': 3, 'kernel': 'linear'}. Best is trial#0 with value: 0.5810000000000001.[0m
[32m[I 2020-05-26 21:48:36,485][0m Finished trial#3 with value: 0.5825 with parameters: {'lamda': 0.06298614144040181, 'k': 8, 'p': 3, 'kernel': 'linear'}. Best is trial#3 with value: 0.5825.[0m
[32m[I 2020-05-26 21:48:36,761][0m Finished trial#4 with value: 0.483 with parameters: {'lamda': 0.1335669125229923, 'k': 8, 'p': 5, 'kernel': 'polynomial'

In [352]:
df = study.trials_dataframe().drop(['state','datetime_start','datetime_complete'], axis=1)
df.sort_values(by=['value'])

Unnamed: 0,number,value,duration,params_k,params_kernel,params_lamda,params_p
197,197,0.4830,00:00:03.006370,8,rbf,0.495006,3
4,4,0.4830,00:00:00.272812,8,polynomial,0.133567,5
10,10,0.4830,00:00:03.057621,8,rbf,0.021250,2
50,50,0.4830,00:00:03.032546,8,rbf,0.278620,5
7,7,0.4835,00:00:02.951925,8,rbf,0.003816,2
...,...,...,...,...,...,...,...
156,156,0.5940,00:00:00.189364,4,linear,0.487480,2
158,158,0.5940,00:00:00.189769,4,linear,0.497074,2
164,164,0.5940,00:00:00.219594,4,linear,0.496951,3
198,198,0.5940,00:00:00.197004,4,linear,0.444473,2


In [None]:

X_test = pd.read_csv('../data/Xte.csv',sep=',',index_col=0).values
X_train = pd.read_csv('../data/Xtr.csv',sep=',',index_col=0).values

X_test_mat100 = pd.read_csv('../data/Xte_mat100.csv',sep=' ',header=None).values
X_train_mat100 = pd.read_csv('../data/Xtr_mat100.csv',sep=' ',header=None).values

y = pd.read_csv('../data/Ytr.csv',sep=',',index_col=0)