In [1]:
!pip install optuna -q

In [2]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

import optuna

In [32]:

X_test_ = pd.read_csv('../data/Xte.csv',sep=',',index_col=0).values
X_train_ = pd.read_csv('../data/Xtr.csv',sep=',',index_col=0).values

X_test_mat100 = pd.read_csv('../data/Xte_mat100.csv',sep=' ',header=None).values
X_train_mat100 = pd.read_csv('../data/Xtr_mat100.csv',sep=' ',header=None).values

y = pd.read_csv('../data/Ytr.csv',sep=',',index_col=0)

In [12]:
y['Bound'] = y.Bound.apply(lambda x: -1 if x == 0 else 1)
y.head()
y = y.Bound.values
y

array([ 1, -1,  1, ...,  1,  1,  1])

In [13]:
print('x_train: {} y_train {}'.format(X_preprocess[:2000,:].shape,y.shape))
print('test: {}'.format(X_preprocess[2000:,:].shape))

x_train: (2000, 1024) y_train (2000,)
test: (1000, 1024)


In [14]:

def test_data(X,y):
    results = {}
    X=scale(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    classifier = SVC(kernel = 'linear', C = 0.1, gamma = 0.01)
    classifier.fit(X_train, y_train)
    results['svm'] = classifier.score(X_test,y_test)

    clf = LogisticRegression(random_state=0,penalty='l2').fit(X, y)
    results['lr'] = clf.score(X_test,y_test)
    
    return results

# Model Test

In [15]:
def rbf_kernel_element_wise(x, y, sigma=1):
    K =  np.exp(-np.sum((x-y)**2)/(2*sigma**2))
    return K

def rbf_kernel(X1, X2, sigma=10):
    X2_norm = np.sum(X2 ** 2, axis = -1)
    X1_norm = np.sum(X1 ** 2, axis = -1)
    gamma = 1 / (2 * sigma ** 2)
    K = np.exp(- gamma * (X1_norm[:, None] + X2_norm[None, :] - 2 * np.dot(X1, X2.T)))
    return K

def sigma_from_median(X):
    pairwise_diff = X[:, :, None] - X[:, :, None].T
    pairwise_diff *= pairwise_diff
    euclidean_dist = np.sqrt(pairwise_diff.sum(axis=1))
    return np.median(euclidean_dist)

def gaussian_kernel(x, y, sigma=5.0):
    return np.exp(-np.linalg.norm(x-y)**2 / (2 * (sigma ** 2)))


def linear_kernel(x1, x2):
    return np.dot(x1, x2)

def polynomial_kernel(x, y, p=2):
    return (1 + np.dot(x, y)) ** p


def LevenshteinDistance(str1,str2):
    '''
    Compute the edit distance between str1 and str2
    Param: @(str1): (str) string 1 for the comparison
    @(str2): (str) string 2 for the comparison
    Return (int) distance
    '''
    len_s1 = len(str1) +1
    len_s2 = len(str2) +1
    m = np.zeros((len_s1,len_s2))
    for i in range(len_s1):
        m[i,0] = i
    
    for j in range(len_s2):
        m[0,j] = j
    
    for i in range(1,len_s1):
        for j in range(1,len_s2):
            if str1[i-1]==str2[j-1]:
                m[i,j]= min(m[i-1,j]+1,m[i,j-1]+1,m[i-1,j-1])
            else:
                m[i,j] =min(m[i-1,j]+1,m[i,j-1]+1,m[i-1,j-1]+1)
    return m[-1,-1]


def rbf_kernel(X1, X2, sigma=10):
    '''
    Returns the kernel matrix K(X1_i, X2_j): size (n1, n2)
    where K is the RBF kernel with parameter sigma
    
    Input:
    ------
    X1: an (n1, p) matrix
    X2: an (n2, p) matrix
    sigma: float
    '''
    # For loop with rbf_kernel_element works but is slow in python
    # Use matrix operations!
    X2_norm = np.sum(X2 ** 2, axis = -1)
    X1_norm = np.sum(X1 ** 2, axis = -1)
    gamma = 1 / (2 * sigma ** 2)
    K = np.exp(- gamma * (X1_norm[:, None] + X2_norm[None, :] - 2 * np.dot(X1, X2.T)))
    return K

def sigma_from_median(X):
    '''
    Returns the median of ||Xi-Xj||
    
    Input
    -----
    X: (n, p) matrix
    '''
    pairwise_diff = X[:, :, None] - X[:, :, None].T
    pairwise_diff *= pairwise_diff
    euclidean_dist = np.sqrt(pairwise_diff.sum(axis=1))
    return np.median(euclidean_dist)

def linear_kernel(X1, X2):
    '''
    Returns the kernel matrix K(X1_i, X2_j): size (n1, n2)
    where K is the linear kernel
    
    Input:
    ------
    X1: an (n1, p) matrix
    X2: an (n2, p) matrix
    '''
    return X1.dot(X2.T)

def quadratic_kernel(X1, X2,power=2):
    '''
    Returns the kernel matrix K(X1_i, X2_j): size (n1, n2)
    where K is the quadratic kernel
    
    Input:
    ------
    X1: an (n1, p) matrix
    X2: an (n2, p) matrix
    '''
    return (1 + linear_kernel(X1, X2))**power

# distance = np.zeros((len(X_preprocess),len(X_preprocess)))
# for i in range(len(X_preprocess)):
#     for j in range(len(X_preprocess)):
#         distance[i,j] = LevenshteinDistance(X_preprocess[i],X_preprocess[j])

In [16]:
class KernelMethodBase(object):
    '''
    Base class for kernel methods models
    
    Methods
    ----
    fit
    predict
    '''
    kernels_ = {
        'linear': linear_kernel,
        'quadratic': quadratic_kernel,
        'rbf': rbf_kernel,
    }
    def __init__(self, kernel='linear', **kwargs):
        self.kernel_name = kernel
        self.kernel_function_ = self.kernels_[kernel]
        self.kernel_parameters = self.get_kernel_parameters(**kwargs)
        
    def get_kernel_parameters(self, **kwargs):
        params = {}
        if self.kernel_name == 'rbf':
            params['sigma'] = kwargs.get('sigma', None)
        return params

    def fit(self, X, y, **kwargs):
        return self
        
    def decision_function(self, X):
        pass

    def predict(self, X):
        pass

In [106]:
class KernelRidgeRegression(KernelMethodBase):
    '''
    Kernel Ridge Regression
    '''
    def __init__(self, lambd=0.1, **kwargs):
        self.lambd = lambd
        # Python 3: replace the following line by
        # super().__init__(**kwargs)
        super(KernelRidgeRegression, self).__init__(**kwargs)

    def fit(self, X, y, sample_weights=None):
        n, p = X.shape
        assert (n == len(y))
    
        self.X_train = X
        self.y_train = y
        
        if sample_weights is not None:
            w_sqrt = np.sqrt(sample_weights)
            self.X_train = self.X_train * w_sqrt[:, None]
            self.y_train = self.y_train * w_sqrt
        
        A = self.kernel_function_(X,X,**self.kernel_parameters)
        A[np.diag_indices_from(A)] = np.add(A[np.diag_indices_from(A)],n*self.lambd)
        # self.alpha = (K + n lambda I)^-1 y
        self.alpha = np.linalg.solve(A , self.y_train)

        return self
    
    def decision_function(self, X):
        K_x = self.kernel_function_(X,self.X_train, **self.kernel_parameters)
        return K_x.dot(self.alpha)
    
    def predict(self, X):
        return self.decision_function(X)

In [107]:
def cross_validate(x_data,y_data,kernel=None,lambd=0.2,sigma=0.5,k=5,p=None):
    if len(x_data)%k != 0:
        print('cant vsplit',len(x_data),' by ',k)
        return
    
    x_data_splitted = np.vsplit(x_data,k)
    y_data_splitted = np.vsplit(y_data.reshape(-1,1),k)
    
    aggrigate_result = []
    for i in range(len(x_data_splitted)):
        train = []
        test = []
        items = [j for j in range(len(x_data_splitted)) if j !=i ]
        x_test = x_data_splitted[i]
        y_test = y_data_splitted[i]
        for item in items:
            if len(train) == 0:
                x_train = x_data_splitted[item]
                y_train = y_data_splitted[item]
            else:
                x_train = np.concatenate((x_train,x_data_splitted[item]), axis=0)
                y_train = np.concatenate((y_train,y_data_splitted[item]), axis=0)
            
            
        model = KernelRidgeRegression(
                kernel=kernel,
                lambd=lambd,
                sigma=sigma
            ).fit(x_train, y_train)
        result = sum(np.sign(model.predict(x_test))==y_test)/len(y_test)
        aggrigate_result.append(result)
        
        value = sum(aggrigate_result)/len(aggrigate_result)
    return value

In [109]:
# cross_validate(X_preprocess_train, y,kernel='',lambd=0.001,k=4,sigma=0.2)

In [116]:
# Count Vectorizer 
def get_n_grams(data1):
    cv = CountVectorizer(analyzer='char',ngram_range=(2,2))
    X = cv.fit_transform(data1).toarray()
    return X

X_preprocess = get_n_grams(X_train_.flatten())
X_preprocess.shape

(2000, 16)

In [117]:
np.array(X_list).shape

(2000, 101)

In [118]:
def objective(trial):
    lambd = trial.suggest_loguniform('lambd', 1e-3, 1)
    sigma = trial.suggest_loguniform('sigma', 1e-3, 1)
    k =  trial.suggest_categorical('k', [4,5,8,10])
    p =  trial.suggest_int('p', 2,5)
    kernel =  trial.suggest_categorical('kernel', ['linear','rbf','quadratic'])
    
    return cross_validate(X_preprocess, y.values,kernel=kernel,lambd=lambd,k=4,sigma=sigma)

In [119]:
# cross_validate(X_train_mat100, y,lamda=0.01,k=4)
import optuna

sampler = optuna.samplers.TPESampler()
study = optuna.create_study(sampler=sampler, direction='maximize')
df = study.optimize(func=objective, n_trials=200,show_progress_bar=True)



HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))

[I 2020-05-27 16:05:30,829] Finished trial#0 with value: 0.501 with parameters: {'lambd': 0.6788678496366254, 'sigma': 0.00411658973562302, 'k': 5, 'p': 4, 'kernel': 'rbf'}. Best is trial#0 with value: 0.501.
[I 2020-05-27 16:05:31,180] Finished trial#1 with value: 0.501 with parameters: {'lambd': 0.0014996022869998345, 'sigma': 0.01216365965492881, 'k': 8, 'p': 2, 'kernel': 'rbf'}. Best is trial#0 with value: 0.501.
[I 2020-05-27 16:05:31,538] Finished trial#2 with value: 0.5135000000000001 with parameters: {'lambd': 0.6820659847107553, 'sigma': 0.230051993650724, 'k': 8, 'p': 3, 'kernel': 'rbf'}. Best is trial#2 with value: 0.5135000000000001.
[I 2020-05-27 16:05:31,996] Finished trial#3 with value: 0.499 with parameters: {'lambd': 0.045612012186555195, 'sigma': 0.01588475033514627, 'k': 4, 'p': 3, 'kernel': 'linear'}. Best is trial#2 with value: 0.5135000000000001.
[I 2020-05-27 16:05:32,496] Finished trial#4 with value: 0.494 with parameters: {'lambd': 0.002230690129533121, 'sigma'

[I 2020-05-27 16:05:56,466] Finished trial#71 with value: 0.502 with parameters: {'lambd': 0.01399229128279502, 'sigma': 0.29747252901325943, 'k': 10, 'p': 2, 'kernel': 'rbf'}. Best is trial#13 with value: 0.5155000000000001.
[I 2020-05-27 16:05:56,853] Finished trial#72 with value: 0.513 with parameters: {'lambd': 0.011657912918665117, 'sigma': 0.26325053010347005, 'k': 10, 'p': 2, 'kernel': 'rbf'}. Best is trial#13 with value: 0.5155000000000001.
[I 2020-05-27 16:05:57,199] Finished trial#73 with value: 0.497 with parameters: {'lambd': 0.00710289569975837, 'sigma': 0.20168405789230837, 'k': 10, 'p': 2, 'kernel': 'rbf'}. Best is trial#13 with value: 0.5155000000000001.
[I 2020-05-27 16:05:57,618] Finished trial#74 with value: 0.5145 with parameters: {'lambd': 0.01325051181591142, 'sigma': 0.24710342693128137, 'k': 10, 'p': 2, 'kernel': 'rbf'}. Best is trial#13 with value: 0.5155000000000001.
[I 2020-05-27 16:05:57,973] Finished trial#75 with value: 0.5035000000000001 with parameters: 

[I 2020-05-27 16:06:22,120] Finished trial#141 with value: 0.4975 with parameters: {'lambd': 0.017414809379686383, 'sigma': 0.1973289080819464, 'k': 8, 'p': 2, 'kernel': 'rbf'}. Best is trial#138 with value: 0.518.
[I 2020-05-27 16:06:22,516] Finished trial#142 with value: 0.5005 with parameters: {'lambd': 0.010985077188685426, 'sigma': 0.31262224770865243, 'k': 8, 'p': 2, 'kernel': 'rbf'}. Best is trial#138 with value: 0.518.
[I 2020-05-27 16:06:22,891] Finished trial#143 with value: 0.5125 with parameters: {'lambd': 0.014536752353473207, 'sigma': 0.24916147359670227, 'k': 8, 'p': 2, 'kernel': 'rbf'}. Best is trial#138 with value: 0.518.
[I 2020-05-27 16:06:23,252] Finished trial#144 with value: 0.4985 with parameters: {'lambd': 0.024426141342966213, 'sigma': 0.2184133465608059, 'k': 10, 'p': 2, 'kernel': 'rbf'}. Best is trial#138 with value: 0.518.
[I 2020-05-27 16:06:23,677] Finished trial#145 with value: 0.4975 with parameters: {'lambd': 0.018121298468190678, 'sigma': 0.37312646678

In [None]:
df = study.trials_dataframe().drop(['state','datetime_start','datetime_complete'], axis=1)
df.sort_values(by=['value'])

In [1]:

X_test = pd.read_csv('../data/Xte.csv',sep=',',index_col=0).values
X_train = pd.read_csv('../data/Xtr.csv',sep=',',index_col=0).values

X_test_mat100 = pd.read_csv('../data/Xte_mat100.csv',sep=' ',header=None).values
X_train_mat100 = pd.read_csv('../data/Xtr_mat100.csv',sep=' ',header=None).values

y = pd.read_csv('../data/Ytr.csv',sep=',',index_col=0)

NameError: name 'pd' is not defined

In [2]:
def LevenshteinDistance(str1,str2):
    '''
    Compute the edit distance between str1 and str2
    Param: @(str1): (str) string 1 for the comparison
    @(str2): (str) string 2 for the comparison
    Return (int) distance
    '''
    len_s1 = len(str1) +1
    len_s2 = len(str2) +1
    m = np.zeros((len_s1,len_s2))
    for i in range(len_s1):
        m[i,0] = i
    
    for j in range(len_s2):
        m[0,j] = j
    
    for i in range(1,len_s1):
        for j in range(1,len_s2):
            if str1[i-1]==str2[j-1]:
                m[i,j]= min(m[i-1,j]+1,m[i,j-1]+1,m[i-1,j-1])
            else:
                m[i,j] =min(m[i-1,j]+1,m[i,j-1]+1,m[i-1,j-1]+1)
    return m[-1,-1]

In [6]:
import numpy as np
LevenshteinDistance('acgt','ggt')

2.0