In [1]:
!pip install optuna -q

In [2]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import optuna

In [3]:
np.random.seed(42)
random.seed(42)

In [4]:

X_test_ = pd.read_csv('../data/Xte.csv',sep=',',index_col=0)
X_train_ = pd.read_csv('../data/Xtr.csv',sep=',',index_col=0)

X_test_mat100 = pd.read_csv('../data/Xte_mat100.csv',sep=' ',header=None).values
X_train_mat100 = pd.read_csv('../data/Xtr_mat100.csv',sep=' ',header=None).values

y = pd.read_csv('../data/Ytr.csv',sep=',',index_col=0)

In [5]:
y['Bound'] = y.Bound.apply(lambda x: -1 if x == 0 else 1)
y.head()
y = y.Bound.values
y

array([ 1, -1,  1, ...,  1,  1,  1])

In [8]:

def get_train_test(X,y,p):
    X = X

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=p, random_state=42)
    print(X_train.shape,X_test.shape,y_train.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

# Model Test

In [9]:
def rbf_kernel_element_wise(x, y, sigma=1):
    K =  np.exp(-np.sum((x-y)**2)/(2*sigma**2))
    return K

def rbf_kernel(X1, X2, sigma=10):
    X2_norm = np.sum(X2 ** 2, axis = -1)
    X1_norm = np.sum(X1 ** 2, axis = -1)
    gamma = 1 / (2 * sigma ** 2)
    K = np.exp(- gamma * (X1_norm[:, None] + X2_norm[None, :] - 2 * np.dot(X1, X2.T)))
    return K

def sigma_from_median(X):
    pairwise_diff = X[:, :, None] - X[:, :, None].T
    pairwise_diff *= pairwise_diff
    euclidean_dist = np.sqrt(pairwise_diff.sum(axis=1))
    return np.median(euclidean_dist)

def gaussian_kernel(x, y, sigma=5.0):
    return np.exp(-np.linalg.norm(x-y)**2 / (2 * (sigma ** 2)))

def linear_kernel(x1, x2):
    return np.dot(x1, x2)

def polynomial_kernel(X1, X2, power=2):
    return np.power((1 + linear_kernel(X1, X2)),power)


def LevenshteinDistance(str1,str2):
    '''
    Compute the edit distance between str1 and str2
    Param: @(str1): (str) string 1 for the comparison
    @(str2): (str) string 2 for the comparison
    Return (int) distance
    '''
    len_s1 = len(str1) +1
    len_s2 = len(str2) +1
    m = np.zeros((len_s1,len_s2))
    for i in range(len_s1):
        m[i,0] = i
    
    for j in range(len_s2):
        m[0,j] = j
    
    for i in range(1,len_s1):
        for j in range(1,len_s2):
            if str1[i-1]==str2[j-1]:
                m[i,j]= min(m[i-1,j]+1,m[i,j-1]+1,m[i-1,j-1])
            else:
                m[i,j] =min(m[i-1,j]+1,m[i,j-1]+1,m[i-1,j-1]+1)
    return m[-1,-1]


def rbf_kernel(X1, X2, sigma=10):
    X2_norm = np.sum(X2 ** 2, axis = -1)
    X1_norm = np.sum(X1 ** 2, axis = -1)
    gamma = 1 / (2 * sigma ** 2)
    K = np.exp(- gamma * (X1_norm[:, None] + X2_norm[None, :] - 2 * np.dot(X1, X2.T)))
    return K

def sigma_from_median(X):
    pairwise_diff = X[:, :, None] - X[:, :, None].T
    pairwise_diff *= pairwise_diff
    euclidean_dist = np.sqrt(pairwise_diff.sum(axis=1))
    return np.median(euclidean_dist)

def linear_kernel(X1, X2):
    return X1.dot(X2.T)

In [10]:
class KernelMethodBase(object):
    '''
    Base class for kernel methods models
    
    Methods
    ----
    fit
    predict
    '''
    kernels_ = {
        'linear': linear_kernel,
        'polynomial': polynomial_kernel,
        'rbf': rbf_kernel,
        'gaussian':gaussian_kernel
    }
    def __init__(self, kernel='linear', **kwargs):
        self.kernel_name = kernel
        self.kernel_function_ = self.kernels_[kernel]
        self.kernel_parameters = self.get_kernel_parameters(**kwargs)
        
    def get_kernel_parameters(self, **kwargs):
        params = {}
        if self.kernel_name == 'rbf' or self.kernel_name == 'gaussian':
            params['sigma'] = kwargs.get('sigma', None)
        if self.kernel_name == 'polynomial':
            params['power'] = kwargs.get('power', None)
            
        
        return params

    def fit(self, X, y, **kwargs):
        return self
        
    def decision_function(self, X):
        pass

    def predict(self, X):
        pass

In [11]:
class KernelRidgeRegression(KernelMethodBase):
    '''
    Kernel Ridge Regression
    '''
    def __init__(self, lambd=0.1, **kwargs):
        self.lambd = lambd
        # Python 3: replace the following line by
        # super().__init__(**kwargs)
        super(KernelRidgeRegression, self).__init__(**kwargs)

    def fit(self, X, y, sample_weights=None):
        n, p = X.shape
        assert (n == len(y))
    
        self.X_train = X
        self.y_train = y
        
        if sample_weights is not None:
            w_sqrt = np.sqrt(sample_weights)
            self.X_train = self.X_train * w_sqrt[:, None]
            self.y_train = self.y_train * w_sqrt
        
        A = self.kernel_function_(X,X,**self.kernel_parameters)
        A[np.diag_indices_from(A)] = np.add(A[np.diag_indices_from(A)],n*self.lambd)
        # self.alpha = (K + n lambda I)^-1 y
        self.alpha = np.linalg.solve(A , self.y_train)

        return self
    
    def decision_function(self, X):
        K_x = self.kernel_function_(X,self.X_train, **self.kernel_parameters)
        return K_x.dot(self.alpha)
    
    def predict(self, X):
        return self.decision_function(X)

In [12]:
def cross_validate(x_data,y_data,kernel=None,lambd=0.2,sigma=0.5,k=5,power=2):
    if len(x_data)%k != 0:
        print('cant vsplit',len(x_data),' by ',k)
        return
    
    x_data_splitted = np.vsplit(x_data,k)
    y_data_splitted = np.vsplit(y_data.reshape(-1,1),k)
    
    aggrigate_result = []
    for i in range(len(x_data_splitted)):
        train = []
        test = []
        items = [j for j in range(len(x_data_splitted)) if j !=i ]
        x_test = x_data_splitted[i]
        y_test = y_data_splitted[i]
        for item in items:
            if len(train) == 0:
                x_train = x_data_splitted[item]
                y_train = y_data_splitted[item]
            else:
                x_train = np.concatenate((x_train,x_data_splitted[item]), axis=0)
                y_train = np.concatenate((y_train,y_data_splitted[item]), axis=0)
            
            
        model = KernelRidgeRegression(
                kernel=kernel,
                lambd=lambd,
                sigma=sigma,
                power=power
            ).fit(x_train, y_train)
        result = sum(np.sign(model.predict(x_test))==y_test)/len(y_test)
        aggrigate_result.append(result)
        
        value = sum(aggrigate_result)/len(aggrigate_result)
    return value

In [13]:
train_data = pd.concat([X_train_ , pd.DataFrame(y)],axis=1)

def getKmers(sequence, size=6):
    return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)]



In [14]:
# from sklearn.feature_extraction.text import CountVectorizer

# train_data['words'] = train_data.seq.apply(lambda x: ' '.join(getKmers(x)))
# X_test_['words'] = X_test_.seq.apply(lambda x: ' '.join(getKmers(x)))
# train_data.shape

# data = pd.DataFrame(pd.concat([train_data.words,X_test_.words],axis=0))
# train_text = data.words.values

# cv = CountVectorizer(ngram_range=(2,2),max_features=1500,min_df=10,binary=True)
# X = cv.fit_transform(train_text)
# X = X.todense()


# X.shape

In [15]:
# cross_validate(np.array(X)[:2000,:],y.values,kernel='polynomial',lambd=0.001,k=4,sigma=0.2,power=5)

In [16]:
# def objective(trial):
#     lambd = trial.suggest_loguniform('lambd', 1e-7, 3)
#     sigma = trial.suggest_loguniform('sigma', 1e-7, 3)
#     k =  trial.suggest_categorical('k', [4,5,8,10])
#     power =  trial.suggest_int('power', 2,15)
#     kernel =  trial.suggest_categorical('kernel', ['linear','rbf','polynomial'])
    
#     return cross_validate(np.array(X)[:2000,:],y,kernel=kernel,lambd=lambd,k=4,sigma=sigma,power=power)


# # cross_validate(X_train_mat100, y,lamda=0.01,k=4)
# import optuna

# sampler = optuna.samplers.TPESampler()
# study = optuna.create_study(sampler=sampler, direction='maximize')
# df = study.optimize(func=objective, n_trials=1000,show_progress_bar=True)


# df = study.trials_dataframe().drop(['state','datetime_start','datetime_complete'], axis=1)
# df.sort_values(by=['value'])

In [17]:
def base2int(c):
    return {'a':0,'c':1,'g':2,'t':3}.get(c,0)

def index(kmer):
    base_idx = np.array([base2int(base) for base in kmer])
    multiplier = 4** np.arange(len(kmer))
    kmer_idx = multiplier.dot(base_idx)
    return kmer_idx
    
    
def spectral_embedding(sequence,kmer_size=3):
    kmers = getKmers(sequence,kmer_size)
    kmer_idxs = [index(kmer) for kmer in kmers]
    one_hot_vector = np.zeros(4**kmer_size)
    
    for kmer_idx in kmer_idxs:
        one_hot_vector[kmer_idx] += 1
    return one_hot_vector


def get_data(kmer_size):
    data = pd.DataFrame(pd.concat([X_train_.seq,X_test_.seq],axis=0))
    train_text = data.seq.values
    # X_train_['kmers'] = X_train_.seq.apply(lambda x:list(spectral_embedding(x,kmer_size=3)))
    kmer_data = []
    for i in train_text:
        kmer_data.append(spectral_embedding(i,kmer_size=kmer_size))

    return np.array(kmer_data)

In [19]:
def objective(trial):
    lambd = trial.suggest_float('lambd', 1e-5, 100.0)
    sigma = trial.suggest_loguniform('sigma', 10, 150)
    k =  trial.suggest_categorical('k', [4,5,8])
    power =  trial.suggest_int('power', 1,5)
    kmer_size =  trial.suggest_int('kmer_size', 1,8)
    kernel =  trial.suggest_categorical('kernel', ['linear','polynomial'])
    
    return cross_validate(get_data(kmer_size)[:2000,:],y,kernel=kernel,lambd=lambd,k=k,sigma=sigma,power=power)


# cross_validate(X_train_mat100, y,lamda=0.01,k=4)
import optuna

sampler = optuna.samplers.TPESampler()
study = optuna.create_study(sampler=sampler, direction='maximize')
df = study.optimize(func=objective, n_trials=500,show_progress_bar=True)


Progress bar is experimental (supported from v1.2.0). The interface can change in the future.



HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))

[32m[I 2020-05-30 13:23:01,370][0m Finished trial#0 with value: 0.5005 with parameters: {'lambd': 75.1938446075039, 'sigma': 16.392892709968, 'k': 8, 'power': 5, 'kmer_size': 2, 'kernel': 'polynomial'}. Best is trial#0 with value: 0.5005.[0m
[32m[I 2020-05-30 13:23:03,631][0m Finished trial#1 with value: 0.506 with parameters: {'lambd': 74.89883184310443, 'sigma': 71.24686145943308, 'k': 8, 'power': 4, 'kmer_size': 3, 'kernel': 'linear'}. Best is trial#1 with value: 0.506.[0m
[32m[I 2020-05-30 13:23:06,244][0m Finished trial#2 with value: 0.5835 with parameters: {'lambd': 5.110117354471174, 'sigma': 41.161232470083206, 'k': 4, 'power': 1, 'kmer_size': 5, 'kernel': 'polynomial'}. Best is trial#2 with value: 0.5835.[0m
[32m[I 2020-05-30 13:23:08,817][0m Finished trial#3 with value: 0.55 with parameters: {'lambd': 80.27664631166901, 'sigma': 43.83339095598882, 'k': 4, 'power': 3, 'kmer_size': 5, 'kernel': 'linear'}. Best is trial#2 with value: 0.5835.[0m
[32m[I 2020-05-30 13:

In [22]:
df = study.trials_dataframe().drop(['state','datetime_start','datetime_complete'], axis=1)
df.sort_values(by=['value'])

Unnamed: 0,number,value,duration,params_k,params_kernel,params_kmer_size,params_lambd,params_power,params_sigma
329,329,0.4955,00:00:02.024122,4,linear,1,0.014099,4,53.884549
57,57,0.4960,00:00:01.988776,4,linear,1,0.512914,5,100.211668
0,0,0.5005,00:00:02.258647,8,polynomial,2,75.193845,5,16.392893
1,1,0.5060,00:00:02.249841,8,linear,3,74.898832,4,71.246861
8,8,0.5205,00:00:02.146181,5,polynomial,1,83.124006,5,22.412224
...,...,...,...,...,...,...,...,...,...
106,106,0.6510,00:00:04.176095,4,linear,7,0.368218,3,121.384454
67,67,0.6510,00:00:04.167203,4,linear,7,0.454321,5,70.041000
66,66,0.6510,00:00:04.238394,4,linear,7,0.363969,4,89.678389
108,108,0.6515,00:00:04.294228,4,linear,7,0.747316,3,125.336845


In [32]:
X_train, X_test, y_train, y_test = get_train_test(get_data(7)[:2000,:],y,0.3)

(1400, 16384) (600, 16384) (1400,) (600,)


In [33]:
model = KernelRidgeRegression(
                kernel='linear',
                lambd=0.688381,
                sigma=93.801110,
                power=4
            ).fit(X_train, y_train)
result = sum(np.sign(model.predict(X_test))==y_test)/len(y_test)
result

0.6766666666666666

In [34]:
cross_validate(get_data(7)[:2000,:],y,kernel='linear',
                lambd=0.688381,
                sigma=93.801110,
                power=4,
                k = 4)

array([0.652])

In [35]:
X_test_final = get_data(7)[2000:,:]


sumbission = []
for i in range(len(X_test_final)):
    r1 = np.sign(model.predict(X_test_final[i]))
    
    if r1 == 1:
        sumbission.append([i,int(r1)])
    elif r1 == -1:
        sumbission.append([i,0])
    else:
        print('problem')
        
    
# sumbission
df = pd.DataFrame(sumbission)
df.columns = ['Id','Bound']
df.to_csv('cv_65.75_linear_overfitted.csv',index=False)

df.head(15)

Unnamed: 0,Id,Bound
0,0,1
1,1,0
2,2,0
3,3,0
4,4,1
5,5,1
6,6,1
7,7,1
8,8,1
9,9,1


In [118]:
df.shape

(1000, 2)