In [1]:
!pip install cvxopt -q
!pip install optuna -q

In [2]:
import sklearn
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
import optuna
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, precision_score
from numpy import linalg
import cvxopt
import cvxopt.solvers
import sklearn

import os
# Turn off cvxopt's per-iteration solver log for every later solver call.
cvxopt.solvers.options['show_progress'] = False

In [3]:

# Raw tables: comma-separated, first column used as the row index.
# Later cells read a `seq` column from both frames.
X_test_ = pd.read_csv('../data/Xte.csv',sep=',',index_col=0)
X_train_ = pd.read_csv('../data/Xtr.csv',sep=',',index_col=0)

# Space-separated numeric files without a header row, kept as plain ndarrays.
# NOTE(review): presumably a precomputed 100-dimensional representation of the
# same sequences - confirm against the data description.
X_test_mat100 = pd.read_csv('../data/Xte_mat100.csv',sep=' ',header=None).values
X_train_mat100 = pd.read_csv('../data/Xtr_mat100.csv',sep=' ',header=None).values

# Training targets; the next cell reads the `Bound` column of this frame.
y = pd.read_csv('../data/Ytr.csv',sep=',',index_col=0)

In [4]:
# Re-encode the target for the SVM: a `Bound` value of 0 becomes -1, anything
# else becomes +1, and `y` is reduced to a 1-D ndarray.
# The isinstance guard makes the cell safe to re-run: after the first run `y`
# is already an ndarray and no longer has a `.Bound` column.
if isinstance(y, pd.DataFrame):
    y = np.where(y.Bound.values == 0, -1, 1)
y

array([ 1, -1,  1, ...,  1,  1,  1])

In [5]:
def get_train_test(X,y,p):
    """Split ``X`` and ``y`` into a training part and a held-out part.

    ``p`` is passed to ``train_test_split`` as ``test_size``; the split always
    uses ``random_state=42``. The shapes of the four pieces are printed on one
    line before returning.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``
    """
    pieces = train_test_split(X, y, test_size=p, random_state=42)
    features_fit, features_held, labels_fit, labels_held = pieces
    print(*(piece.shape for piece in pieces))
    return features_fit, features_held, labels_fit, labels_held

def getKmers(sequence, size=6):
    """Return every overlapping window of ``size`` characters, lower-cased.

    Windows are listed left to right with a step of one character. The list is
    empty when ``sequence`` is shorter than ``size``.
    """
    window_count = len(sequence) - size + 1
    windows = []
    for start in range(window_count):
        windows.append(sequence[start:start + size].lower())
    return windows

def base2int(c):
    """Map a lower-case base letter to its digit: a=0, c=1, g=2, t=3.

    Any other symbol (upper-case letters included) falls back to 0, the same
    code as ``'a'``.
    """
    codes = dict(zip('acgt', range(4)))
    return codes[c] if c in codes else 0

def index(kmer):
    """Encode a k-mer as a base-4 number.

    Each character is turned into a digit by ``base2int``; the character at
    position ``i`` is weighted by ``4**i``, so the first character is the
    least-significant digit.
    """
    digits = np.array([base2int(symbol) for symbol in kmer])
    place_values = np.power(4, np.arange(len(kmer)))
    return np.dot(place_values, digits)
    
    
def spectral_embedding(sequence,kmer_size=3):
    """Count how often each k-mer occurs in ``sequence``.

    Returns a float vector of length ``4**kmer_size``; entry ``index(kmer)``
    holds the number of windows of ``sequence`` equal to that k-mer.
    """
    counts = np.zeros(4**kmer_size)
    for kmer in getKmers(sequence,kmer_size):
        # Accumulate occurrences one by one so repeated k-mers add up.
        counts[index(kmer)] += 1
    return counts


def get_data(kmer_size):
    """Build the k-mer count matrix for every sequence in the notebook.

    The ``seq`` columns of the module-level ``X_train_`` and ``X_test_`` frames
    are stacked (training rows first, then test rows) and each sequence is
    embedded with ``spectral_embedding``.

    Returns a 2-D array with one row per sequence and ``4**kmer_size`` columns.
    """
    all_sequences = pd.concat([X_train_.seq,X_test_.seq],axis=0)
    embedded_rows = [
        spectral_embedding(text,kmer_size=kmer_size)
        for text in all_sequences.values
    ]
    return np.array(embedded_rows)

In [6]:
def rbf_kernel_element_wise(x, y, sigma=1):
    """RBF kernel value for one pair: ``exp(-sum((x - y)**2) / (2 * sigma**2))``."""
    squared_gap = np.sum((x - y) ** 2)
    scale_term = 2 * sigma ** 2
    return np.exp(-squared_gap / scale_term)

def rbf_kernel(X1, X2, sigma=10):
    """Gaussian (RBF) Gram matrix between the rows of ``X1`` and ``X2``.

    ``K[i, j] = exp(-||X1[i] - X2[j]||**2 / (2 * sigma**2))``

    NOTE: the name ``rbf_kernel`` is defined a second time further down in this
    same cell; Python keeps the later definition.

    Parameters
    ----------
    X1 : array-like of shape (n1, p)
    X2 : array-like of shape (n2, p)
    sigma : float, bandwidth of the kernel.

    Returns
    -------
    ndarray of shape (n1, n2)
    """
    X1 = np.asarray(X1, dtype=float)
    X2 = np.asarray(X2, dtype=float)
    X1_norm = np.sum(X1 ** 2, axis=-1)
    X2_norm = np.sum(X2 ** 2, axis=-1)
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
    sq_dist = X1_norm[:, None] + X2_norm[None, :] - 2 * np.dot(X1, X2.T)
    # Floating-point cancellation can leave tiny negative values here, which
    # would yield kernel entries above 1; squared distances cannot be negative.
    np.maximum(sq_dist, 0, out=sq_dist)
    gamma = 1 / (2 * sigma ** 2)
    return np.exp(-gamma * sq_dist)

def sigma_from_median(X):
    """Median of all pairwise Euclidean distances between the rows of ``X``.

    The median is taken over the full (n, n) distance matrix, i.e. it includes
    the zero diagonal and counts every pair twice.

    NOTE: the name ``sigma_from_median`` is defined a second time further down
    in this same cell; Python keeps the later definition.

    Parameters
    ----------
    X : array-like of shape (n, p)

    Returns
    -------
    float
    """
    X = np.asarray(X, dtype=float)
    sq_norms = np.sum(X ** 2, axis=1)
    # Use ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b so only an (n, n) matrix is
    # allocated, instead of materialising an (n, p, n) tensor of differences.
    sq_dist = sq_norms[:, None] + sq_norms[None, :] - 2 * X.dot(X.T)
    # Remove round-off: squared distances are non-negative and a point is at
    # distance exactly zero from itself.
    np.maximum(sq_dist, 0, out=sq_dist)
    np.fill_diagonal(sq_dist, 0.0)
    return np.median(np.sqrt(sq_dist))

def gaussian_kernel(x, y, sigma=5.0):
    """Gaussian kernel ``exp(-||x - y||**2 / (2 * sigma**2))``.

    * Two single vectors (or scalars): returns one kernel value.
    * Otherwise (at least one 2-D input): rows are treated as samples and the
      (n_x, n_y) Gram matrix is returned. This is the form a kernel needs when
      it is called with a whole data matrix on each side.

    Parameters
    ----------
    x, y : array-like, vectors of length p or matrices with p columns.
    sigma : float, bandwidth of the kernel.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.ndim <= 1 and y.ndim <= 1:
        return np.exp(-np.linalg.norm(x - y) ** 2 / (2 * (sigma ** 2)))

    rows_x = np.atleast_2d(x)
    rows_y = np.atleast_2d(y)
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, evaluated for all row pairs.
    sq_dist = (
        np.sum(rows_x ** 2, axis=-1)[:, None]
        + np.sum(rows_y ** 2, axis=-1)[None, :]
        - 2 * rows_x.dot(rows_y.T)
    )
    # Clip round-off so no entry of the Gram matrix exceeds 1.
    np.maximum(sq_dist, 0, out=sq_dist)
    return np.exp(-sq_dist / (2 * (sigma ** 2)))

def linear_kernel(x1, x2):
    """Plain ``np.dot(x1, x2)`` - the inner product for two vectors.

    NOTE: the name ``linear_kernel`` is defined a second time further down in
    this same cell; Python keeps the later definition.
    """
    product = np.dot(x1, x2)
    return product

def polynomial_kernel(X1, X2, power=2):
    """Polynomial kernel ``(1 + linear_kernel(X1, X2)) ** power``.

    The exponent is applied element-wise to whatever ``linear_kernel`` returns.
    """
    shifted = 1 + linear_kernel(X1, X2)
    return np.power(shifted, power)


def rbf_kernel(X1, X2, sigma=10):
    """Gaussian (RBF) Gram matrix between the rows of ``X1`` and ``X2``.

    ``K[i, j] = exp(-||X1[i] - X2[j]||**2 / (2 * sigma**2))``

    Parameters
    ----------
    X1 : array-like of shape (n1, p)
    X2 : array-like of shape (n2, p)
    sigma : float, bandwidth of the kernel.

    Returns
    -------
    ndarray of shape (n1, n2)
    """
    X1 = np.asarray(X1, dtype=float)
    X2 = np.asarray(X2, dtype=float)
    X1_norm = np.sum(X1 ** 2, axis=-1)
    X2_norm = np.sum(X2 ** 2, axis=-1)
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
    sq_dist = X1_norm[:, None] + X2_norm[None, :] - 2 * np.dot(X1, X2.T)
    # Floating-point cancellation can leave tiny negative values here, which
    # would yield kernel entries above 1; squared distances cannot be negative.
    np.maximum(sq_dist, 0, out=sq_dist)
    gamma = 1 / (2 * sigma ** 2)
    return np.exp(-gamma * sq_dist)

def sigma_from_median(X):
    """Median of all pairwise Euclidean distances between the rows of ``X``.

    The median is taken over the full (n, n) distance matrix, i.e. it includes
    the zero diagonal and counts every pair twice.

    Parameters
    ----------
    X : array-like of shape (n, p)

    Returns
    -------
    float
    """
    X = np.asarray(X, dtype=float)
    sq_norms = np.sum(X ** 2, axis=1)
    # Use ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b so only an (n, n) matrix is
    # allocated, instead of materialising an (n, p, n) tensor of differences.
    sq_dist = sq_norms[:, None] + sq_norms[None, :] - 2 * X.dot(X.T)
    # Remove round-off: squared distances are non-negative and a point is at
    # distance exactly zero from itself.
    np.maximum(sq_dist, 0, out=sq_dist)
    np.fill_diagonal(sq_dist, 0.0)
    return np.median(np.sqrt(sq_dist))

def linear_kernel(X1, X2):
    """Linear Gram matrix ``X1 . X2^T``: entry (i, j) is the inner product of
    row ``i`` of ``X1`` with row ``j`` of ``X2``."""
    X2_transposed = X2.T
    return X1.dot(X2_transposed)

In [7]:
class KernelMethodBase(object):
    '''
    Base class for kernel methods models

    Resolves a kernel function from its name and collects the keyword
    arguments that kernel accepts. Subclasses provide the actual estimator
    by overriding ``fit``, ``decision_function`` and ``predict``.

    Methods
    ----
    fit
    predict
    '''
    # Kernel name -> callable. The names are resolved when this class body is
    # executed, so each entry points at whichever definition is bound then.
    kernels_ = {
        'linear': linear_kernel,
        'polynomial': polynomial_kernel,
        'rbf': rbf_kernel,
        'gaussian':gaussian_kernel
    }

    def __init__(self, kernel='linear', **kwargs):
        """Select the kernel called ``kernel`` and keep its parameters.

        Raises
        ------
        KeyError
            If ``kernel`` is not a key of ``kernels_``.
        """
        if kernel not in self.kernels_:
            raise KeyError(
                'Unknown kernel %r; available kernels: %s'
                % (kernel, ', '.join(sorted(self.kernels_)))
            )
        self.kernel_name = kernel
        self.kernel_function_ = self.kernels_[kernel]
        self.kernel_parameters = self.get_kernel_parameters(**kwargs)

    def get_kernel_parameters(self, **kwargs):
        """Pick from ``kwargs`` the arguments the selected kernel understands.

        ``sigma`` is kept for the 'rbf' and 'gaussian' kernels, ``power`` for
        the 'polynomial' kernel; everything else is ignored. A parameter that
        was not supplied (or supplied as ``None``) is left out of the result so
        that the kernel function falls back to its own default instead of
        receiving ``None``.
        """
        if self.kernel_name in ('rbf', 'gaussian'):
            accepted = ('sigma',)
        elif self.kernel_name == 'polynomial':
            accepted = ('power',)
        else:
            accepted = ()

        params = {}
        for name in accepted:
            value = kwargs.get(name)
            if value is not None:
                params[name] = value
        return params

    def fit(self, X, y, **kwargs):
        """Placeholder: does nothing and returns ``self``."""
        return self

    def decision_function(self, X):
        """Placeholder: returns ``None``; meant to be overridden."""
        pass

    def predict(self, X):
        """Placeholder: returns ``None``; meant to be overridden."""
        pass

In [8]:
# def cvxopt_qp(P, q, G, h, A, b):
#     P = .5 * (P + P.T)
#     cvx_matrices = [
#         cvxopt.matrix(M) if M is not None else None for M in [P, q, G, h, A, b] 
#     ]
#     #cvxopt.solvers.options['show_progress'] = False
#     solution = cvxopt.solvers.qp(*cvx_matrices, options={'show_progress': False})
#     return np.array(solution['x']).flatten()

# solve_qp = cvxopt_qp


# def svm_dual_soft_to_qp_kernel(K, y, C=1):
#     n = K.shape[0]
#     assert (len(y) == n)
        
#     # Dual formulation, soft margin
#     # P = np.diag(y) @ K @ np.diag(y)
#     P = np.diag(y).dot(K).dot(np.diag(y))
#     # As a regularization, we add epsilon * identity to P
#     eps = 1e-12
#     P += eps * np.eye(n)
#     q = - np.ones(n)
#     G = np.vstack([-np.eye(n), np.eye(n)])
#     h = np.hstack([np.zeros(n), C * np.ones(n)])
#     A = y[np.newaxis, :]
#     b = np.array([0.])
#     return P, q, G, h, A.astype(float), b

# K = linear_kernel(X_train, X_train)
# alphas = solve_qp(*svm_dual_soft_to_qp_kernel(K, y_train, C=1.))


# class KernelSVM(KernelMethodBase):
#     '''
#     Kernel SVM Classification
    
#     Methods
#     ----
#     fit
#     predict
#     '''
#     def __init__(self, C=0.1, **kwargs):
#         self.C = C
#         # Python 3: replace the following line by
#         # super().__init__(**kwargs)
#         super(KernelSVM, self).__init__(**kwargs)

#     def fit(self, X, y, tol=1e-8):
#         n, p = X.shape
#         assert (n == len(y))
    
#         self.X_train = X
#         self.y_train = y
        
#         # Kernel matrix
#         K = self.kernel_function_(
#             self.X_train, self.X_train, **self.kernel_parameters)
        
#         # Solve dual problem
#         self.alpha = solve_qp(*svm_dual_soft_to_qp_kernel(K, y, C=self.C))
        
#         # Compute support vectors and bias b
#         sv = np.logical_and((self.alpha > tol), (self.C - self.alpha > tol))
#         self.bias = y[sv] - K[sv].dot(self.alpha * y)
#         self.bias = self.bias.mean()

#         self.support_vector_indices = np.nonzero(sv)[0]

#         return self
        
#     def decision_function(self, X):
#         K_x = self.kernel_function_(X, self.X_train, **self.kernel_parameters)
#         return K_x.dot(self.alpha * self.y_train) + self.bias

#     def predict(self, X):
#         return np.sign(self.decision_function(X))

In [15]:



class KernelSVM(KernelMethodBase):
    '''
    Soft-margin kernel SVM classifier for labels in {-1, +1}.

    The dual problem is solved as a quadratic program with cvxopt.

    Methods
    ----
    fit
    decision_function
    predict
    '''
    def __init__(self, C=0.1, **kwargs):
        """``C`` is the box constraint on the dual variables; the remaining
        keyword arguments (``kernel``, ``sigma``, ``power``) go to the base class."""
        self.C = C
        super().__init__(**kwargs)

    def cvxopt_qp(self,P, q, G, h, A, b):
        """Solve ``min 1/2 x'Px + q'x  s.t.  Gx <= h, Ax = b`` with cvxopt.

        ``None`` entries are passed through untouched. Returns the minimiser
        as a flat ndarray.
        """
        # Symmetrise P so that round-off asymmetry does not reach the solver.
        P = .5 * (P + P.T)
        cvx_matrices = [
            cvxopt.matrix(M) if M is not None else None for M in [P, q, G, h, A, b]
        ]
        solution = cvxopt.solvers.qp(*cvx_matrices, options={'show_progress': False})
        return np.array(solution['x']).flatten()

    def svm_dual_soft_to_qp_kernel(self,K, y, C=1):
        """Express the soft-margin SVM dual as QP matrices.

        Dual:  min 1/2 a'(diag(y) K diag(y))a - 1'a   s.t.  0 <= a <= C,  y'a = 0

        Returns ``(P, q, G, h, A, b)`` as float arrays in the layout expected
        by ``cvxopt_qp``.
        """
        y = np.asarray(y, dtype=float)
        n = K.shape[0]
        assert (len(y) == n)

        # diag(y) K diag(y) via broadcasting: same entries y_i * K_ij * y_j as
        # the two matrix products, without the O(n^3) cost. Because y is float
        # the result is float even for an integer-valued K.
        P = (y[:, None] * K) * y[None, :]
        # As a regularization, we add epsilon * identity to P
        eps = 1e-12
        P[np.diag_indices(n)] += eps
        q = - np.ones(n)
        # Stacked box constraints: -a <= 0 and a <= C.
        G = np.vstack([-np.eye(n), np.eye(n)])
        h = np.hstack([np.zeros(n), C * np.ones(n)])
        A = y[np.newaxis, :]
        b = np.array([0.])
        return P, q, G, h, A, b

    def fit(self, X, y, tol=1e-8):
        """Train on ``X`` (n, p) with labels ``y`` (n,) in {-1, +1}.

        ``tol`` is the threshold used to decide whether a dual variable is
        strictly above 0 / strictly below ``C``.

        Sets ``X_train``, ``y_train``, ``alpha``, ``bias`` and
        ``support_vector_indices`` (indices with ``tol < alpha < C - tol``)
        and returns ``self``.
        """
        y = np.asarray(y)
        n, _ = X.shape
        assert (n == len(y))

        self.X_train = X
        self.y_train = y

        # Kernel matrix
        K = self.kernel_function_(
            self.X_train, self.X_train, **self.kernel_parameters)

        # Solve dual problem
        self.alpha = self.cvxopt_qp(*self.svm_dual_soft_to_qp_kernel(K, y, C=self.C))

        # Points with 0 < alpha < C lie exactly on the margin and determine b.
        sv = np.logical_and((self.alpha > tol), (self.C - self.alpha > tol))
        bias_points = sv
        if not bias_points.any():
            # Every multiplier sits at a bound: averaging over an empty set
            # would give NaN. Fall back to all points with alpha > 0, which
            # only approximates the bias.
            bias_points = self.alpha > tol
        if bias_points.any():
            residuals = y[bias_points] - K[bias_points].dot(self.alpha * y)
            self.bias = residuals.mean()
        else:
            self.bias = 0.0

        self.support_vector_indices = np.nonzero(sv)[0]

        return self

    def decision_function(self, X):
        """Signed score ``sum_i alpha_i y_i k(x, x_i) + bias`` for each row of ``X``."""
        K_x = self.kernel_function_(X, self.X_train, **self.kernel_parameters)
        return K_x.dot(self.alpha * self.y_train) + self.bias

    def predict(self, X):
        """Sign of the decision function for each row of ``X``."""
        return np.sign(self.decision_function(X))
    
    
# 6-mer count features. get_data() stacks training sequences before test
# sequences, so [:2000] keeps the leading rows only.
# NOTE(review): presumably these are exactly the labelled training sequences -
# confirm that Xtr.csv has 2000 rows. 30% of them are held out for validation.
X_train, X_test, y_train, y_test = get_train_test(get_data(6)[:2000,:],y,p=0.3)


# Hyper-parameters. With kernel='polynomial' only `power` is forwarded to the
# kernel; `sigma` is read only for the 'rbf' / 'gaussian' kernels.
kernel = 'polynomial'
power = 2
sigma = 5.
C = 10.
model = KernelSVM(C=C, kernel=kernel, sigma=sigma, power=power)
y_pred = model.fit(X_train, y_train).predict(X_test)

# Hold-out accuracy: fraction of predictions equal to the true label.
sum(y_pred==y_test)/len(y_test)

(1400, 4096) (600, 4096) (1400,) (600,)


0.6583333333333333

(1400, 4096) (600, 4096) (1400,) (600,)


0.6583333333333333

In [77]:
# Quick look at the training labels produced by the split.
y_train

array([-1,  1, -1, ..., -1,  1,  1])

In [14]:
# Check the container type of the training labels.
type(y_train)

numpy.ndarray