In [1]:
!pip install optuna -q

In [344]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.metrics import accuracy_score

from sklearn.feature_extraction.text import CountVectorizer

import optuna

In [345]:

X_test = pd.read_csv('../data/Xte.csv',sep=',',index_col=0).values
X_train = pd.read_csv('../data/Xtr.csv',sep=',',index_col=0).values

y = pd.read_csv('../data/Ytr.csv',sep=',',index_col=0).values

In [346]:
def getKmers(sequence, size):
    return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)]

In [347]:
X_train.shape,X_test.shape

((2000, 1), (1000, 1))

In [348]:
X = []


cv = CountVectorizer()
for i in X_train:
    sentence = ' '.join(getKmers(i[0], size=2))
    X.append(sentence)
    
X_te = []


for i in X_test:
    sentence = ' '.join(getKmers(i[0], size=2))
    X_te.append(sentence)

X = cv.fit_transform(X+X_te).toarray()
# break
    

In [349]:
print('x_train: {} y_train {}'.format(X[:2000,:].shape,y.shape))

# print('x_train: {} y_train {}'.format(X_train.shape,y.shape))
# print('x_test: {}'.format(X_te.shape))

x_train: (2000, 16) y_train (2000, 1)


In [350]:
X_tr= scale(X[:2000,:])
X_te = scale(X[2000:,:])

X_train, X_val, y_train, y_val = train_test_split(
    X_tr, y, test_size=0.33, random_state=42)

print(X_train.shape,X_val.shape,y_train.shape, y_val.shape,X_test.shape)

(1340, 16) (660, 16) (1340, 1) (660, 1) (1000, 1)


### Model Test

In [308]:
class logisticregression():
    def __init__(self,train_data,train_labels,lamda=0.2,lr=0.01,decay=10,batch_size=None,epoch=10,print_every = 10):
        dummy_once = np.ones((len(train_data),1))
        self.train_data = np.hstack((dummy_once,train_data))
        self.train_labels = train_labels
        
        self.params = np.zeros((len(self.train_data[0]),1))
        
        self.lr = lr
        self.epoch = epoch
        self.batch_size = batch_size
        self.print_every = print_every
        self._lambda = lamda
        self.decay = decay
        
    def sigmoid(self,x):
        return 1/(1+np.exp(-x))
    
    def cost(self,y,y_pred):
        return -np.mean(y*np.log(y_pred)+(1-y)*np.log(1-y_pred))
    
    def gradient(self,y,y_pred,x):
        return np.dot(x.T,(y_pred-y))+(2*self._lambda*self.params)
    
    def train(self):
        for i in range(self.epoch):
            y_pred = self.sigmoid(np.dot(self.train_data,self.params))
            loss = self.cost(self.train_labels,y_pred)
            
            gra = self.gradient(self.train_labels,y_pred,self.train_data)
            
            self.params -= self.lr*gra
            
            self.lr *= (1. / (1. + self.decay * i))
            
            if self.print_every:
                if i%self.print_every == 0 or i == self.epoch-1:
                    print('Epoch : {}  Loss: {}'.format(i,loss))
    def predict(self,test_data):
        result = self.sigmoid(np.dot(test_data,self.params[1:])+self.params[0])
        result[result > 0.5 ] = 1
        result[result <= 0.5 ] = 0
        return result
    
    def evaluate(self,test_data,labels):
        accuracy = accuracy_score(self.predict(test_data),labels)
        return accuracy

In [309]:
def cross_validate(x_data,y_data,lr,lamda=0.2,epoch=10,k=5,decay=10):
    if len(x_data)%k != 0:
        print('cant vsplit',len(x_data),' by ',k)
        return
    
    x_data_splitted = np.vsplit(x_data,k)
    y_data_splitted = np.vsplit(y_data,k)
    
    aggrigate_result = []
    for i in range(len(x_data_splitted)):
        train = []
        test = []
        items = [j for j in range(len(x_data_splitted)) if j !=i ]
        x_test = x_data_splitted[i]
        y_test = y_data_splitted[i]
        for item in items:
            if len(train) == 0:
                x_train = x_data_splitted[item]
                y_train = y_data_splitted[item]
            else:
                x_train = np.concatenate((x_train,x_data_splitted[item]), axis=0)
                y_train = np.concatenate((y_train,y_data_splitted[item]), axis=0)
            
        logistic = logisticregression(x_train,y_train,lamda=lamda,lr=lr,decay=decay,epoch=epoch)
        logistic.train()
        
        result = logistic.evaluate(x_test,y_test)
        aggrigate_result.append(result)
        
        value = sum(aggrigate_result)/len(aggrigate_result)
    return value if value!= None else 0

In [310]:
def objective(trial):
    lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
    lamda = trial.suggest_loguniform('lamda', 0.01, 0.5)
    k =  trial.suggest_categorical('k', [4,5,8,10])
    epoch =  trial.suggest_int('epoch', 10, 20)
    decay = trial.suggest_int('decay', 3, 10)
    return cross_validate(X_tr, y,lr=lr,lamda=lamda,k=k,epoch=epoch,decay=decay)

In [311]:
# cross_validate(X_train_mat100, y,0.001,10)

import optuna

sampler = optuna.samplers.TPESampler()
study = optuna.create_study(sampler=sampler, direction='maximize')
study.optimize(func=objective, n_trials=200,show_progress_bar=True)


Progress bar is experimental (supported from v1.2.0). The interface can change in the future.



HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


divide by zero encountered in log


invalid value encountered in multiply



[32m[I 2020-05-25 20:47:33,395][0m Finished trial#0 with value: 0.6045 with parameters: {'lr': 0.01649253233594149, 'lamda': 0.36897164878015126, 'k': 10, 'epoch': 19, 'decay': 8}. Best is trial#0 with value: 0.6045.[0m
[32m[I 2020-05-25 20:47:33,595][0m Finished trial#1 with value: 0.6245 with parameters: {'lr': 1.7769572471647115e-05, 'lamda': 0.05278192749801375, 'k': 4, 'epoch': 16, 'decay': 8}. Best is trial#1 with value: 0.6245.[0m
[32m[I 2020-05-25 20:47:33,818][0m Finished trial#2 with value: 0.614 with parameters: {'lr': 2.0668987174148333e-05, 'lamda': 0.023056782989366083, 'k': 8, 'epoch': 20, 'decay': 9}. Best is trial#1 with value: 0.6245.[0m
[32m[I 2020-05-25 20:47:34,039][0m Finished trial#3 with value: 0.614 with parameters: {'lr': 3.874152905085708e-05, 'lamda': 0.010067497058614145, 'k': 8, 'epoch': 16, 'decay': 9}. Best is trial#1 with value: 0.6245.[0m
[32m[I 2020-05-25 20:47:34,230][0m Finished trial#4 with value: 0.6154999999999999 with parameters: {


overflow encountered in exp



[32m[I 2020-05-25 20:47:35,643][0m Finished trial#11 with value: 0.6255000000000001 with parameters: {'lr': 0.005502896969874503, 'lamda': 0.13820595968601468, 'k': 4, 'epoch': 10, 'decay': 6}. Best is trial#7 with value: 0.6275.[0m
[32m[I 2020-05-25 20:47:35,837][0m Finished trial#12 with value: 0.6234999999999999 with parameters: {'lr': 0.08821116894760933, 'lamda': 0.11398047120491855, 'k': 4, 'epoch': 13, 'decay': 6}. Best is trial#7 with value: 0.6275.[0m
[32m[I 2020-05-25 20:47:36,027][0m Finished trial#13 with value: 0.625 with parameters: {'lr': 0.003592286286081175, 'lamda': 0.09960611239538093, 'k': 4, 'epoch': 10, 'decay': 6}. Best is trial#7 with value: 0.6275.[0m
[32m[I 2020-05-25 20:47:36,247][0m Finished trial#14 with value: 0.6245 with parameters: {'lr': 0.00990923468470102, 'lamda': 0.2511488906110466, 'k': 4, 'epoch': 14, 'decay': 7}. Best is trial#7 with value: 0.6275.[0m
[32m[I 2020-05-25 20:47:36,433][0m Finished trial#15 with value: 0.615999999999999

In [312]:
df = study.trials_dataframe().drop(['state','datetime_start','datetime_complete'], axis=1)

df.sort_values(by=['value'])

Unnamed: 0,number,value,duration,params_decay,params_epoch,params_k,params_lamda,params_lr
0,0,0.6045,00:00:00.235365,8,19,10,0.368972,0.016493
29,29,0.6045,00:00:00.218716,7,17,10,0.089794,0.009743
8,8,0.6065,00:00:00.233999,10,19,10,0.013069,0.000233
179,179,0.6075,00:00:00.191209,8,10,10,0.034966,0.001303
57,57,0.6085,00:00:00.202923,6,11,10,0.019213,0.001156
...,...,...,...,...,...,...,...,...
97,97,0.6295,00:00:00.189959,6,10,4,0.022685,0.001447
95,95,0.6295,00:00:00.191692,6,10,4,0.040199,0.001438
122,122,0.6295,00:00:00.185681,6,10,4,0.024073,0.001434
83,83,0.6295,00:00:00.204314,7,11,4,0.010491,0.001443


In [314]:
cross_validate(X_tr,y,lamda=0.050740,epoch=10,lr=0.050740,k=4,decay=7)


divide by zero encountered in log


invalid value encountered in multiply



0.623

In [343]:
# X_train, X_test, y_train, y_test
# logistic = logisticregression(X_train,y_train,lamda=0.362124,epoch=15,print_every=1,lr=0.000254)
#
logistic = logisticregression(X_train,y_train,lamda=0.05,epoch=10,lr=0.001383,decay=7,print_every=1)
logistic.train()
        
print(logistic.evaluate(X_train,y_train))
print(logistic.evaluate(X_val,y_val))

Epoch : 0  Loss: 0.6931471805599453
Epoch : 1  Loss: 0.6614940106899204
Epoch : 2  Loss: 0.6576564995558486
Epoch : 3  Loss: 0.6530411387034815
Epoch : 4  Loss: 0.6529215147131578
Epoch : 5  Loss: 0.6529162816687935
Epoch : 6  Loss: 0.6529161015244107
Epoch : 7  Loss: 0.6529160965206913
Epoch : 8  Loss: 0.652916096404326
Epoch : 9  Loss: 0.6529160964019987
0.6313432835820896
0.55


In [317]:

X_test_mat100 = scale(X_te)

In [325]:

sumbission = []
for i in range(len(X_test_mat100)):
    result = logistic.predict(X_test_mat100[i])
    sumbission.append([i,int(result)])
    result

In [326]:
# sumbission
df = pd.DataFrame(sumbission)
df.columns = ['Id','Bound']
df.to_csv('test_64_cross_validated.csv',index=False)

In [327]:
df.head(50)

Unnamed: 0,Id,Bound
0,0,1
1,1,0
2,2,0
3,3,0
4,4,0
5,5,1
6,6,1
7,7,1
8,8,1
9,9,0


# SVM Test


In [340]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’
clf = make_pipeline(StandardScaler(), SVC(kernel='poly'))
clf.fit(X_train,y_train)
clf.score(X_val,y_val)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



0.5787878787878787

In [341]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(penalty = 'l2')
clf.fit(X_train,y_train)
clf.score(X_val,y_val)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



0.5545454545454546