In [1]:
!pip install optuna -q

In [24]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.metrics import accuracy_score

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

import optuna
# from tqdm import tqdm

In [3]:
X_test_mat100 = pd.read_csv('../data/Xte_mat100.csv',sep=' ',header=None).values
X_train_mat100 = pd.read_csv('../data/Xtr_mat100.csv',sep=' ',header=None).values

X_test_ = pd.read_csv('../data/Xte.csv',sep=',',index_col=0)
X_train_ = pd.read_csv('../data/Xtr.csv',sep=',',index_col=0)

y = pd.read_csv('../data/Ytr.csv',sep=',',index_col=0)

train_data = pd.concat([X_train_ , y],axis=1)

In [4]:
print('x_train: {} y_train {}'.format(X_train_mat100.shape,y.shape))
print('x_test: {}'.format(X_test_mat100.shape))


x_train: (2000, 100) y_train (2000, 1)
x_test: (1000, 100)


In [5]:
def get_train_test(X,y,p):
    X = scale(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=p, random_state=42)
    print(X_train.shape,X_test.shape,y_train.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

### Model Test

In [51]:
class logisticregression():
    def __init__(self,train_data,train_labels,lamda=0.2,lr=0.01,decay=10,batch_size=64,epoch=10,print_every = 10):
        dummy_once = np.ones((len(train_data),1))
        self.train_data = np.hstack((dummy_once,train_data))
        self.train_labels = train_labels
        
        self.params = np.zeros((len(self.train_data[0]),1))
        
        self.lr = lr
        self.epoch = epoch
        self.batch_size = batch_size
        self.print_every = print_every
        self._lambda = lamda
        self.decay = decay
        
    def sigmoid(self,x):
        return 1/(1+np.exp(-x))
    
    def cost(self,y,y_pred):
        return -np.mean(y*np.log(y_pred)+(1-y)*np.log(1-y_pred))
    
    def gradient(self,y,y_pred,x):
#         hassien = np.dot(y_pred.T,(1-y_pred))*np.linalg.pinv(np.dot(x.T,x))
#         return np.dot(hassien,np.dot(x.T,(y_pred-y)))+(2*(self._lambda/len(y_pred))*self.params)
        return np.dot(x.T,(y_pred-y))+(2*(self._lambda/len(y_pred))*self.params)
    
    def train(self):
        for i in range(self.epoch):
            for j in range(len(self.train_labels)//self.batch_size):
                idx = list(np.random.choice(np.arange(len(self.train_labels)),self.batch_size,replace=False))
                data = self.train_data[idx]
                label = self.train_labels[idx]

                y_pred = self.sigmoid(np.dot(data,self.params))
                loss = self.cost(label,y_pred)

                gra = self.gradient(label,y_pred,data)
                self.params -= self.lr*gra

                self.lr *= (1. / (1. + self.decay * i))
            
            if self.print_every:
                if i%self.print_every == 0 or i == self.epoch-1:
                    print('Epoch : {}  Loss: {}'.format(i,loss))
    def predict(self,test_data):
        result = self.sigmoid(np.dot(test_data,self.params[1:])+self.params[0])
        result[result > 0.5 ] = 1
        result[result <= 0.5 ] = 0
        return result
    
    def evaluate(self,test_data,labels):
        accuracy = accuracy_score(self.predict(test_data),labels)
        return accuracy

In [52]:
def cross_validate(x_data,y_data,lr,lamda=0.2,epoch=10,k=4,decay=10):
    if len(x_data)%k != 0:
        print('cant vsplit',len(x_data),' by ',k)
        return
    
    x_data_splitted = np.vsplit(x_data,k)
    y_data_splitted = np.vsplit(y_data,k)
    
    aggrigate_result = []
    for i in range(len(x_data_splitted)):
        train = []
        test = []
        items = [j for j in range(len(x_data_splitted)) if j !=i ]
        x_test = x_data_splitted[i]
        y_test = y_data_splitted[i]
        for item in items:
            if len(train) == 0:
                x_train = x_data_splitted[item]
                y_train = y_data_splitted[item]
            else:
                x_train = np.concatenate((x_train,x_data_splitted[item]), axis=0)
                y_train = np.concatenate((y_train,y_data_splitted[item]), axis=0)
        
        logistic = logisticregression(x_train,y_train,lamda=lamda,lr=lr,decay=decay,epoch=epoch,print_every=None)
        logistic.train()
        
        result = logistic.evaluate(x_test,y_test)
        aggrigate_result.append(result)
        
        value = sum(aggrigate_result)/len(aggrigate_result)
    return value

# Optimize

In [62]:
def getKmers(sequence, size=6):
    return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)]

train_data['words'] = train_data.seq.apply(lambda x: ' '.join(getKmers(x)))
X_test_['words'] = X_test_.seq.apply(lambda x: ' '.join(getKmers(x)))
train_data.head(2)

Unnamed: 0_level_0,seq,Bound,words
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,GAGGGGCTGGGGAGGGGGCTGGCCCAGAGGCACCAGACTCTGCAGA...,1,gagggg aggggc ggggct gggctg ggctgg gctggg ctgg...
1,CGGCCTGGGGGCCACATGTGAGTGCTTACCTGTGTGGGGATGAGGG...,0,cggcct ggcctg gcctgg cctggg ctgggg tggggg gggg...


In [67]:
from sklearn.feature_extraction.text import CountVectorizer


data = pd.DataFrame(pd.concat([train_data.words,X_test_.words],axis=0))

train_text = data.words.values

cv = CountVectorizer(ngram_range=(2,2),max_features=1500)
X = cv.fit_transform(train_text)
X = X.todense()


X.shape

(3000, 1500)

In [68]:
cross_validate(np.array(X)[:2000,:],y.values,k=5,lr=0.001,lamda=0.003,epoch=10)

0.6185

In [69]:
def objective(trial):
    lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
    lamda = trial.suggest_loguniform('lamda', 1e-7, 10)
    k =  trial.suggest_categorical('k', [4,5,8,10])
    epoch =  trial.suggest_int('epoch', 100, 500)
    decay = trial.suggest_int('decay', 3, 10)
    return cross_validate(np.array(X)[:2000,:], y.values,lr=lr,lamda=lamda,k=k,epoch=epoch,decay=decay)
# cross_validate(X_preprocess, y.reshape(-1,1),lr=0.001,epoch=200)

import optuna

sampler = optuna.samplers.TPESampler()
study = optuna.create_study(sampler=sampler, direction='maximize')
study.optimize(func=objective, n_trials=100,show_progress_bar=True)


Progress bar is experimental (supported from v1.2.0). The interface can change in the future.



HBox(children=(FloatProgress(value=0.0), HTML(value='')))

 18%|█▊        | 18/100 [27:28<10:59,  8.05s/it]

[32m[I 2020-05-28 02:02:36,960][0m Finished trial#0 with value: 0.597 with parameters: {'lr': 0.003264943455983559, 'lamda': 2.166491137714555e-06, 'k': 4, 'epoch': 433, 'decay': 6}. Best is trial#0 with value: 0.597.[0m


 18%|█▊        | 18/100 [27:30<10:59,  8.05s/it]

[32m[I 2020-05-28 02:02:39,565][0m Finished trial#1 with value: 0.627 with parameters: {'lr': 3.205121962852147e-05, 'lamda': 1.9721768777920362e-07, 'k': 5, 'epoch': 193, 'decay': 5}. Best is trial#1 with value: 0.627.[0m


 18%|█▊        | 18/100 [27:36<10:59,  8.05s/it]

[32m[I 2020-05-28 02:02:45,559][0m Finished trial#2 with value: 0.6445 with parameters: {'lr': 0.030780337422447273, 'lamda': 8.447208519379117e-06, 'k': 4, 'epoch': 489, 'decay': 4}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [27:38<10:59,  8.05s/it]

[32m[I 2020-05-28 02:02:47,552][0m Finished trial#3 with value: 0.624 with parameters: {'lr': 0.03798225804652433, 'lamda': 8.62688150306835e-06, 'k': 4, 'epoch': 156, 'decay': 8}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [27:43<10:59,  8.05s/it]

[32m[I 2020-05-28 02:02:52,317][0m Finished trial#4 with value: 0.6165 with parameters: {'lr': 0.00026970934330640314, 'lamda': 8.454826678468012e-07, 'k': 5, 'epoch': 362, 'decay': 8}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [27:45<10:59,  8.05s/it]

[32m[I 2020-05-28 02:02:53,809][0m Finished trial#5 with value: 0.5575 with parameters: {'lr': 0.0008607522388082317, 'lamda': 1.4302312408345226, 'k': 8, 'epoch': 132, 'decay': 6}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [27:48<10:59,  8.05s/it]

[32m[I 2020-05-28 02:02:57,408][0m Finished trial#6 with value: 0.615 with parameters: {'lr': 0.038708256160730536, 'lamda': 0.00016933566235715464, 'k': 10, 'epoch': 276, 'decay': 7}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [27:51<10:59,  8.05s/it]

[32m[I 2020-05-28 02:03:00,185][0m Finished trial#7 with value: 0.639 with parameters: {'lr': 0.04033411048705575, 'lamda': 0.018380525160553513, 'k': 5, 'epoch': 210, 'decay': 10}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [27:56<10:59,  8.05s/it]

[32m[I 2020-05-28 02:03:04,844][0m Finished trial#8 with value: 0.576 with parameters: {'lr': 0.0003779024347609162, 'lamda': 0.13553065913694556, 'k': 5, 'epoch': 348, 'decay': 8}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [27:57<10:59,  8.05s/it]

[32m[I 2020-05-28 02:03:06,366][0m Finished trial#9 with value: 0.6130000000000001 with parameters: {'lr': 0.09082867280888432, 'lamda': 7.902290842931612e-06, 'k': 8, 'epoch': 137, 'decay': 3}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [28:03<10:59,  8.05s/it]

[32m[I 2020-05-28 02:03:12,335][0m Finished trial#10 with value: 0.6255000000000001 with parameters: {'lr': 0.0050768028612196625, 'lamda': 0.00040234061892359613, 'k': 4, 'epoch': 482, 'decay': 3}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [28:06<10:59,  8.05s/it]

[32m[I 2020-05-28 02:03:15,572][0m Finished trial#11 with value: 0.618 with parameters: {'lr': 0.012738017561790573, 'lamda': 0.026349012992634206, 'k': 5, 'epoch': 239, 'decay': 4}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [28:09<10:59,  8.05s/it]

[32m[I 2020-05-28 02:03:18,671][0m Finished trial#12 with value: 0.6020000000000001 with parameters: {'lr': 0.06772056838746823, 'lamda': 0.011946891413688092, 'k': 10, 'epoch': 228, 'decay': 10}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [28:14<10:59,  8.05s/it]

[32m[I 2020-05-28 02:03:22,829][0m Finished trial#13 with value: 0.6245 with parameters: {'lr': 0.011504537707992658, 'lamda': 9.601097114416238, 'k': 4, 'epoch': 329, 'decay': 10}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [28:20<10:59,  8.05s/it]

[32m[I 2020-05-28 02:03:28,906][0m Finished trial#14 with value: 0.628 with parameters: {'lr': 0.014304689658271025, 'lamda': 4.6624713036297825e-05, 'k': 4, 'epoch': 494, 'decay': 4}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [28:25<10:59,  8.05s/it]

[32m[I 2020-05-28 02:03:34,506][0m Finished trial#15 with value: 0.6255 with parameters: {'lr': 0.09951465962284502, 'lamda': 0.0030833815377185546, 'k': 5, 'epoch': 427, 'decay': 9}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [28:29<10:59,  8.05s/it]

[32m[I 2020-05-28 02:03:38,106][0m Finished trial#16 with value: 0.58 with parameters: {'lr': 0.0026474404121113585, 'lamda': 0.9648240875509498, 'k': 4, 'epoch': 282, 'decay': 5}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [28:31<10:59,  8.05s/it]

[32m[I 2020-05-28 02:03:40,798][0m Finished trial#17 with value: 0.5824999999999999 with parameters: {'lr': 8.553731060996793e-05, 'lamda': 0.0017489503302272028, 'k': 10, 'epoch': 203, 'decay': 4}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [28:33<10:59,  8.05s/it]

[32m[I 2020-05-28 02:03:41,964][0m Finished trial#18 with value: 0.5900000000000001 with parameters: {'lr': 0.025733429218594707, 'lamda': 5.73812790844757e-05, 'k': 8, 'epoch': 100, 'decay': 7}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [28:38<10:59,  8.05s/it]

[32m[I 2020-05-28 02:03:47,286][0m Finished trial#19 with value: 0.6 with parameters: {'lr': 1.0728095644821046e-05, 'lamda': 1.2502242082977574e-07, 'k': 5, 'epoch': 407, 'decay': 9}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [28:41<10:59,  8.05s/it]

[32m[I 2020-05-28 02:03:50,597][0m Finished trial#20 with value: 0.6315 with parameters: {'lr': 0.007497576320613591, 'lamda': 0.0837114080309308, 'k': 5, 'epoch': 250, 'decay': 5}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [28:45<10:59,  8.05s/it]

[32m[I 2020-05-28 02:03:53,898][0m Finished trial#21 with value: 0.6175 with parameters: {'lr': 0.0064343385013833345, 'lamda': 0.06129870167339588, 'k': 5, 'epoch': 244, 'decay': 5}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [28:47<10:59,  8.05s/it]

[32m[I 2020-05-28 02:03:56,416][0m Finished trial#22 with value: 0.6305 with parameters: {'lr': 0.026953703583672026, 'lamda': 0.36847797558886985, 'k': 5, 'epoch': 184, 'decay': 4}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [28:51<10:59,  8.05s/it]

[32m[I 2020-05-28 02:04:00,676][0m Finished trial#23 with value: 0.631 with parameters: {'lr': 0.0021810750809660865, 'lamda': 0.008934367136270821, 'k': 5, 'epoch': 316, 'decay': 5}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [28:55<10:59,  8.05s/it]

[32m[I 2020-05-28 02:04:04,202][0m Finished trial#24 with value: 0.6315 with parameters: {'lr': 0.056046222182501264, 'lamda': 4.0599455732675604, 'k': 5, 'epoch': 263, 'decay': 3}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [28:58<10:59,  8.05s/it]

[32m[I 2020-05-28 02:04:07,652][0m Finished trial#25 with value: 0.616 with parameters: {'lr': 0.06706094829410109, 'lamda': 8.620379857631981, 'k': 4, 'epoch': 273, 'decay': 3}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [29:01<10:59,  8.05s/it]

[32m[I 2020-05-28 02:04:09,996][0m Finished trial#26 with value: 0.627 with parameters: {'lr': 0.0073944816824571685, 'lamda': 0.20990792776921863, 'k': 5, 'epoch': 171, 'decay': 6}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [29:05<10:59,  8.05s/it]

[32m[I 2020-05-28 02:04:14,746][0m Finished trial#27 with value: 0.6064999999999999 with parameters: {'lr': 0.02348848702811607, 'lamda': 1.7329808654365404, 'k': 10, 'epoch': 369, 'decay': 3}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [29:08<10:59,  8.05s/it]

[32m[I 2020-05-28 02:04:17,661][0m Finished trial#28 with value: 0.597 with parameters: {'lr': 0.0013117245405630483, 'lamda': 0.0055380210116871475, 'k': 4, 'epoch': 224, 'decay': 5}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [29:12<10:59,  8.05s/it]

[32m[I 2020-05-28 02:04:21,655][0m Finished trial#29 with value: 0.633 with parameters: {'lr': 0.04928746980226703, 'lamda': 0.0005548546874661947, 'k': 5, 'epoch': 299, 'decay': 4}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [29:16<10:59,  8.05s/it]

[32m[I 2020-05-28 02:04:25,765][0m Finished trial#30 with value: 0.5995 with parameters: {'lr': 0.09973695834779597, 'lamda': 0.0006843244468418176, 'k': 8, 'epoch': 390, 'decay': 6}. Best is trial#2 with value: 0.6445.[0m


 18%|█▊        | 18/100 [29:20<10:59,  8.05s/it]

[32m[I 2020-05-28 02:04:29,716][0m Finished trial#31 with value: 0.6479999999999999 with parameters: {'lr': 0.01701931901177836, 'lamda': 0.03660163253700573, 'k': 5, 'epoch': 296, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [29:25<10:59,  8.05s/it]

[32m[I 2020-05-28 02:04:33,852][0m Finished trial#32 with value: 0.6265 with parameters: {'lr': 0.03898659881257962, 'lamda': 0.03204775540690615, 'k': 5, 'epoch': 306, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [29:31<10:59,  8.05s/it]

[32m[I 2020-05-28 02:04:39,864][0m Finished trial#33 with value: 0.631 with parameters: {'lr': 0.018461926855354627, 'lamda': 6.046613082358771e-07, 'k': 5, 'epoch': 460, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [29:35<10:59,  8.05s/it]

[32m[I 2020-05-28 02:04:44,221][0m Finished trial#34 with value: 0.624 with parameters: {'lr': 0.042810018600335835, 'lamda': 5.452364504607166e-06, 'k': 5, 'epoch': 331, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [29:39<10:59,  8.05s/it]

[32m[I 2020-05-28 02:04:48,014][0m Finished trial#35 with value: 0.6275 with parameters: {'lr': 0.011030961797740588, 'lamda': 8.637417002966556e-05, 'k': 5, 'epoch': 284, 'decay': 7}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [29:41<10:59,  8.05s/it]

[32m[I 2020-05-28 02:04:50,673][0m Finished trial#36 with value: 0.626 with parameters: {'lr': 0.019490190034575246, 'lamda': 1.6203798650691462e-05, 'k': 4, 'epoch': 203, 'decay': 6}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [29:45<10:59,  8.05s/it]

[32m[I 2020-05-28 02:04:54,648][0m Finished trial#37 with value: 0.6199999999999999 with parameters: {'lr': 0.09891615206048383, 'lamda': 0.00036013033297483516, 'k': 5, 'epoch': 301, 'decay': 5}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [29:50<10:59,  8.05s/it]

[32m[I 2020-05-28 02:04:59,417][0m Finished trial#38 with value: 0.6114999999999999 with parameters: {'lr': 0.0038750198427461957, 'lamda': 0.0016158944753169083, 'k': 5, 'epoch': 364, 'decay': 3}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [29:52<10:59,  8.05s/it]

[32m[I 2020-05-28 02:05:01,297][0m Finished trial#39 with value: 0.6184999999999999 with parameters: {'lr': 0.030522771839213123, 'lamda': 0.024875285148661518, 'k': 4, 'epoch': 142, 'decay': 9}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [29:56<10:59,  8.05s/it]

[32m[I 2020-05-28 02:05:04,884][0m Finished trial#40 with value: 0.623 with parameters: {'lr': 0.0561166017506943, 'lamda': 2.814648004499813e-06, 'k': 8, 'epoch': 342, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [29:59<10:59,  8.05s/it]

[32m[I 2020-05-28 02:05:08,446][0m Finished trial#41 with value: 0.6315 with parameters: {'lr': 0.007081100688386311, 'lamda': 0.08844140346182397, 'k': 5, 'epoch': 259, 'decay': 5}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [30:02<10:59,  8.05s/it]

[32m[I 2020-05-28 02:05:11,307][0m Finished trial#42 with value: 0.6335 with parameters: {'lr': 0.009482324288319547, 'lamda': 0.47361454484698806, 'k': 5, 'epoch': 213, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [30:05<10:59,  8.05s/it]

[32m[I 2020-05-28 02:05:14,272][0m Finished trial#43 with value: 0.6255 with parameters: {'lr': 0.04134548872957705, 'lamda': 0.27585014648000816, 'k': 5, 'epoch': 220, 'decay': 3}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [30:07<10:59,  8.05s/it]

[32m[I 2020-05-28 02:05:16,550][0m Finished trial#44 with value: 0.6315 with parameters: {'lr': 0.01523813093348651, 'lamda': 0.013342215516174285, 'k': 5, 'epoch': 167, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [30:10<10:59,  8.05s/it]

[32m[I 2020-05-28 02:05:19,152][0m Finished trial#45 with value: 0.5625 with parameters: {'lr': 0.0003310414663092274, 'lamda': 0.693136942711636, 'k': 5, 'epoch': 191, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [30:14<10:59,  8.05s/it]

[32m[I 2020-05-28 02:05:22,919][0m Finished trial#46 with value: 0.581 with parameters: {'lr': 0.0006713658668742326, 'lamda': 5.342847205535451e-07, 'k': 10, 'epoch': 290, 'decay': 3}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [30:16<10:59,  8.05s/it]

[32m[I 2020-05-28 02:05:25,730][0m Finished trial#47 with value: 0.6135 with parameters: {'lr': 0.004020110402622458, 'lamda': 2.0986846080468292e-05, 'k': 4, 'epoch': 215, 'decay': 8}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [30:20<10:59,  8.05s/it]

[32m[I 2020-05-28 02:05:28,911][0m Finished trial#48 with value: 0.6165 with parameters: {'lr': 0.009936585699659906, 'lamda': 0.0001761995026308183, 'k': 5, 'epoch': 237, 'decay': 5}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [30:24<10:59,  8.05s/it]

[32m[I 2020-05-28 02:05:33,239][0m Finished trial#49 with value: 0.6355 with parameters: {'lr': 0.06657958762344081, 'lamda': 0.005287347756255521, 'k': 5, 'epoch': 317, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [30:25<10:59,  8.05s/it]

[32m[I 2020-05-28 02:05:34,666][0m Finished trial#50 with value: 0.632 with parameters: {'lr': 0.07160183051138165, 'lamda': 0.0032635971888946122, 'k': 4, 'epoch': 103, 'decay': 3}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [30:31<10:59,  8.05s/it]

[32m[I 2020-05-28 02:05:40,621][0m Finished trial#51 with value: 0.619 with parameters: {'lr': 0.03546031353138621, 'lamda': 0.04745385476204549, 'k': 5, 'epoch': 456, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [30:36<10:59,  8.05s/it]

[32m[I 2020-05-28 02:05:45,734][0m Finished trial#52 with value: 0.6185 with parameters: {'lr': 0.019504741625672357, 'lamda': 0.005589173418906024, 'k': 5, 'epoch': 389, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [30:41<10:59,  8.05s/it]

[32m[I 2020-05-28 02:05:49,915][0m Finished trial#53 with value: 0.6265 with parameters: {'lr': 0.05340048942663967, 'lamda': 0.0010531290218871554, 'k': 5, 'epoch': 315, 'decay': 5}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [30:42<10:59,  8.05s/it]

[32m[I 2020-05-28 02:05:51,586][0m Finished trial#54 with value: 0.61 with parameters: {'lr': 0.08159071358709007, 'lamda': 0.017640641673109893, 'k': 5, 'epoch': 120, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [30:44<10:59,  8.05s/it]

[32m[I 2020-05-28 02:05:53,716][0m Finished trial#55 with value: 0.637 with parameters: {'lr': 0.027221711851933977, 'lamda': 0.0025866805234710543, 'k': 5, 'epoch': 156, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [30:47<10:59,  8.05s/it]

[32m[I 2020-05-28 02:05:55,823][0m Finished trial#56 with value: 0.6225 with parameters: {'lr': 0.027190823593949778, 'lamda': 0.0024190956482512465, 'k': 5, 'epoch': 150, 'decay': 10}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [30:48<10:59,  8.05s/it]

[32m[I 2020-05-28 02:05:57,248][0m Finished trial#57 with value: 0.5825 with parameters: {'lr': 0.014008470048146487, 'lamda': 0.13511375679508214, 'k': 8, 'epoch': 122, 'decay': 5}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [30:50<10:59,  8.05s/it]

[32m[I 2020-05-28 02:05:59,600][0m Finished trial#58 with value: 0.6285000000000001 with parameters: {'lr': 0.00939318865763919, 'lamda': 0.00855148343220025, 'k': 5, 'epoch': 174, 'decay': 3}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [30:52<10:59,  8.05s/it]

[32m[I 2020-05-28 02:06:01,731][0m Finished trial#59 with value: 0.5885 with parameters: {'lr': 0.005486588809823162, 'lamda': 0.004064089182006921, 'k': 10, 'epoch': 160, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [30:55<10:59,  8.05s/it]

[32m[I 2020-05-28 02:06:04,553][0m Finished trial#60 with value: 0.624 with parameters: {'lr': 0.02219579337785008, 'lamda': 0.0012416264932966835, 'k': 5, 'epoch': 203, 'decay': 6}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [30:58<10:59,  8.05s/it]

[32m[I 2020-05-28 02:06:07,026][0m Finished trial#61 with value: 0.633 with parameters: {'lr': 0.050083755383545356, 'lamda': 0.0003061744813198215, 'k': 5, 'epoch': 182, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [31:01<10:59,  8.05s/it]

[32m[I 2020-05-28 02:06:10,553][0m Finished trial#62 with value: 0.6205 with parameters: {'lr': 0.03500703573234346, 'lamda': 0.5415806950721077, 'k': 5, 'epoch': 265, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [31:05<10:59,  8.05s/it]

[32m[I 2020-05-28 02:06:13,877][0m Finished trial#63 with value: 0.6155 with parameters: {'lr': 0.07431192208275875, 'lamda': 0.031857354684111557, 'k': 5, 'epoch': 249, 'decay': 3}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [31:11<10:59,  8.05s/it]

[32m[I 2020-05-28 02:06:20,436][0m Finished trial#64 with value: 0.6365000000000001 with parameters: {'lr': 0.01591018440146278, 'lamda': 0.007403530074451102, 'k': 5, 'epoch': 497, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [31:18<10:59,  8.05s/it]

[32m[I 2020-05-28 02:06:26,919][0m Finished trial#65 with value: 0.628 with parameters: {'lr': 0.015467574667830946, 'lamda': 2.0306148013021894, 'k': 5, 'epoch': 497, 'decay': 5}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [31:23<10:59,  8.05s/it]

[32m[I 2020-05-28 02:06:32,669][0m Finished trial#66 with value: 0.63 with parameters: {'lr': 0.0287985461196728, 'lamda': 0.008450249360844337, 'k': 4, 'epoch': 465, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [31:29<10:59,  8.05s/it]

[32m[I 2020-05-28 02:06:38,184][0m Finished trial#67 with value: 0.622 with parameters: {'lr': 0.002373324847145159, 'lamda': 0.019696344403259295, 'k': 5, 'epoch': 418, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [31:35<10:59,  8.05s/it]

[32m[I 2020-05-28 02:06:43,888][0m Finished trial#68 with value: 0.633 with parameters: {'lr': 0.000102938516784145, 'lamda': 0.154609720333609, 'k': 5, 'epoch': 438, 'decay': 5}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [31:41<10:59,  8.05s/it]

[32m[I 2020-05-28 02:06:50,118][0m Finished trial#69 with value: 0.627 with parameters: {'lr': 0.012792472678796584, 'lamda': 0.05285913973812496, 'k': 5, 'epoch': 480, 'decay': 3}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [31:47<10:59,  8.05s/it]

[32m[I 2020-05-28 02:06:56,049][0m Finished trial#70 with value: 0.6194999999999999 with parameters: {'lr': 0.008731128744691815, 'lamda': 0.007393094983540938, 'k': 4, 'epoch': 485, 'decay': 5}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [31:51<10:59,  8.05s/it]

[32m[I 2020-05-28 02:07:00,362][0m Finished trial#71 with value: 0.63 with parameters: {'lr': 0.019101732396924937, 'lamda': 0.0006023787257184286, 'k': 5, 'epoch': 330, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [31:56<10:59,  8.05s/it]

[32m[I 2020-05-28 02:07:04,925][0m Finished trial#72 with value: 0.6335 with parameters: {'lr': 0.04759482418428227, 'lamda': 0.00287609758419023, 'k': 5, 'epoch': 346, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [32:00<10:59,  8.05s/it]

[32m[I 2020-05-28 02:07:09,546][0m Finished trial#73 with value: 0.6309999999999999 with parameters: {'lr': 0.024908663557578963, 'lamda': 0.0022758781316174572, 'k': 5, 'epoch': 344, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [32:05<10:59,  8.05s/it]

[32m[I 2020-05-28 02:07:14,514][0m Finished trial#74 with value: 0.628 with parameters: {'lr': 0.04054083044746396, 'lamda': 0.015054800600636624, 'k': 5, 'epoch': 379, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [32:10<10:59,  8.05s/it]

[32m[I 2020-05-28 02:07:19,246][0m Finished trial#75 with value: 0.6289999999999999 with parameters: {'lr': 0.09876754834460008, 'lamda': 0.004319527454038796, 'k': 5, 'epoch': 357, 'decay': 3}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [32:13<10:59,  8.05s/it]

[32m[I 2020-05-28 02:07:22,357][0m Finished trial#76 with value: 0.632 with parameters: {'lr': 0.05705007582495783, 'lamda': 0.0016051181420676302, 'k': 5, 'epoch': 232, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [32:16<10:59,  8.05s/it]

[32m[I 2020-05-28 02:07:25,681][0m Finished trial#77 with value: 0.6110000000000001 with parameters: {'lr': 0.033981578788864125, 'lamda': 0.08981797054382491, 'k': 8, 'epoch': 315, 'decay': 3}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [32:20<10:59,  8.05s/it]

[32m[I 2020-05-28 02:07:29,303][0m Finished trial#78 with value: 0.6359999999999999 with parameters: {'lr': 0.017289961050002384, 'lamda': 0.0001932566899297945, 'k': 5, 'epoch': 272, 'decay': 5}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [32:24<10:59,  8.05s/it]

[32m[I 2020-05-28 02:07:32,922][0m Finished trial#79 with value: 0.5915 with parameters: {'lr': 0.005127265787735655, 'lamda': 1.7079186539223213e-06, 'k': 10, 'epoch': 279, 'decay': 7}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [32:28<10:59,  8.05s/it]

[32m[I 2020-05-28 02:07:36,813][0m Finished trial#80 with value: 0.6275000000000001 with parameters: {'lr': 0.01567051310691922, 'lamda': 0.00011030740446923722, 'k': 5, 'epoch': 294, 'decay': 5}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [32:31<10:59,  8.05s/it]

[32m[I 2020-05-28 02:07:40,400][0m Finished trial#81 with value: 0.623 with parameters: {'lr': 0.022844510519107575, 'lamda': 3.132555353177622e-05, 'k': 5, 'epoch': 266, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [32:34<10:59,  8.05s/it]

[32m[I 2020-05-28 02:07:43,242][0m Finished trial#82 with value: 0.632 with parameters: {'lr': 0.02976653941045158, 'lamda': 0.0008634159504548859, 'k': 5, 'epoch': 211, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [32:37<10:59,  8.05s/it]

[32m[I 2020-05-28 02:07:46,593][0m Finished trial#83 with value: 0.641 with parameters: {'lr': 0.06286294364699495, 'lamda': 0.012888817683454023, 'k': 5, 'epoch': 252, 'decay': 5}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [32:41<10:59,  8.05s/it]

[32m[I 2020-05-28 02:07:50,273][0m Finished trial#84 with value: 0.6140000000000001 with parameters: {'lr': 0.01776300537331865, 'lamda': 0.03363162710756317, 'k': 5, 'epoch': 273, 'decay': 6}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [32:45<10:59,  8.05s/it]

[32m[I 2020-05-28 02:07:54,333][0m Finished trial#85 with value: 0.627 with parameters: {'lr': 0.04703282269734156, 'lamda': 0.00024150128308170988, 'k': 5, 'epoch': 308, 'decay': 5}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [32:49<10:59,  8.05s/it]

[32m[I 2020-05-28 02:07:57,915][0m Finished trial#86 with value: 0.6285 with parameters: {'lr': 0.01129307861690035, 'lamda': 0.005994827950095073, 'k': 4, 'epoch': 287, 'decay': 5}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [32:52<10:59,  8.05s/it]

[32m[I 2020-05-28 02:08:01,268][0m Finished trial#87 with value: 0.6295 with parameters: {'lr': 0.06473998835718699, 'lamda': 0.0124111106372853, 'k': 5, 'epoch': 249, 'decay': 5}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [32:56<10:59,  8.05s/it]

[32m[I 2020-05-28 02:08:05,621][0m Finished trial#88 with value: 0.6285000000000001 with parameters: {'lr': 0.0935373825307126, 'lamda': 7.905976174167839e-06, 'k': 5, 'epoch': 327, 'decay': 4}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [33:00<10:59,  8.05s/it]

[32m[I 2020-05-28 02:08:09,069][0m Finished trial#89 with value: 0.6300000000000001 with parameters: {'lr': 0.008557887981673855, 'lamda': 2.683471724751493e-07, 'k': 5, 'epoch': 257, 'decay': 6}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [33:02<10:59,  8.05s/it]

[32m[I 2020-05-28 02:08:11,580][0m Finished trial#90 with value: 0.634 with parameters: {'lr': 0.02238977012042056, 'lamda': 0.03955770479158774, 'k': 4, 'epoch': 192, 'decay': 8}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [33:05<10:59,  8.05s/it]

[32m[I 2020-05-28 02:08:14,089][0m Finished trial#91 with value: 0.627 with parameters: {'lr': 0.02169010422275982, 'lamda': 0.04242014998654765, 'k': 4, 'epoch': 194, 'decay': 8}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [33:08<10:59,  8.05s/it]

[32m[I 2020-05-28 02:08:17,002][0m Finished trial#92 with value: 0.635 with parameters: {'lr': 0.031212561163854388, 'lamda': 0.021219853375225466, 'k': 4, 'epoch': 227, 'decay': 10}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [33:11<10:59,  8.05s/it]

[32m[I 2020-05-28 02:08:19,978][0m Finished trial#93 with value: 0.628 with parameters: {'lr': 0.031368780690830315, 'lamda': 0.02423069749812372, 'k': 4, 'epoch': 230, 'decay': 9}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [33:14<10:59,  8.05s/it]

[32m[I 2020-05-28 02:08:22,894][0m Finished trial#94 with value: 0.61 with parameters: {'lr': 0.0658256826592812, 'lamda': 0.06826110211179146, 'k': 4, 'epoch': 223, 'decay': 10}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [33:17<10:59,  8.05s/it]

[32m[I 2020-05-28 02:08:25,970][0m Finished trial#95 with value: 0.6335000000000001 with parameters: {'lr': 0.04049368152715293, 'lamda': 0.016180763812146094, 'k': 4, 'epoch': 240, 'decay': 10}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [33:19<10:59,  8.05s/it]

[32m[I 2020-05-28 02:08:28,571][0m Finished trial#96 with value: 0.622 with parameters: {'lr': 0.02489351559814374, 'lamda': 0.009637106935574773, 'k': 4, 'epoch': 199, 'decay': 10}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [33:22<10:59,  8.05s/it]

[32m[I 2020-05-28 02:08:30,926][0m Finished trial#97 with value: 0.6225 with parameters: {'lr': 0.012913871705964561, 'lamda': 0.00478324854262827, 'k': 4, 'epoch': 181, 'decay': 9}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [33:24<10:59,  8.05s/it]

[32m[I 2020-05-28 02:08:32,968][0m Finished trial#98 with value: 0.623 with parameters: {'lr': 0.016876388362920467, 'lamda': 0.09207144879493583, 'k': 4, 'epoch': 154, 'decay': 9}. Best is trial#31 with value: 0.6479999999999999.[0m


 18%|█▊        | 18/100 [33:26<10:59,  8.05s/it]

[32m[I 2020-05-28 02:08:34,809][0m Finished trial#99 with value: 0.6175 with parameters: {'lr': 0.08240156336945363, 'lamda': 0.009234855306404895, 'k': 4, 'epoch': 140, 'decay': 7}. Best is trial#31 with value: 0.6479999999999999.[0m



In [70]:
df = study.trials_dataframe().drop(['state','datetime_start','datetime_complete'], axis=1)
df.sort_values(by=['value'])

Unnamed: 0,number,value,duration,params_decay,params_epoch,params_k,params_lamda,params_lr
5,5,0.5575,00:00:01.477225,6,132,8,1.430231e+00,0.000861
45,45,0.5625,00:00:02.591529,4,191,5,6.931369e-01,0.000331
8,8,0.5760,00:00:04.649339,8,348,5,1.355307e-01,0.000378
16,16,0.5800,00:00:03.592939,5,282,4,9.648241e-01,0.002647
46,46,0.5810,00:00:03.758092,3,290,10,5.342847e-07,0.000671
...,...,...,...,...,...,...,...,...
55,55,0.6370,00:00:02.122079,4,156,5,2.586681e-03,0.027222
7,7,0.6390,00:00:02.769047,10,210,5,1.838053e-02,0.040334
83,83,0.6410,00:00:03.340957,5,252,5,1.288882e-02,0.062863
2,2,0.6445,00:00:05.986411,4,489,4,8.447209e-06,0.030780


In [50]:
lamda=3.660163e-02,lr=0.017019,decay=4,epoch=300,print_every=None


list_in = np.array(list(X_train_.flatten())+list(X_test_.flatten()))
list_in.astype(type(X_train_))
list_in = list_in.reshape(-1,1)
list_in.shape

(3000, 1)

In [48]:
#DNA sequence as a “language”, known as k-mer counting
def getKmers(sequence, size):
    return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)]
def get_n_grams(data1,n):
    X_train = []
    X_test = []

    cv = CountVectorizer(analyzer='char',ngram_range=(n,n))
    for i in data1:
        sentence = ' '.join(getKmers(i[0], size=n))
        X_train.append(sentence)
        
    X_cocat = X_train
    X = cv.fit_transform(X_cocat).toarray()
    return X

X_preprocess = get_n_grams(X_train_,7)

# def objective(trial):
#     lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
#     lamda = trial.suggest_loguniform('lamda', 0.01, 0.5)
#     k =  trial.suggest_categorical('k', [4,5,8,10])
#     epoch =  trial.suggest_int('epoch', 10, 20)
#     decay = trial.suggest_int('decay', 3, 10)
#     return cross_validate(X_preprocess[:2000,:], y,lr=lr,lamda=lamda,k=k,epoch=epoch,decay=decay)

cross_validate(X_preprocess[:2000], y,0.001,20,k=5,epoch=10,decay=10)

# import optuna

# sampler = optuna.samplers.TPESampler()
# study = optuna.create_study(sampler=sampler, direction='maximize')
# study.optimize(func=objective, n_trials=100,show_progress_bar=True)

0.924

In [13]:
# df = study.trials_dataframe().drop(['state','datetime_start','datetime_complete'], axis=1)
# df.sort_values(by=['value'])

In [91]:
# # Count Vectorizer 
# def get_count_grams(data1,n):
#     cv = CountVectorizer(analyzer='char',ngram_range=(n,n))
#     X = cv.fit_transform(data1).toarray()
#     return X

# X_preprocess = get_n_grams(X_train_.flatten(),8)

# # def objective(trial):
# #     lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
# #     lamda = trial.suggest_loguniform('lamda', 0.01, 0.5)
# #     k =  trial.suggest_categorical('k', [4,5,8,10])
# #     epoch =  trial.suggest_int('epoch', 10, 20)
# #     decay = trial.suggest_int('decay', 3, 10)
# #     return cross_validate(X_preprocess, y,lr=lr,lamda=lamda,k=k,epoch=epoch,decay=decay)

# cross_validate(X_preprocess[:2000], y,0.0001,20,k=5,epoch=10,decay=10)

# # import optuna

# # sampler = optuna.samplers.TPESampler()
# # study = optuna.create_study(sampler=sampler, direction='maximize')
# # study.optimize(func=objective, n_trials=200,show_progress_bar=True)

In [201]:
# df = study.trials_dataframe().drop(['state','datetime_start','datetime_complete'], axis=1)
# df.sort_values(by=['value'])

In [202]:
# cross_validate(X_preprocess, y,lr=0.004433,lamda=0.432127,k=4,epoch=16,decay=4)

In [66]:
# Count Vectorizer 
def get_tf_idf_grams(data1,n):
    cv = TfidfVectorizer(analyzer='char',ngram_range=(n,n))
    X = cv.fit_transform(data1).toarray()
    return X

X_preprocess = get_tf_idf_grams(X_train_.flatten(),8)

# def objective(trial):
#     lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
#     lamda = trial.suggest_loguniform('lamda', 0.01, 0.5)
#     k =  trial.suggest_categorical('k', [4,5,8,10])
#     epoch =  trial.suggest_int('epoch', 10, 20)
#     decay = trial.suggest_int('decay', 3, 10)
#     return cross_validate(X_preprocess, y,lr=lr,lamda=lamda,k=k,epoch=epoch,decay=decay)

cross_validate(X_preprocess[:2000], y,0.001,20,k=5,epoch=10,decay=10)

# import optuna

# sampler = optuna.samplers.TPESampler()
# study = optuna.create_study(sampler=sampler, direction='maximize')
# study.optimize(func=objective, n_trials=100,show_progress_bar=True)

0.5

In [213]:
# df = study.trials_dataframe().drop(['state','datetime_start','datetime_complete'], axis=1)
# df.sort_values(by=['value'])

# After testing all possible dataset preprocessing type now lets stick to one

In [None]:
X_train, X_test, y_train, y_test = get_train_test(X_preprocess[:2000],y,0.3)


In [80]:
# X_preprocess = get_n_grams(X_train_,8)
# X_preprocess.shape

# print(cross_validate(X_preprocess[:2000,:], y,lr=0.001,lamda=15,k=4,epoch=16,decay=10))

X_train, X_test, y_train, y_test = get_train_test(np.array(X)[:2000,:],y.values,0.01)

# y,0.001,15,k=5,epoch=10,decay=10)

logistic = logisticregression(X_train,y_train,lamda=3.660163e-02,lr=0.017019,decay=4,epoch=300,print_every=None)
logistic.train()
        
print(logistic.evaluate(X_train,y_train))
print(logistic.evaluate(X_test,y_test))
cross_validate(np.array(X)[:2000,:], y.values,lamda=3.660163e-02,lr=0.017019,decay=4,epoch=300)

(1980, 1500) (20, 1500) (1980, 1) (20, 1)



divide by zero encountered in log


invalid value encountered in multiply



0.7843434343434343
0.5


0.639

In [235]:
X_preprocess = get_count_grams(np.vstack((X_train_,X_test_)).flatten(),6)
X_preprocess.shape

cross_validate(X_preprocess[:2000,:], y,lr=0.004433,lamda=0.432127,k=4,epoch=16,decay=4)

C_count_6 = X_preprocess[2000:,:]

X_train, X_test, y_train, y_test = get_train_test(X_preprocess[:2000,:],y,0.3)


logistic_count6 = logisticregression(X_train,y_train,lamda=0.455265,epoch=10,print_every=1,lr=0.000407,decay=11)
logistic_count6.train()
        
print(logistic_count6.evaluate(X_train,y_train))
print(logistic_count6.evaluate(X_test,y_test))

(1400, 4096) (600, 4096) (1400, 1) (600, 1)
Epoch : 0  Loss: 0.6931471805599454
Epoch : 1  Loss: 0.369619629360289
Epoch : 2  Loss: 0.24093457515641262
Epoch : 3  Loss: 0.23515736618853034
Epoch : 4  Loss: 0.2349297795537864
Epoch : 5  Loss: 0.23492310904273742
Epoch : 6  Loss: 0.23492296082415293
Epoch : 7  Loss: 0.23492295817739842
Epoch : 8  Loss: 0.23492295813789463
Epoch : 9  Loss: 0.2349229581373882
0.9664285714285714
0.6433333333333333


In [236]:
X_preprocess = get_count_grams(np.vstack((X_train_,X_test_)).flatten(),4)
X_preprocess.shape

print(cross_validate(X_preprocess[:2000,:], y,lr=0.004433,lamda=0.432127,k=4,epoch=16,decay=7))

C_count_4 = X_preprocess[2000:,:]

X_train, X_test, y_train, y_test = get_train_test(X_preprocess[:2000,:],y,0.3)


logistic_count4 = logisticregression(X_train,y_train,lamda=0.455265,epoch=10,print_every=1,lr=0.000407,decay=11)
logistic_count4.train()
        
print(logistic_count4.evaluate(X_train,y_train))
print(logistic_count4.evaluate(X_test,y_test))

(1400, 256) (600, 256) (1400, 1) (600, 1)
Epoch : 0  Loss: 0.6931471805599454
Epoch : 1  Loss: 0.6286425932017687
Epoch : 2  Loss: 0.6042153578172307
Epoch : 3  Loss: 0.6027152714606171
Epoch : 4  Loss: 0.6026532220481413
Epoch : 5  Loss: 0.6026514005159469
Epoch : 6  Loss: 0.6026513600396904
Epoch : 7  Loss: 0.6026513593169011
Epoch : 8  Loss: 0.6026513593061132
Epoch : 9  Loss: 0.6026513593059748
0.6907142857142857
0.59



divide by zero encountered in log


invalid value encountered in multiply



In [237]:
X_preprocess = get_tf_idf_grams(np.vstack((X_train_,X_test_)).flatten(),4)
X_preprocess.shape

cross_validate(X_preprocess[:2000,:], y,lr=0.004433,lamda=0.432127,k=4,epoch=16,decay=4)

C_tf_4 = X_preprocess[2000:,:]


X_train, X_test, y_train, y_test = get_train_test(X_preprocess[:2000,:],y,0.3)


logistic_tf4 = logisticregression(X_train,y_train,lamda=0.455265,epoch=10,print_every=1,lr=0.000407,decay=11)
logistic_tf4.train()
        
print(logistic_tf4.evaluate(X_train,y_train))
print(logistic_tf4.evaluate(X_test,y_test))

(1400, 256) (600, 256) (1400, 1) (600, 1)
Epoch : 0  Loss: 0.6931471805599454
Epoch : 1  Loss: 0.6260173199159424
Epoch : 2  Loss: 0.6002614260404124
Epoch : 3  Loss: 0.598726394674755
Epoch : 4  Loss: 0.5986622185693936
Epoch : 5  Loss: 0.5986603339307318
Epoch : 6  Loss: 0.5986602920517511
Epoch : 7  Loss: 0.598660291303913
Epoch : 8  Loss: 0.5986602912927511
Epoch : 9  Loss: 0.5986602912926081
0.7028571428571428
0.5966666666666667


In [238]:
X_preprocess = get_tf_idf_grams(np.vstack((X_train_,X_test_)).flatten(),6)
X_preprocess.shape

cross_validate(X_preprocess[:2000,:], y,lr=0.004433,lamda=0.432127,k=4,epoch=16,decay=7)

C_tf_6 = X_preprocess[2000:,:]

X_train, X_test, y_train, y_test = get_train_test(X_preprocess[:2000,:],y,0.3)


logistic_tf6 = logisticregression(X_train,y_train,lamda=0.455265,epoch=10,print_every=1,lr=0.000407,decay=11)
logistic_tf6.train()
        
print(logistic_tf6.evaluate(X_train,y_train))
print(logistic_tf6.evaluate(X_test,y_test))

(1400, 4096) (600, 4096) (1400, 1) (600, 1)
Epoch : 0  Loss: 0.6931471805599454
Epoch : 1  Loss: 0.3746954683721624
Epoch : 2  Loss: 0.23994925474338094
Epoch : 3  Loss: 0.23422424550847226
Epoch : 4  Loss: 0.23399751150543982
Epoch : 5  Loss: 0.23399086485544413
Epoch : 6  Loss: 0.23399071716632397
Epoch : 7  Loss: 0.2339907145290239
Epoch : 8  Loss: 0.2339907144896612
Epoch : 9  Loss: 0.23399071448915654
0.9635714285714285
0.6433333333333333


In [15]:
X_test_final = scale(np.array(X)[:2000,:])

In [17]:
from collections import Counter


sumbission = []
for i in range(len(X_test_final)):
    r1 = logistic.predict(X_test_final[i])
#     r2 = logistic_count6.predict(C_count_6[i])
#     r3 = logistic_tf4.predict(C_tf_4[i])
#     r4 = logistic_tf6.predict(C_tf_6[i])
    
    
#     votes = [r1[0],r2[0],r3[0],r4[0]]
    
#     print(Counter(votes))
#     print(Counter(votes).most_common(1)[0][0])
    
#     break
#     sumbission.append([i,int(Counter(votes).most_common(1)[0][0])])
    
    sumbission.append([i,int(r1)])

In [20]:
# sumbission
df = pd.DataFrame(sumbission)
df.columns = ['Id','Bound']
df.to_csv('cv_64.9.csv',index=False)

In [19]:
df.head(50)

Unnamed: 0,Id,Bound
0,0,1
1,1,1
2,2,0
3,3,0
4,4,1
5,5,1
6,6,1
7,7,1
8,8,1
9,9,1
