In [1]:
!pip install optuna -q

[K     |████████████████████████████████| 184kB 2.9MB/s eta 0:00:01
[K     |████████████████████████████████| 1.1MB 11.7MB/s eta 0:00:01
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 81kB 7.3MB/s  eta 0:00:01
[K     |████████████████████████████████| 81kB 6.4MB/s  eta 0:00:01
[K     |████████████████████████████████| 51kB 6.0MB/s  eta 0:00:01
[K     |████████████████████████████████| 112kB 16.2MB/s eta 0:00:01
[K     |████████████████████████████████| 61kB 6.6MB/s  eta 0:00:01
[?25h  Building wheel for alembic (PEP 517) ... [?25l[?25hdone
  Building wheel for optuna (setup.py) ... [?25l[?25hdone
  Building wheel for pyperclip (setup.py) ... [?25l[?25hdone


In [36]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.metrics import accuracy_score

import optuna

In [37]:
X_test_mat100 = pd.read_csv('../data/Xte_mat100.csv',sep=' ',header=None).values
X_train_mat100 = pd.read_csv('../data/Xtr_mat100.csv',sep=' ',header=None).values


# X_test = pd.read_csv('../data/Xte.csv',sep=',',index_col=0).values
# X_train = pd.read_csv('../data/Xtr.csv',sep=',',index_col=0).values

y = pd.read_csv('../data/Ytr.csv',sep=',',index_col=0).values

In [38]:
print('x_train: {} y_train {}'.format(X_train_mat100.shape,y.shape))
# print('x_train: {} y_train {}'.format(X_train.shape,y.shape))
print('x_test: {}'.format(X_test_mat100.shape))

x_train: (2000, 100) y_train (2000, 1)
x_test: (1000, 100)


In [39]:
X_train_mat100 = scale(X_train_mat100)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_mat100, y, test_size=0.1, random_state=42)

print(X_train.shape,X_val.shape,y_train.shape, y_val.shape)

(1800, 100) (200, 100) (1800, 1) (200, 1)


### Model Test

In [44]:
class logisticregression():
    def __init__(self,train_data,train_labels,lamda=0.2,lr=0.01,decay=10,batch_size=None,epoch=10,print_every = 10):
        dummy_once = np.ones((len(train_data),1))
        self.train_data = np.hstack((dummy_once,train_data))
        self.train_labels = train_labels
        
        self.params = np.zeros((len(self.train_data[0]),1))
        
        self.lr = lr
        self.epoch = epoch
        self.batch_size = batch_size
        self.print_every = print_every
        self._lambda = lamda
        self.decay = decay
        
    def sigmoid(self,x):
        return 1/(1+np.exp(-x))
    
    def cost(self,y,y_pred):
        return -np.mean(y*np.log(y_pred)+(1-y)*np.log(1-y_pred))
    
    def gradient(self,y,y_pred,x):
        hassien = np.dot(y_pred.T,(1-y_pred))*np.linalg.inv(np.dot(x.T,x))
        return np.dot(hassien,np.dot(x.T,(y_pred-y)))+(2*self._lambda*self.params)
    
    def train(self):
        for i in range(self.epoch):
            y_pred = self.sigmoid(np.dot(self.train_data,self.params))
            loss = self.cost(self.train_labels,y_pred)
            
            gra = self.gradient(self.train_labels,y_pred,self.train_data)
            
            self.params -= self.lr*gra
            
            self.lr *= (1. / (1. + self.decay * i))
            
            if self.print_every:
                if i%self.print_every == 0 or i == self.epoch-1:
                    print('Epoch : {}  Loss: {}'.format(i,loss))
    def predict(self,test_data):
        result = self.sigmoid(np.dot(test_data,self.params[1:])+self.params[0])
        result[result > 0.5 ] = 1
        result[result <= 0.5 ] = 0
        return result
    
    def evaluate(self,test_data,labels):
        accuracy = accuracy_score(self.predict(test_data),labels)
        return accuracy

In [45]:
def cross_validate(x_data,y_data,lr,lamda=0.2,epoch=10,k=5,decay=10):
    if len(x_data)%k != 0:
        print('cant vsplit',len(x_data),' by ',k)
        return
    
    x_data_splitted = np.vsplit(x_data,k)
    y_data_splitted = np.vsplit(y_data,k)
    
    aggrigate_result = []
    for i in range(len(x_data_splitted)):
        train = []
        test = []
        items = [j for j in range(len(x_data_splitted)) if j !=i ]
        x_test = x_data_splitted[i]
        y_test = y_data_splitted[i]
        for item in items:
            if len(train) == 0:
                x_train = x_data_splitted[item]
                y_train = y_data_splitted[item]
            else:
                x_train = np.concatenate((x_train,x_data_splitted[item]), axis=0)
                y_train = np.concatenate((y_train,y_data_splitted[item]), axis=0)
            
        logistic = logisticregression(x_train,y_train,lamda=lamda,lr=lr,decay=decay,epoch=epoch,print_every=None)
        logistic.train()
        
        result = logistic.evaluate(x_test,y_test)
        aggrigate_result.append(result)
        
        value = sum(aggrigate_result)/len(aggrigate_result)
    return value if value!= None else 0

In [46]:
def objective(trial):
    lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
    lamda = trial.suggest_loguniform('lamda', 0.01, 0.5)
    k =  trial.suggest_categorical('k', [4,5,8,10])
    epoch =  trial.suggest_int('epoch', 10, 20)
    decay = trial.suggest_int('decay', 3, 10)
    return cross_validate(X_train_mat100, y,lr=lr,lamda=lamda,k=k,epoch=epoch,decay=decay)

In [47]:
# cross_validate(X_train_mat100, y,0.001,10)

import optuna

sampler = optuna.samplers.TPESampler()
study = optuna.create_study(sampler=sampler, direction='maximize')
study.optimize(func=objective, n_trials=100,show_progress_bar=True)


Progress bar is experimental (supported from v1.2.0). The interface can change in the future.



HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[32m[I 2020-05-25 21:13:41,553][0m Finished trial#0 with value: 0.549 with parameters: {'lr': 0.01799227840699826, 'lamda': 0.039554372946756323, 'k': 8, 'epoch': 11, 'decay': 6}. Best is trial#0 with value: 0.549.[0m
[32m[I 2020-05-25 21:13:41,787][0m Finished trial#1 with value: 0.5625 with parameters: {'lr': 6.295925826193235e-05, 'lamda': 0.06005922386526236, 'k': 4, 'epoch': 20, 'decay': 9}. Best is trial#1 with value: 0.5625.[0m
[32m[I 2020-05-25 21:13:42,022][0m Finished trial#2 with value: 0.5685 with parameters: {'lr': 0.0002100871699434587, 'lamda': 0.17148645254518677, 'k': 5, 'epoch': 16, 'decay': 10}. Best is trial#2 with value: 0.5685.[0m
[32m[I 2020-05-25 21:13:42,259][0m Finished trial#3 with value: 0.5675000000000001 with parameters: {'lr': 0.007266199858132005, 'lamda': 0.2919960688968443, 'k': 5, 'epoch': 18, 'decay': 3}. Best is trial#2 with value: 0.5685.[0m
[32m[I 2020-05-25 21:13:42,563][0m Finished trial#4 with value: 0.5495000000000001 with parame

In [17]:
df = study.trials_dataframe().drop(['state','datetime_start','datetime_complete'], axis=1)

df.sort_values(by=['value'])

Unnamed: 0,number,value,duration,params_decay,params_epoch,params_k,params_lamda,params_lr
88,88,0.5000,00:00:00.160833,10,15,5,0.161073,0.093368
3,3,0.5005,00:00:00.138507,7,20,5,0.010428,0.049403
6,6,0.5005,00:00:00.129669,10,16,4,0.151531,0.041347
7,7,0.5055,00:00:00.130993,5,19,4,0.494667,0.034418
26,26,0.5070,00:00:00.135779,9,18,5,0.027088,0.012938
...,...,...,...,...,...,...,...,...
102,102,0.5955,00:00:00.137702,10,13,5,0.105406,0.000414
93,93,0.5955,00:00:00.128045,10,13,5,0.304025,0.000414
117,117,0.5955,00:00:00.134048,7,12,5,0.299451,0.000401
165,165,0.5960,00:00:00.149376,10,13,5,0.405698,0.000422


In [23]:
cross_validate(X_train_mat100,y,lamda=0.455265,epoch=50,lr=0.000407,decay=10)

0.595

In [49]:
# X_train, X_test, y_train, y_test
# logistic = logisticregression(X_train,y_train,lamda=0.362124,epoch=15,print_every=1,lr=0.000254)
#
logistic = logisticregression(X_train,y_train,lamda=0.455265,epoch=50,print_every=1,lr=0.000407,decay=11)
logistic.train()
        
print(logistic.evaluate(X_train,y_train))
print(logistic.evaluate(X_val,y_val))

Epoch : 0  Loss: 0.6931471805599452
Epoch : 1  Loss: 0.6867708235098253
Epoch : 2  Loss: 0.6809385491742812
Epoch : 3  Loss: 0.6804877037751971
Epoch : 4  Loss: 0.6804687383928599
Epoch : 5  Loss: 0.6804681813613453
Epoch : 6  Loss: 0.6804681690827427
Epoch : 7  Loss: 0.68046816886251
Epoch : 8  Loss: 0.6804681688590899
Epoch : 9  Loss: 0.6804681688590469
Epoch : 10  Loss: 0.6804681688590464
Epoch : 11  Loss: 0.6804681688590466
Epoch : 12  Loss: 0.6804681688590466
Epoch : 13  Loss: 0.6804681688590466
Epoch : 14  Loss: 0.6804681688590466
Epoch : 15  Loss: 0.6804681688590466
Epoch : 16  Loss: 0.6804681688590466
Epoch : 17  Loss: 0.6804681688590466
Epoch : 18  Loss: 0.6804681688590466
Epoch : 19  Loss: 0.6804681688590466
Epoch : 20  Loss: 0.6804681688590466
Epoch : 21  Loss: 0.6804681688590466
Epoch : 22  Loss: 0.6804681688590466
Epoch : 23  Loss: 0.6804681688590466
Epoch : 24  Loss: 0.6804681688590466
Epoch : 25  Loss: 0.6804681688590466
Epoch : 26  Loss: 0.6804681688590466
Epoch : 27  L

In [28]:
X_test_mat100 = pd.read_csv('../data/Xte_mat100.csv',sep=' ',header=None).values

X_test_mat100[0,:]

X_test_mat100 = scale(X_test_mat100)

In [29]:

sumbission = []
for i in range(len(X_test_mat100)):
    result = logistic.predict(X_test_mat100[i])
    sumbission.append([i,int(result)])
    result

In [30]:
# sumbission
df = pd.DataFrame(sumbission)
df.columns = ['Id','Bound']
df.to_csv('test_59.5_cross_validated.csv',index=False)

In [31]:
df.head(50)

Unnamed: 0,Id,Bound
0,0,1
1,1,0
2,2,0
3,3,0
4,4,0
5,5,1
6,6,0
7,7,1
8,8,1
9,9,0


# LR

In [55]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(penalty = 'l2')
clf.fit(X_train,y_train)
clf.score(X_val,y_val)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



0.585