In [None]:
import numpy as np
import pandas as pd
from math import log


class LogisticRegressionBatchGd:
    """Binary logistic regression trained with full-batch gradient descent.

    The model expects targets encoded as 0/1 (the loss is the binary
    cross-entropy ``y*log(p) + (1-y)*log(1-p)``) and a feature matrix that
    already contains any bias column the caller wants.

    Attributes:
        alpha: Step size applied to the gradient at every iteration.
        max_iter: Index of the last epoch; ``fit`` runs epochs
            ``0..max_iter`` inclusive.
        verbose: When truthy, ``fit`` prints the loss every 150 epochs.
        learning_rate: Stored as given; no schedule is implemented, the step
            size is always ``alpha``.
        thetas: Weight vector. May be assigned before ``fit`` to choose the
            starting point; when left empty ``fit`` starts from zeros.
        loss: Loss recorded at every epoch of the most recent ``fit`` call.
    """

    def __init__(self, alpha=0.001, max_iter=1000, verbose=False, learning_rate='constant'):
        self.alpha = alpha
        self.max_iter = max_iter
        self.verbose = verbose
        self.learning_rate = learning_rate
        self.thetas = []
        self.loss = []

    def __sigmoid_(self, x):
        """Return the element-wise logistic function ``1 / (1 + exp(-x))``."""
        # np.exp may emit an overflow RuntimeWarning for very negative x; the
        # result still saturates to 0.
        return 1 / (1 + np.exp(-x))

    def __vec_log_loss_(self, y_true, y_pred, m, eps=1e-15):
        """Return the mean binary cross-entropy as a plain float.

        Args:
            y_true: Target labels (0/1); any shape, flattened internally.
            y_pred: Predicted probabilities, same number of elements.
            m: Number of samples used as the divisor.
            eps: Small constant added inside the logs to avoid ``log(0)``.
        """
        y_true = np.asarray(y_true, dtype=float).ravel()
        y_pred = np.asarray(y_pred, dtype=float).ravel()
        per_sample = y_true * np.log(y_pred + eps) + (1 - y_true) * np.log(1 - y_pred + eps)
        # Always reduce to a scalar so the return type does not depend on the
        # dimensionality of the inputs or on m being 1.
        return float(-np.sum(per_sample) / m)

    def __vec_log_gradient_(self, x, y_true, y_pred):
        """Return the summed (not averaged) gradient ``(y_pred - y_true) . x``.

        Because the gradient is not divided by the number of samples, the
        effective step size grows with the size of the training set.
        """
        return np.dot((y_pred - y_true), x)

    def fit(self, x_train, y_train):
        """Train the weights with batch gradient descent.

        Args:
            x_train: 2-D array-like of shape (n_samples, n_features).
            y_train: Array-like with n_samples 0/1 targets (1-D, column
                vector or pandas Series).

        Raises:
            ValueError: If the inputs are empty, have inconsistent lengths, or
                a pre-set ``thetas`` does not match the number of features.
        """
        x_train = np.asarray(x_train, dtype=float)
        # Flatten so that a column-vector target cannot broadcast to an
        # (m, m) matrix inside the gradient.
        y_train = np.asarray(y_train, dtype=float).ravel()
        if x_train.ndim != 2:
            raise ValueError('x_train must be a 2-D array')
        n_samples, n_features = x_train.shape
        if n_samples == 0:
            raise ValueError('x_train must contain at least one sample')
        if y_train.shape[0] != n_samples:
            raise ValueError('x_train and y_train must have the same number of samples')

        if np.size(self.thetas) == 0:
            # No starting point supplied by the caller.
            self.thetas = np.zeros(n_features)
        else:
            # Copy as float so the caller's array is not mutated and integer
            # initial weights can receive fractional updates.
            self.thetas = np.array(self.thetas, dtype=float).ravel()
            if self.thetas.shape[0] != n_features:
                raise ValueError('thetas must have one weight per feature')

        self.loss = []
        for iters in range(self.max_iter + 1):
            y_pred = self.predict(x_train)
            self.thetas = self.thetas - self.alpha * self.__vec_log_gradient_(x_train, y_train, y_pred)
            # The loss is measured on the predictions made before this update.
            loss = self.__vec_log_loss_(y_train, y_pred, n_samples, eps=1e-15)
            self.loss.append(loss)
            if self.verbose and iters % 150 == 0:
                print('epoch {} : loss {}'.format(iters, loss))

    def predict(self, x_train):
        """Return the predicted probability of class 1 for each row.

        The values are probabilities in (0, 1), not class labels.
        """
        z = np.dot(x_train, self.thetas)
        return self.__sigmoid_(z)

    def score(self, x, y, threshold=0.5):
        """Return the classification accuracy on ``(x, y)``.

        Args:
            x: Feature matrix accepted by ``predict``.
            y: True 0/1 labels, one per row of ``x``.
            threshold: Probability at or above which a row is labelled 1.

        Raises:
            ValueError: If ``y`` is empty.
        """
        y_true = np.asarray(y).ravel()
        if y_true.size == 0:
            raise ValueError('y must contain at least one label')
        # Probabilities must be turned into labels before they can be compared
        # with the targets.
        y_label = (np.asarray(self.predict(x)).ravel() >= threshold).astype(int)
        return float(np.mean(y_label == y_true))


# Load the pre-split datasets (no header row): column 0 holds the target and
# columns 1..81 hold the features.
df_train = pd.read_csv('../resources//train_dataset_clean.csv', delimiter=',',
                       header=None, index_col=False)
x_train, y_train = np.array(df_train.iloc[:, 1:82]), df_train.iloc[:, 0]
df_test = pd.read_csv('../resources/test_dataset_clean.csv', delimiter=',', header=None,
                      index_col=False)
x_test, y_test = np.array(df_test.iloc[:, 1:82]), df_test.iloc[:, 0]
# Prepend a constant column of ones so that the first weight acts as the bias.
x_train = np.insert(x_train, 0, 1.0, axis=1)
x_test = np.insert(x_test, 0, 1.0, axis=1)


def _accuracy(fitted_model, features, targets, threshold=0.5):
    """Return the fraction of rows whose thresholded probability equals the target.

    `predict` returns probabilities, so they are converted to labels before
    being compared. Assumes the targets are encoded as 0/1 -- TODO confirm
    against the dataset.
    """
    predicted_labels = (fitted_model.predict(features) >= threshold).astype(int)
    return float((predicted_labels == np.asarray(targets)).mean())


# We set our model with our hyperparameters : alpha, max_iter, verbose and learning_rate
model = LogisticRegressionBatchGd(alpha=0.001, max_iter=15000, verbose=True, learning_rate='constant')
# We fit our model to our dataset and display the score for the train and test datasets
model.thetas = np.ones(x_train.shape[1])
model.fit(x_train, y_train)
print(f'Score on train dataset : {_accuracy(model, x_train, y_train)}')
# Evaluate on the held-out features so predictions and y_test refer to the same rows.
y_pred = model.predict(x_test)
print(y_pred.size)
print(y_test.size)
print(f'Score on test dataset : {_accuracy(model, x_test, y_test)}')

epoch 0 : loss 2.711028065632692
epoch 150 : loss 0.935573616989867
epoch 300 : loss 0.9291594948612015
epoch 450 : loss 0.9268586737297158
epoch 600 : loss 0.9257248476307497
epoch 750 : loss 0.9250155938147973
epoch 900 : loss 0.9245111890003629
epoch 1050 : loss 0.9241277108985992
epoch 1200 : loss 0.9238243123279398
epoch 1350 : loss 0.9235774051604406
epoch 1500 : loss 0.9233719838154981
epoch 1650 : loss 0.9231979667181069
epoch 1800 : loss 0.9230483538800716
epoch 1950 : loss 0.9229181236337788
epoch 2100 : loss 0.9228036104725624
epoch 2250 : loss 0.9227020820487567
epoch 2400 : loss 0.9226114387531494
epoch 2550 : loss 0.9225300290078275
epoch 2700 : loss 0.9224565596310702
epoch 2850 : loss 0.9223899740098948
epoch 3000 : loss 0.9223293803180658
epoch 3150 : loss 0.9222740904727086
epoch 3300 : loss 0.9222234718722943
epoch 3450 : loss 0.9221770062453721
epoch 3600 : loss 0.9221342604079639
epoch 3750 : loss 0.9220948381943443
epoch 3900 : loss 0.9220584049297934
epoch 4050 :

In [65]:
# Count training rows whose predicted probability is exactly 0.
np.count_nonzero(model.predict(x_train) == 0)

0

In [64]:
# Predicted probabilities for every training row.
model.predict(x_train)

array([0.99767515, 0.91583944, 0.95848459, ..., 0.99633604, 0.5857667 ,
       0.99925398])

In [60]:

# Number of training targets equal to 1.
int((y_train == 1).sum())

7841

In [27]:
# Display the training feature matrix (numpy abbreviates large arrays in the repr).
x_train

array([[ 0.83710898,  1.13473876, -0.14592048, ...,  0.        ,
         0.        ,  0.        ],
       [-0.04264203, -0.42005962, -0.14592048, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.05704673, -1.19745882, -0.14592048, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.42360965, -0.42005962, -0.14592048, ...,  0.        ,
         0.        ,  0.        ],
       [-1.21564337, -0.42005962, -0.14592048, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.98373415, -0.42005962,  1.88842434, ...,  0.        ,
         0.        ,  0.        ]])