In [82]:
import numpy as np
import pandas as pd

In [83]:
# hyperparams
my_seed = 881003
valid_ratio = 0.1
eps = 1e-8
min_clip_value = 1e-7
max_clip_value = 1 - 1e-7
X_train_fpath = './data/X_train'
y_train_fpath = './data/Y_train'
X_test_fpath = './data/X_test'
y_test_fpath = './output_{}.csv'

In [84]:
# set numpy seed
np.random.seed(my_seed)

In [85]:
# read datasets
X_train = pd.read_csv(X_train_fpath, index_col=['id'])
X_test = pd.read_csv(X_test_fpath, index_col=['id'])
y_train = pd.read_csv(y_train_fpath, index_col=['id']).to_numpy().astype('float64').flatten()

In [86]:
train_size = X_train.shape[0]

In [87]:
X = pd.concat([X_train, X_test])

In [88]:
count = X.nunique()
one = count[count == 1].index
two = count[count == 2].index
three_or_more = count[count >= 3].index

In [89]:
X = X.drop(columns=one)
X[three_or_more] = (X[three_or_more] - X[three_or_more].mean()) / X[three_or_more].std()

In [90]:
X_train, X_test = X.iloc[:train_size, :], X.iloc[train_size:, :]

In [91]:
X_train = X_train.to_numpy().astype('float64')
X_test = X_test.to_numpy().astype('float64')

In [72]:
# define some helper function as we can't use sklearn...
# not needed
def normalize(X, is_train=True, columns=None, X_mean=None, X_std=None):
    if columns is None:
        columns = np.arange(X.shape[1])
    if is_train:
        X_mean = np.mean(X[:, columns], axis=0).reshape(1, -1)
        X_std = np.std(X[:, columns], axis=0).reshape(1, -1)
    X[:, columns] = (X[:, columns] - X_mean) / (X_std + eps)
    return X, X_mean, X_std # X_mean, X_std: (1 * feature_num)
def train_test_split(X, y, test_size=0.25):
    permu = np.arange(X.shape[0])#np.random.permutation(X.shape[0])
    train_size = int(X.shape[0] * (1 - test_size) + 0.5)
    return X[permu[:train_size]], X[permu[train_size:]], y[permu[:train_size]], y[permu[train_size:]]

In [73]:
# Normalize them and split into train and validate
#X_train, X_mean, X_std = normalize(X_train)
#X_test, _, _ = normalize(X_test, is_train=False, X_mean=X_mean, X_std=X_std)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=valid_ratio)

# Check shapes
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

(48830, 508) (5426, 508) (48830,) (5426,)


In [92]:
print(X_train.shape, y_train.shape)

(54256, 508) (54256,)


In [93]:
# helper function for training
def shuffle(X, y):
    permu = np.random.permutation(X.shape[0])
    return (X[permu], y[permu])
def sigmoid(z):
    return np.clip(1 / (1.0 + np.exp(-z)), min_clip_value, max_clip_value)
def accuracy(y_pred, y_true):
    return 1.0 - np.mean(np.abs(y_pred - y_true))
def cross_entropy_loss(y_pred, y_true):
    return -np.dot(y_true, np.log(y_pred)) - np.dot((1 - y_true), np.log(1 - y_pred))

In [94]:
class LogisticRegression:
    def __init__(self):
        self.w = None
        self.b = None
    def f(self, X, w, b):
        return sigmoid(X @ w + b)
    def predict(self, X, w, b):
        return np.round(self.f(X, w, b)).astype(np.int)
    def predict_f(self, X):
        return np.round(self.f(X, self.w, self.b)).astype(np.int)
    def gradient(self, X, y, w, b):
        y_pred = self.f(X, w, b)
        y_diff = y - y_pred
        w_grad = -np.sum(y_diff * X.T, axis=1)
        b_grad = -np.sum(y_diff)
        return (w_grad, b_grad)
    def fit(self, X_train, y_train, X_valid=None, y_valid=None, epochs=10, batch_size=10, learning_rate=0.2, print_every=1):
        train_loss, valid_loss = [], []
        train_acc, valid_acc = [], []
        w, b = np.zeros((X_train.shape[1],)), np.zeros((1,))
        step, bst, bstf, bt = 1, 0, None, 0
        mw, mb = np.zeros((X_train.shape[1],)), np.zeros((1,))
        vw, vb = np.zeros((X_train.shape[1],)), np.zeros((1,))
        b1, b2, eps = 0.9, 0.999, 1e-9
        
        for epoch in range(epochs):
            X_train, y_train = shuffle(X_train, y_train)
            for i in range(0, X_train.shape[0], batch_size):
                X = X_train[i * batch_size:(i + 1) * batch_size]
                y = y_train[i * batch_size:(i + 1) * batch_size]
        
                w_grad, b_grad = self.gradient(X, y, w, b)
            
                #mw = mw * b1 + w_grad * (1 - b1)
                #vw = vw * b2 + w_grad ** 2 * (1 - b2)
                #mhat = mw / (1 - b1)
                #vhat = vw / (1 - b2)              
                #w -= learning_rate * mhat / (np.sqrt(vhat) + eps)
                #
                #mb = mb * b1 + b_grad * (1 - b1)
                #vb = vb * b2 + b_grad ** 2 * (1 - b2)
                #mhat = mb / (1 - b1)
                #vhat = vb / (1 - b2)              
                #b -= learning_rate * mhat / (np.sqrt(vhat) + eps)
                
                w -= learning_rate / np.sqrt(step) * w_grad
                b -= learning_rate / np.sqrt(step) * b_grad
                
                step += 1
        
            y_pred = self.f(X_train, w, b)
            train_loss.append(cross_entropy_loss(y_pred, y_train) / X_train.shape[0])
            train_acc.append(accuracy(np.round(y_pred), y_train))
            
            if X_valid is not None:
                y_pred = self.f(X_valid, w, b)
                valid_loss.append(cross_entropy_loss(y_pred, y_valid) / X_valid.shape[0])
                valid_acc.append(accuracy(np.round(y_pred), y_valid))
            
                if valid_acc[-1] > bst:
                    bst = valid_acc[-1]
                    bstf = (w, b)
                    bt = epoch

            if (epoch + 1) % print_every == 0:
                if X_valid is not None:
                    print(epoch, train_loss[-1], train_acc[-1], valid_loss[-1], valid_acc[-1])
                else:
                    print(epoch, train_loss[-1], train_acc[-1])
        
        if X_valid is not None:
            self.w = bstf[0]
            self.b = bstf[1]
            print(bt)
        else:
            self.w = w
            self.b = b
        return train_loss, train_acc, valid_loss, valid_acc
                

In [95]:
model = LogisticRegression()

In [81]:
train_loss, train_acc, valid_loss, valid_acc = model.fit(X_train, y_train, X_valid, y_valid, batch_size=10, epochs=60, print_every=1, learning_rate=0.5)

0 0.2906037606558233 0.875773090313332 0.2986431729177455 0.8730187983781792
1 0.28280621824039925 0.8778005324595536 0.2910976466490936 0.8741245853298931
2 0.28147344066456575 0.8783944296539012 0.29148359037259375 0.876889052709178
3 0.2797851759518624 0.8794593487610076 0.289593828772404 0.877441946185035
4 0.2796578944504862 0.8804833094409175 0.29030162670727305 0.875783265757464
5 0.2806168646906679 0.8787220970714724 0.2907748064517746 0.8730187983781792
6 0.27701800981108887 0.8798484538193734 0.2868627043280211 0.8763361592333211
7 0.2772403245433671 0.8801761212369446 0.2865012732319169 0.8763361592333211
8 0.2772882565826631 0.8800327667417571 0.2882013018206433 0.8750460744563214
9 0.2757492156325615 0.8809133729264796 0.2853691051625078 0.8783634353114633
10 0.27542193949303057 0.8805652262953103 0.2853670198557576 0.875783265757464
11 0.2747274795559469 0.8815277493344256 0.2858353179580257 0.8763361592333211
12 0.2738727235905559 0.8817530206840057 0.2846793697244961 0.

In [96]:
tl, ta, _, _ = model.fit(X_train, y_train, batch_size=10, epochs=60, print_every=1, learning_rate=0.5)
print(tl[-1], ta[-1])

0 0.3078077598611844 0.8700788852845769
1 0.29147566323790997 0.8749078442937187
2 0.2982283703622238 0.8721800353877912
3 0.28826844006747837 0.8752211736950752
4 0.281681877371279 0.8792207313476851
5 0.2818674735411991 0.8792023002064288
6 0.2798897268276171 0.8790917133588911
7 0.2795960488187585 0.8793681804777351
8 0.2784088810306598 0.8798842524329107
9 0.27920856912813663 0.8786862282512533
10 0.2772391559367418 0.8805846358006488
11 0.27753210351264185 0.880603066941905
12 0.27641930113092567 0.8804740489531112
13 0.2766603153037415 0.8797920967266293
14 0.275947272161964 0.8811007077558243
15 0.2761207237759734 0.8820959893836626
16 0.27563483790783294 0.8815983485697434
17 0.27953806058869446 0.8801607195517547
18 0.27494187671272097 0.8805477735181363
19 0.27493392972011055 0.8814508994396933
20 0.2744056983650432 0.8814140371571808
21 0.27395711901185493 0.8810454143320554
22 0.27398528371111097 0.8811744323208494
23 0.27455585663926313 0.8818748156885874
24 0.273638354524

In [97]:
y_test_pred = pd.DataFrame(model.predict_f(X_test).transpose())
y_test_pred.columns = ['label']
y_test_pred['id'] = range(0, X_test.shape[0])
y_test_pred = y_test_pred.reindex(columns=['id', 'label'])
y_test_pred.to_csv(y_test_fpath.format('logistic'), index=None)