In [1]:
import numpy as np
import pandas as pd

In [2]:
# hyperparams
my_seed = 880301
valid_ratio = 0.1
eps = 1e-8
min_clip_value = 1e-7
max_clip_value = 1 - 1e-7
X_train_fpath = './data/X_train'
y_train_fpath = './data/Y_train'
X_test_fpath = './data/X_test'
y_test_fpath = './output_{}.csv'

In [3]:
# set numpy seed
np.random.seed(my_seed)

In [4]:
# read datasets
X_train = pd.read_csv(X_train_fpath, index_col=['id']).to_numpy().astype('float64')
X_test = pd.read_csv(X_test_fpath, index_col=['id']).to_numpy().astype('float64')
y_train = pd.read_csv(y_train_fpath, index_col=['id']).to_numpy().astype('float64').flatten()

In [5]:
# define some helper function as we can't use sklearn...
def normalize(X, is_train=True, columns=None, X_mean=None, X_std=None):
    if columns is None:
        columns = np.arange(X.shape[1])
    if is_train:
        X_mean = np.mean(X[:, columns], axis=0).reshape(1, -1)
        X_std = np.std(X[:, columns], axis=0).reshape(1, -1)
    X[:, columns] = (X[:, columns] - X_mean) / (X_std + eps)
    return X, X_mean, X_std # X_mean, X_std: (1 * feature_num)
def train_test_split(X, y, test_size=0.25):
    permu = np.arange(X.shape[0])#np.random.permutation(X.shape[0])
    train_size = int(X.shape[0] * (1 - test_size) + 0.5)
    return X[permu[:train_size]], X[permu[train_size:]], y[permu[:train_size]], y[permu[train_size:]]

In [6]:
# Normalize them and split into train and validate
X_train, X_mean, X_std = normalize(X_train)
X_test, _, _ = normalize(X_test, is_train=False, X_mean=X_mean, X_std=X_std)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=valid_ratio)

# Check shapes
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

(48830, 510) (5426, 510) (48830,) (5426,)


In [7]:
# helper function for training
def shuffle(X, y):
    permu = np.random.permutation(X.shape[0])
    return (X[permu], y[permu])
def sigmoid(z):
    return np.clip(1 / (1.0 + np.exp(-z)), min_clip_value, max_clip_value)
def accuracy(y_pred, y_true):
    return 1.0 - np.mean(np.abs(y_pred - y_true))
def cross_entropy_loss(y_pred, y_true):
    return -np.dot(y_true, np.log(y_pred)) - np.dot((1 - y_true), np.log(1 - y_pred))

In [8]:
class LogisticRegression:
    def __init__(self):
        self.w = None
        self.b = None
    def f(self, X, w, b):
        return sigmoid(X @ w + b)
    def predict(self, X, w, b):
        return np.round(self.f(X, w, b)).astype(np.int)
    def predict_f(self, X):
        return np.round(self.f(X, self.w, self.b)).astype(np.int)
    def gradient(self, X, y, w, b):
        y_pred = self.f(X, w, b)
        y_diff = y - y_pred
        w_grad = -np.sum(y_diff * X.T, axis=1)
        b_grad = -np.sum(y_diff)
        return (w_grad, b_grad)
    def fit(self, X_train, y_train, X_valid, y_valid, epochs=10, batch_size=10, learning_rate=0.2, print_every=1):
        train_loss, valid_loss = [], []
        train_acc, valid_acc = [], []
        w, b = np.zeros((X_train.shape[1],)), np.zeros((1,))
        step, bst, bstf, bt = 1, 0, None, 0
        mw, mb = np.zeros((X_train.shape[1],)), np.zeros((1,))
        vw, vb = np.zeros((X_train.shape[1],)), np.zeros((1,))
        b1, b2, eps = 0.9, 0.999, 1e-9
        
        for epoch in range(epochs):
            X_train, y_train = shuffle(X_train, y_train)
            for i in range(0, X_train.shape[0], batch_size):
                X = X_train[i * batch_size:(i + 1) * batch_size]
                y = y_train[i * batch_size:(i + 1) * batch_size]
        
                w_grad, b_grad = self.gradient(X, y, w, b)
            
                #mw = mw * b1 + w_grad * (1 - b1)
                #vw = vw * b2 + w_grad ** 2 * (1 - b2)
                #mhat = mw / (1 - b1)
                #vhat = vw / (1 - b2)              
                #w -= learning_rate * mhat / (np.sqrt(vhat) + eps)
                #
                #mb = mb * b1 + b_grad * (1 - b1)
                #vb = vb * b2 + b_grad ** 2 * (1 - b2)
                #mhat = mb / (1 - b1)
                #vhat = vb / (1 - b2)              
                #b -= learning_rate * mhat / (np.sqrt(vhat) + eps)
                
                w -= learning_rate / np.sqrt(step) * w_grad
                b -= learning_rate / np.sqrt(step) * b_grad
                
                step += 1
        
            y_pred = self.f(X_train, w, b)
            train_loss.append(cross_entropy_loss(y_pred, y_train) / X_train.shape[0])
            train_acc.append(accuracy(np.round(y_pred), y_train))
            
            y_pred = self.f(X_valid, w, b)
            valid_loss.append(cross_entropy_loss(y_pred, y_valid) / X_valid.shape[0])
            valid_acc.append(accuracy(np.round(y_pred), y_valid))
            
            
            if valid_acc[-1] > bst:
                bst = valid_acc[-1]
                bstf = (w, b)
                bt = epoch

            if (epoch + 1) % print_every == 0:
                print(epoch, train_loss[-1], train_acc[-1], valid_loss[-1], valid_acc[-1])
        
        self.w = bstf[0]
        self.b = bstf[1]
        print(bt)
        return train_loss, train_acc, valid_loss, valid_acc
                

In [9]:
model = LogisticRegression()

In [10]:
train_loss, train_acc, valid_loss, valid_acc = model.fit(X_train, y_train, X_valid, y_valid, epochs=100, batch_size=5, print_every=1, learning_rate=0.5)

  


0 0.7331928906198477 0.8599836166291215 0.7823241920909041 0.8514559528197567
1 0.5685348541613852 0.8685234486995699 0.6302942632377245 0.8593807593070402
2 0.5138805449951698 0.8719844357976654 0.5741013844481375 0.8639882049391817
3 0.47569254750922924 0.8705918492729879 0.5185038191588905 0.86490969406561
4 0.453281029366943 0.8731107925455662 0.4900789016452888 0.866015481017324
5 0.43514380477758535 0.8733565431087447 0.46596860240325666 0.8682270549207519
6 0.4131427945721683 0.8744828998566455 0.44717803294307956 0.8708072244747512
7 0.39748472102678767 0.8763260290804833 0.42604508219779524 0.8706229266494655
8 0.3881986243818092 0.8762031537988941 0.41857741842812207 0.8715444157758938
9 0.37385266391337413 0.877329510546795 0.4084374931303828 0.8724659049023221
10 0.3689814631679963 0.8783534712267049 0.39898214506426494 0.8719130114264652
11 0.36153854628551546 0.8763669875076797 0.39198043111945546 0.8706229266494655
12 0.3559967558533965 0.8779643661683392 0.3879161951737

In [11]:
y_test_pred = pd.DataFrame(model.predict_f(X_test).transpose())
y_test_pred.columns = ['label']
y_test_pred['id'] = range(0, X_test.shape[0])
y_test_pred = y_test_pred.reindex(columns=['id', 'label'])
y_test_pred.to_csv(y_test_fpath.format('logistic'), index=None)

  
