In [1]:
import numpy as np
import pandas as pd

In [2]:
# hyperparams
my_seed = 881003      # seed handed to np.random.seed
valid_ratio = 0.1     # hold-out fraction for the (currently disabled) validation split
# Bounds used to clip sigmoid outputs so that log(p) and log(1 - p) stay finite.
min_clip_value = 1e-20
# NOTE: 1 - 1e-20 is not representable in float64 and rounds to exactly 1.0,
# which would make the upper clip a no-op.  The gap must exceed ~1.1e-16
# for the bound to be strictly below 1.
max_clip_value = 1 - 1e-15
# Input files and the output path template ('{}' is filled with a model name).
X_train_fpath = './data/X_train'
y_train_fpath = './data/Y_train'
X_test_fpath = './data/X_test'
y_test_fpath = './output_{}.csv'

In [3]:
# Seed NumPy's global random generator with my_seed so that calls such as
# np.random.permutation give the same sequence on every run.
np.random.seed(my_seed)

In [4]:
# Load the train/test feature tables and the training labels.
# Every file is indexed by its 'id' column.
X_train = pd.read_csv(X_train_fpath, index_col=['id'])
X_test = pd.read_csv(X_test_fpath, index_col=['id'])

# Labels are flattened into a 1-D float64 vector.
y_train_frame = pd.read_csv(y_train_fpath, index_col=['id'])
y_train = y_train_frame.to_numpy().astype('float64').flatten()

In [5]:
def train_test_split(X, y, test_size=0.25):
    """Split X and y into a leading part and a trailing (held-out) part.

    Rows keep their existing order: the index vector is the identity, so no
    shuffling takes place (swap in np.random.permutation to shuffle).

    Parameters
    ----------
    X, y : arrays indexable by an integer index array along axis 0.
    test_size : fraction of rows assigned to the held-out part.

    Returns
    -------
    (X_head, X_tail, y_head, y_tail), where the head holds
    ``int(n_rows * (1 - test_size) + 0.5)`` rows.
    """
    n_rows = X.shape[0]
    order = np.arange(n_rows)
    # Round-half-up on the number of rows kept in the first part.
    cut = int(n_rows * (1 - test_size) + 0.5)
    head, tail = order[:cut], order[cut:]
    return X[head], X[tail], y[head], y[tail]

In [6]:
# preprocess
# Stack train and test so per-column statistics cover both splits.
train_size = len(X_train)
X = pd.concat([X_train, X_test])

# Group the columns by how many distinct values they take.
count = X.nunique()
one = count.loc[count == 1].index
two = count.loc[count == 2].index
three_or_more = count.loc[count >= 3].index

# Disabled experiments (constant-column removal / standardisation):
#X = X.drop(columns=one)
#X[three_or_more] = (X[three_or_more] - X[three_or_more].mean()) / X[three_or_more].std()
#X = (X - X.mean()) / X.std()

# Undo the stacking and hand plain float64 arrays to the model.
X_train = X.iloc[:train_size].to_numpy().astype('float64')
X_test = X.iloc[train_size:].to_numpy().astype('float64')

# Disabled: hold out a validation split.
#X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=valid_ratio)

In [7]:
# Sanity check: show the shapes of the training features, training labels and test features.
print(X_train.shape, y_train.shape, X_test.shape)

(54256, 510) (54256,) (27622, 510)


In [8]:
# helper function for training
def shuffle(X, y):
    """Return X and y reordered by one shared random permutation of the rows.

    The permutation is drawn from NumPy's global random state, so the result
    is reproducible after np.random.seed.  Row i of the returned X still
    corresponds to element i of the returned y.
    """
    order = np.random.permutation(X.shape[0])
    X_shuffled = X[order]
    y_shuffled = y[order]
    return (X_shuffled, y_shuffled)
def sigmoid(z):
    """Logistic function of z, clipped to [min_clip_value, max_clip_value].

    The clip bounds are module-level settings; they keep the output away from
    the extremes so that a later logarithm of the result stays finite.
    """
    activation = 1 / (1.0 + np.exp(-z))
    return np.clip(activation, min_clip_value, max_clip_value)
def accuracy(y_pred, y_true):
    """Return one minus the mean absolute difference between y_pred and y_true.

    For 0/1 predictions and labels this is the fraction of matching entries.
    """
    mean_abs_error = np.mean(np.abs(y_pred - y_true))
    return 1.0 - mean_abs_error
def cross_entropy_loss(y_pred, y_true):
    """Summed (not averaged) binary cross-entropy between y_pred and y_true.

    y_pred must lie strictly between 0 and 1 for both logarithms to be finite.
    """
    positive_term = np.dot(y_true, np.log(y_pred))
    negative_term = np.dot((1 - y_true), np.log(1 - y_pred))
    return -positive_term - negative_term

In [9]:
class GenerativeModel:
    """Two-class Gaussian generative classifier with a shared covariance matrix.

    ``fit`` estimates the per-class means and a pooled covariance, then turns
    them in closed form into a linear weight vector ``w`` and a bias ``b``.

    Orientation: ``w`` is built from ``mean_0 - mean_1``, so ``f`` scores
    class 0 and ``predict`` / ``predict_f`` return 1 where class 0 is the more
    likely label.  Class-1 labels are obtained as ``1 - predict(...)``.
    """

    def __init__(self):
        # Both stay None until fit() has been called.
        self.w = None
        self.b = None

    def f(self, X, w, b):
        """Return the clipped sigmoid of the linear score ``X @ w + b``."""
        return sigmoid(X @ w + b)

    def predict(self, X, w, b):
        """Round ``f(X, w, b)`` to 0/1 integers (1 means class 0 is more likely)."""
        # The builtin int replaces the np.int alias, which NumPy 1.24 removed.
        return np.round(self.f(X, w, b)).astype(int)

    def predict_f(self, X):
        """Same as ``predict`` but using the fitted ``self.w`` and ``self.b``."""
        return self.predict(X, self.w, self.b)

    def fit(self, X_train, y_train, X_valid=None, y_valid=None, y_Valid=None):
        """Estimate ``w`` and ``b`` in closed form and print the accuracy.

        Parameters
        ----------
        X_train : (n_samples, n_features) array of features.
        y_train : (n_samples,) array of labels; rows labelled 0 and 1 are used.
        X_valid, y_valid : optional validation features / labels; when given,
            the validation accuracy is printed as well.
        y_Valid : legacy spelling of ``y_valid``, accepted for backward
            compatibility with keyword callers.

        Raises
        ------
        ValueError
            If one of the classes has no training rows, or if ``X_valid`` is
            passed without validation labels.
        """
        if y_valid is None:
            y_valid = y_Valid
        if X_valid is not None and y_valid is None:
            raise ValueError('y_valid is required when X_valid is given')

        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)

        # Split the rows by class with boolean masks.
        X_train_0 = X_train[y_train == 0]
        X_train_1 = X_train[y_train == 1]
        n_0, n_1 = X_train_0.shape[0], X_train_1.shape[0]
        if n_0 == 0 or n_1 == 0:
            raise ValueError('both classes (0 and 1) need at least one training row')

        # In-class means.
        mean_0 = np.mean(X_train_0, axis=0)
        mean_1 = np.mean(X_train_1, axis=0)

        # Shared covariance: the size-weighted average of the two in-class
        # covariances, i.e. the summed scatter matrices divided by the total
        # row count.  One matrix product per class replaces a Python loop
        # of per-row outer products.
        centered_0 = X_train_0 - mean_0
        centered_1 = X_train_1 - mean_1
        cov = (centered_0.T @ centered_0 + centered_1.T @ centered_1) / (n_0 + n_1)

        # Invert the covariance through its SVD: cov = U diag(s) Vh, hence
        # cov^-1 = Vh.T diag(1 / s) U.T.  No singular-value cut-off is applied.
        u, s, vh = np.linalg.svd(cov, full_matrices=False)
        inv = np.matmul(vh.T / s, u.T)

        # Closed-form weights and bias.
        w = np.dot(inv, mean_0 - mean_1)
        b = (
            (-0.5) * np.dot(mean_0, np.dot(inv, mean_0))
            + 0.5 * np.dot(mean_1, np.dot(inv, mean_1))
            + np.log(n_0 / n_1)
        )

        # predict() flags class 0, so flip it to compare with the 0/1 labels.
        y_train_pred = 1 - self.predict(X_train, w, b)
        print('Training accuracy: {}'.format(accuracy(y_train_pred, y_train)))
        if X_valid is not None:
            y_valid_pred = 1 - self.predict(X_valid, w, b)
            print('Validate accuracy: {}'.format(accuracy(y_valid_pred, y_valid)))
        self.w = w
        self.b = b

In [10]:
# Create an unfitted classifier; its w and b are None until fit() is called.
model = GenerativeModel()

In [11]:
# Recorded from an earlier run with a held-out split (valid_ratio=0.1, seed 881003):
# Validate accuracy: 0.8665683744931809
# Here the model is fit on the training data without a validation set.
model.fit(X_train, y_train)

Training accuracy: 0.8712400471837216


In [12]:
# predict_f flags class 0, so flip it to obtain the 0/1 labels to submit.
test_labels = 1 - model.predict_f(X_test).transpose()

# Build the submission table with columns ordered as (id, label);
# ids are the row positions 0..n-1.
y_test_pred = pd.DataFrame(test_labels)
y_test_pred.columns = ['label']
y_test_pred['id'] = range(0, X_test.shape[0])
y_test_pred = y_test_pred[['id', 'label']]

# Write the CSV without the DataFrame index.
y_test_pred.to_csv(y_test_fpath.format('generative'), index=False)