In [11]:
import numpy as np
import pandas as pd
import sys
import math
dim =106

In [12]:
def load_data():
    x_train = pd.read_csv('X_train')
    x_test = pd.read_csv('X_test')

    x_train = x_train.values
    x_test = x_test.values

    y_train = pd.read_csv('Y_train', header = None)
    y_train = y_train.values
    y_train = y_train.reshape(-1)

    return x_train, y_train, x_test

def sigmoid(z):
    res = 1 / (1.0 + np.exp(-z))
    return np.clip(res, 1e-6, 1-1e-6)

In [35]:
def train(x_train, y_train):
    cnt1 = 0
    cnt2 = 0
    
    mu1 = np.zeros((dim,))
    mu2 = np.zeros((dim,))
    
    for i in range(x_train.shape[0]):
        if y_train[i] == 1:
            cnt1 += 1
            mu1 += x_train[i]
        else:
            cnt2 += 1
            mu2 += x_train[i]
    mu1 /= cnt1
    mu2 /= cnt2

    sigma1 = np.zeros((dim,dim))
    sigma2 = np.zeros((dim,dim))
    for i in range(x_train.shape[0]):
        if y_train[i] == 1:
            sigma1 += np.dot(np.transpose([x_train[i] - mu1]), [(x_train[i] - mu1)])
        else:
            sigma2 += np.dot(np.transpose([x_train[i] - mu2]), [(x_train[i] - mu2)])
    sigma1 /= cnt1
    sigma2 /= cnt2

    
    share_sigma = (cnt1 / x_train.shape[0]) * sigma1 + (cnt2 / x_train.shape[0]) * sigma2
    return mu1, mu2, share_sigma, cnt1, cnt2

In [36]:
def predict(x_test, mu1, mu2, share_sigma, N1, N2):
    sigma_inverse = np.linalg.inv(share_sigma)
    print(sigma_inverse)
    w = np.dot( (mu1-mu2), sigma_inverse)
    #print("w=",w)
    b = (-0.5) * np.dot(np.dot(mu1.T, sigma_inverse), mu1) + (0.5) * np.dot(np.dot(mu2.T, sigma_inverse), mu2) + np.log(float(N1)/N2)

    z = np.dot(w, x_test.T) + b
    pred = sigmoid(z)
    return pred

In [37]:
if __name__ == '__main__':
    x_train,y_train,x_test = load_data()
    
    mu1, mu2, shared_sigma, N1, N2 = train(x_train, y_train)

    
    y = predict(x_test, mu1, mu2, shared_sigma, N1, N2)
    
    y = (np.around(y))
    y = np.array(y,dtype = int)
    
    with open('TAcode_generative_model.csv', 'w') as f:
        print('id,label', file=f)
        for (i, p) in enumerate(y) :
            print('{},{}'.format(i+1, p), file=f)
    
    #predict x_test

[[ 9.42393826e-03  5.49352286e-08 -3.23540064e-03 ...  1.16008222e-01
   1.16826759e-01  1.12250880e-01]
 [ 5.49352286e-08  9.53007087e-11 -1.20713754e-06 ... -2.16024337e-05
  -2.30136203e-05 -2.06118204e-05]
 [-3.23540064e-03 -1.20713754e-06  9.12072201e+00 ... -8.83955987e+00
  -8.23395892e+00 -8.86549864e+00]
 ...
 [ 1.18368886e-01 -2.15809713e-05 -8.83206079e+00 ... -3.27336907e+14
  -3.27336907e+14 -3.27336907e+14]
 [ 1.15906143e-01 -2.30194325e-05 -8.20160225e+00 ... -3.27336907e+14
  -3.27336907e+14 -3.27336907e+14]
 [ 1.11840580e-01 -2.06161798e-05 -8.83035809e+00 ... -3.27336907e+14
  -3.27336907e+14 -3.27336907e+14]]
