In [4]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import StratifiedKFold
import pylab as pl
from matplotlib import collections  as mc
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Read the cleaned dataset; the "Unnamed: 0" column is used as the row index.
df_data_1 = pd.read_csv(
    filepath_or_buffer="../common/cleaned.csv",
    index_col="Unnamed: 0",
)
# Preview the first rows as the cell output.
df_data_1.head()

Unnamed: 0,LIMIT_BAL,BILL_AMT_DEC,BILL_AMT_NOV,BILL_AMT_OCT,BILL_AMT_SEP,BILL_AMT_AUG,BILL_AMT_JUL,PAY_AMT_DEC,PAY_AMT_NOV,PAY_AMT_OCT,...,PAY_JUL_-2,PAY_JUL_-1,PAY_JUL_0,PAY_JUL_2,PAY_JUL_3,PAY_JUL_4,PAY_JUL_5,PAY_JUL_6,PAY_JUL_7,PAY_JUL_8
53613503,1490.68,1494.853904,462.677258,581.961472,629.156401,599.283174,608.257067,89.4408,149.068,68.57128,...,0,0,1,0,0,0,0,0,0,0
35524089,5664.584,2946.00107,3046.174766,3137.285128,533.872135,-2.772665,-14.399969,149.068,159.83071,29.8136,...,1,0,0,0,0,0,0,0,0,0
58777007,6260.856,1761.119166,1753.934088,10.85215,-19.349026,-19.349026,-19.349026,59.6272,532.59015,16.963938,...,1,0,0,0,0,0,0,0,0,0
26913418,8944.08,88.635833,80.317838,216.744872,88.546392,70.330282,253.177091,80.556347,217.22189,88.784901,...,1,0,0,0,0,0,0,0,0,0
41993633,3577.632,29.217328,119.2544,29.8136,0.0,569.43976,0.0,119.2544,29.8136,0.0,...,0,1,0,0,0,0,0,0,0,0


In [14]:
# Given a set of model parameters (learning rate, regularization penalty coefficient), find the optimal parameters with cross validation

def cross_validation(set_of_learning_rates, set_of_regs, X_t, y_t):
    """Select the best (learning rate, regularization) pair by stratified 3-fold CV.

    Every combination of a learning rate and a regularization coefficient is
    trained with ``train_nn`` on each training fold and scored with ``predict``
    on the matching validation fold. The per-combination accuracy is averaged
    over the folds, plotted, and the combination with the highest mean accuracy
    is returned.

    Parameters
    ----------
    set_of_learning_rates : iterable of float
        Candidate learning rates.
    set_of_regs : iterable of float
        Candidate regularization coefficients.
    X_t : array
        Training features; rows are selected with the fold index arrays.
    y_t : array
        Training labels; also used to stratify the folds.

    Returns
    -------
    tuple
        ``(learning_rate, reg)`` with the highest mean validation accuracy.

    Raises
    ------
    ValueError
        If either candidate set is empty.
    """
    n_folds = 3
    # Stratified splitter: every fold keeps the class distribution of y_t.
    skf = StratifiedKFold(y_t, n_folds=n_folds)

    # Materialize the candidates so they can be counted and reused for the plot.
    learning_rates = list(set_of_learning_rates)
    regs = list(set_of_regs)
    # Each entry is (learning_rate, reg); the learning rate varies slowest.
    set_of_params = [(lr, reg) for lr in learning_rates for reg in regs]
    if not set_of_params:
        raise ValueError("cross_validation needs at least one learning rate "
                         "and one regularization value")

    print("Number of folds: " + str(n_folds))
    print("Number of parameters combinations: " + str(len(set_of_params)))

    # Running sum of validation accuracy per parameter combination.
    accuracy = np.zeros(len(set_of_params))

    # Repeat for every train-validation split.
    for cv_train_index, cv_val_index in skf:
        X_cv_train, X_cv_val = X_t[cv_train_index], X_t[cv_val_index]
        y_cv_train, y_cv_val = y_t[cv_train_index], y_t[cv_val_index]

        for param_idx, (learning_rate, reg) in enumerate(set_of_params):
            print("Combination " + str(param_idx) + ": Parameters: " + str((learning_rate, reg)))
            print("Combination " + str(param_idx) + ": Train model")
            W, b, W2, b2 = train_nn(X_cv_train, y_cv_train, learning_rate, reg)
            print("Combination " + str(param_idx) + ": Predict")
            _, accuracy_cv = predict(X_cv_val, y_cv_val, W, b, W2, b2)
            accuracy[param_idx] += accuracy_cv  # accumulate validation accuracy

    accuracy = accuracy / n_folds  # mean validation accuracy per combination

    best_idx = int(np.argmax(accuracy))
    best_learning_rate, best_reg = set_of_params[best_idx]
    print("BEST LEARNING RATE (ACCURACY) = ", best_learning_rate)
    print("BEST REG (ACCURACY) = ", best_reg)

    # Plot mean accuracy over the (regularization, learning rate) grid.
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    reg_axis = [params[1] for params in set_of_params]
    lr_axis = [params[0] for params in set_of_params]
    ax.scatter(reg_axis, lr_axis, accuracy, color="r")
    # plot_surface needs 2-D grids, so it is only drawn when both candidate
    # sets have at least two values; set_of_params is ordered learning-rate
    # major, which matches the (len(learning_rates), len(regs)) reshape.
    if len(learning_rates) > 1 and len(regs) > 1:
        reg_grid, lr_grid = np.meshgrid(regs, learning_rates)
        ax.plot_surface(reg_grid, lr_grid, accuracy.reshape(lr_grid.shape),
                        color="b", alpha=0.3)
    ax.set_xlabel('regularization')
    ax.set_ylabel('learning rate')
    ax.set_zlabel('mean CV accuracy')
    plt.show()

    return set_of_params[best_idx]

In [15]:
# Classify the input set X with the 2-layer network, given the weights and biases of each layer (W, b for the hidden layer; W2, b2 for the output layer)
def predict(X, y, W, b, W2, b2):
    """Run the forward pass of the 2-layer network and score it against ``y``.

    The hidden layer is ``max(0, X.W + b)`` and the output scores are
    ``hidden.W2 + b2``; the predicted class of each row is the index of its
    largest score. The accuracy (fraction of predictions equal to ``y``) is
    printed and returned.

    Returns
    -------
    tuple
        ``(predicted_class, accuracy)``.
    """
    # Hidden layer: affine transform followed by ReLU.
    activations = np.dot(X, W) + b
    activations = np.maximum(0, activations)
    # Output layer scores, one column per class.
    logits = np.dot(activations, W2) + b2
    labels_hat = np.argmax(logits, axis=1)
    hit_rate = np.mean(labels_hat == y)
    print(' > Prediction ACCURACY: %.2f' % (hit_rate))
    return labels_hat, hit_rate

In [16]:
# Train the 2-layer network (X_i input features, y_i input labels)
def train_nn(X_i, y_i, learning_rate, reg, hidden_size=None, num_classes=None, num_iters=50000):
    """Train the 2-layer network with full-batch gradient descent.

    The network is ``scores = max(0, X.W + b).W2 + b2`` trained on the softmax
    cross-entropy loss plus an L2 penalty ``0.5 * reg * (sum(W^2) + sum(W2^2))``.

    Parameters
    ----------
    X_i : ndarray, shape (num_examples, num_features)
        Input features.
    y_i : ndarray of int, shape (num_examples,)
        Class index of every example.
    learning_rate : float
        Step size of each gradient-descent update.
    reg : float
        L2 regularization coefficient.
    hidden_size : int, optional
        Number of hidden units. Defaults to the notebook-level ``h``.
    num_classes : int, optional
        Number of output classes. Defaults to the notebook-level ``K``.
    num_iters : int, optional
        Number of full-batch updates (default 50000).

    Returns
    -------
    tuple
        ``(W, b, W2, b2)`` — trained weights and biases of both layers.
    """
    # The feature count comes from the data itself rather than a global.
    num_examples, num_features = X_i.shape
    if hidden_size is None:
        hidden_size = h  # notebook-level hidden layer size
    if num_classes is None:
        num_classes = K  # notebook-level number of classes

    # Parameter initialization: small random weights, zero biases.
    W = 0.01 * np.random.randn(num_features, hidden_size)
    b = np.zeros((1, hidden_size))
    W2 = 0.01 * np.random.randn(hidden_size, num_classes)
    b2 = np.zeros((1, num_classes))

    example_idx = np.arange(num_examples)

    # Epochs (each one is a full-batch update).
    for i in range(num_iters):

        # Forward pass
        hidden_layer = np.maximum(0, np.dot(X_i, W) + b)
        scores = np.dot(hidden_layer, W2) + b2  # Logits

        # Shift each row by its maximum before exponentiating: softmax is
        # invariant to the shift, and it keeps np.exp from overflowing.
        shifted = scores - np.max(scores, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        sum_exp = np.sum(exp_scores, axis=1, keepdims=True)
        probs = exp_scores / sum_exp  # Turn logits into probabilities

        # -log(softmax) of the correct class, computed from the shifted logits
        # so that a vanishing probability never reaches np.log.
        corect_logprobs = np.log(sum_exp[:, 0]) - shifted[example_idx, y_i]
        data_loss = np.sum(corect_logprobs) / num_examples
        reg_loss = 0.5 * reg * np.sum(W * W) + 0.5 * reg * np.sum(W2 * W2)  # Apply regularization
        loss = data_loss + reg_loss  # Calculate loss

        # Backpropagation: gradient of the loss w.r.t. the scores
        # (probs is reused in place; it is not needed afterwards).
        dscores = probs
        dscores[example_idx, y_i] -= 1
        dscores /= num_examples

        dW2 = np.dot(hidden_layer.T, dscores)
        db2 = np.sum(dscores, axis=0, keepdims=True)

        dhidden = np.dot(dscores, W2.T)
        # ReLU passes no gradient where the unit was inactive.
        dhidden[hidden_layer <= 0] = 0

        dW = np.dot(X_i.T, dhidden)
        db = np.sum(dhidden, axis=0, keepdims=True)

        # Gradient of the L2 penalty
        dW2 += reg * W2
        dW += reg * W

        # Update parameters
        W += - learning_rate * dW
        b += - learning_rate * db
        W2 += - learning_rate * dW2
        b2 += - learning_rate * db2

    # Report the loss of the last iteration (nothing to report if none ran).
    if num_iters > 0:
        print("iteration %d: loss %f" % (i, loss))
    return W, b, W2, b2

In [17]:
# Calculate the final score for the model using the test set
def score_model(X_tr, Y_tr, X_ts, Y_ts, best_learning_rate, best_reg):
    """Train with the chosen hyperparameters and evaluate on the test set.

    The network is trained on ``(X_tr, Y_tr)`` with ``train_nn`` and scored on
    ``(X_ts, Y_ts)`` with ``predict``. The test accuracy, the predicted classes
    and the true classes are printed, and a figure is shown with predictions
    (blue) and true labels (red) per test example, joined by a green vertical
    segment.

    Parameters
    ----------
    X_tr, Y_tr : arrays
        Training features and labels.
    X_ts, Y_ts : arrays
        Test features and labels.
    best_learning_rate : float
        Learning rate used for training.
    best_reg : float
        Regularization coefficient used for training.
    """
    print("BEST LEARNING RATE (ACCURACY) = " + str(best_learning_rate))
    print("BEST REG (ACCURACY) = " + str(best_reg))

    # Train the model with the best hyperparameters
    W, b, W2, b2 = train_nn(X_tr, Y_tr, best_learning_rate, best_reg)

    # Predict test data; the second value returned by predict is the accuracy.
    predicted_class, test_accuracy = predict(X_ts, Y_ts, W, b, W2, b2)
    print("ACCURACY = " + str(test_accuracy))
    print(predicted_class)
    print(Y_ts)

    # Plot the results on one explicit Axes so that the scatter points and the
    # connecting segments share the same axes (a bare plt.axes() call may
    # create a second Axes on top of the scatter).
    fig, ax = plt.subplots()
    example_idx = np.arange(0, len(Y_ts))
    ax.scatter(example_idx, predicted_class, color='b')
    ax.scatter(example_idx, Y_ts, color='r')

    # One vertical segment per example, from the prediction to the true label.
    lines = [[(i, predicted_class[i]), (i, Y_ts[i])]
             for i in range(predicted_class.shape[0])]
    lc = mc.LineCollection(lines, colors="g", linewidths=1)

    ax.add_collection(lc)
    ax.autoscale()
    ax.margins(0.1)
    ax.grid(True)

    plt.show()

In [21]:
# Prepare the dataset & configurations for the network
# Feature matrix: every column except the target label. `axis` is passed by
# keyword because recent pandas releases no longer accept it positionally.
X = df_data_1.drop("DEFAULT PAYMENT JAN", axis=1).values
D =  X.shape[1]  # number of input features
num_examples = X.shape[0]
# Number of distinct target values, and the target vector itself.
K = len(df_data_1["DEFAULT PAYMENT JAN"].unique())
y = df_data_1["DEFAULT PAYMENT JAN"].values

# NOTE(review): step_size and reg are not referenced by the other cells shown
# in this notebook; confirm whether they are still needed.
step_size = 1e-04
reg = 1e-03

h = 50 # size of hidden layer

In [22]:
# Summarize the dataset dimensions and the network configuration.
for label, value in [
    ("Input data shape: ", X.shape),
    ("Target data shape", y.shape),
    ("Number of features: ", D),
    ("Number of classes: ", K),
    ("Number of examples: ", num_examples),
    ("Number of hidden units: ", h),
]:
    print(label, value)

Input data shape:  (26884, 87)
Target data shape (26884,)
Number of features:  87
Number of classes:  2
Number of examples:  26884
Number of hidden units:  50


In [24]:
# Split the train and test datasets

from sklearn.cross_validation import StratifiedShuffleSplit

# A single stratified shuffle split holding out 20% of the examples for
# testing; random_state is fixed so the split is repeatable.
sss = StratifiedShuffleSplit(y, n_iter=1, test_size=.20, random_state=0)

# Only one split is generated, so take it directly instead of looping.
train_index, test_index = next(iter(sss))
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

print("Train shape: " + str(X_train.shape))
print("Test shape: " + str(X_test.shape))

Train shape: (21507, 87)
Test shape: (5377, 87)


In [None]:
# Evaluate the different parameters for the model & plot

# Candidate values are listed explicitly: np.arange with a float step is
# sensitive to rounding at the end point. These are the same two values per
# grid (0.01 and 0.02) that the previous arange call produced.
set_of_learning_rates = np.array([1e-2, 2e-2])
set_of_regs = np.array([1e-2, 2e-2])

best_params = cross_validation(set_of_learning_rates, set_of_regs, X_train, y_train)

# cross_validation returns the winning pair as (learning_rate, reg).
best_learning_rate, best_reg = best_params

In [None]:
# Calculate the final score for the model
score_model(
    X_tr=X_train,
    Y_tr=y_train,
    X_ts=X_test,
    Y_ts=y_test,
    best_learning_rate=best_learning_rate,
    best_reg=best_reg,
)