# 0 - Import packages:

In [1]:
import pandas as pd
import numpy as np
import pylab as pl
from matplotlib import collections  as mc
from sklearn.model_selection import StratifiedKFold
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.metrics import f1_score

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 5.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading extenrnal modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

# 1 - Load data:

## 1.1 - Train set: 

In [2]:
df_data_1 = pd.read_csv("../common/albertom/train_test_val_split/X_train_pca.csv", index_col="Unnamed: 0")
# df_data_1 = df_data_1.head(500)
print(df_data_1.shape)
df_data_1.head()

(19423, 15)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,-18.3417,-1.499321,-1.355711,4.085164,0.507722,1.323943,-1.158738,3.239028,-0.351744,4.684859,-1.288307,-0.614557,-1.519203,1.271893,0.056804
1,19.524442,-2.030392,-0.674545,-3.82956,0.021374,-3.27542,1.350335,1.898675,0.694699,-2.756291,-0.794836,1.239952,1.98759,1.35163,0.320299
2,0.87707,-5.641901,3.048596,1.115479,0.016561,0.644258,-0.395762,0.024898,0.223658,0.744945,0.004955,-0.120828,-0.338187,0.931008,0.728083
3,-5.824436,-7.146028,2.308765,1.485002,1.753751,0.034794,0.535366,-0.180122,1.468998,2.60455,0.730246,-0.127975,0.202254,0.150643,0.239629
4,-5.021571,4.583527,-3.234845,2.983661,-2.991689,-1.813864,1.493669,-2.691452,-2.037845,2.800191,-1.10804,0.335971,-0.956513,0.223005,0.470329


## 1.2 - Train labels:

In [3]:
df_target_data_1 = pd.read_csv("../common/albertom/train_test_val_split/y_train.csv", index_col="Unnamed: 0")
# df_target_data_1 = df_target_data_1.head(500)
print(df_target_data_1.shape)
df_target_data_1.head()

(19423, 2)


Unnamed: 0,0,1
0,0,18329863
1,1,51718313
2,1,32674991
3,1,46137138
4,0,15314372


## 1.3 - Test set:

In [4]:
df_data_2 = pd.read_csv("../common/albertom/train_test_val_split/X_test_pca.csv", index_col="Unnamed: 0")
# df_data_2 = df_data_2.head(100)
print(df_data_2.shape)
df_data_2.head()

(4033, 15)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,-1.71819,-6.316477,-2.083559,0.975329,-0.296123,1.974713,0.379124,-0.007346,1.95861,-1.165543,0.759262,-0.382965,-0.584148,1.553133,0.568045
1,0.020365,-0.602705,9.139554,0.840974,0.546386,-5.230693,-0.084946,-10.231455,-0.852961,0.45118,0.669972,-0.829123,1.02423,0.229725,0.039168
2,9.975737,-3.348945,-0.572407,0.087099,0.788437,0.297061,0.889188,0.581228,-0.122152,-0.471265,0.488114,-0.081485,-0.172004,-0.577762,-0.305729
3,-22.105474,0.258111,0.720882,-3.436382,-1.524025,0.087214,-0.444141,0.608868,1.049485,0.409755,-0.329483,-0.048181,-0.009138,-0.52719,1.436177
4,-14.011642,2.771666,-3.670775,-0.453639,-0.30395,-1.095188,-2.580395,-1.656323,-1.11522,0.806741,-0.176124,0.27589,-0.78406,1.083728,-1.022576


## 1.4 - Test labels:

In [5]:
df_target_data_2 = pd.read_csv("../common/albertom/train_test_val_split/y_test.csv", index_col="Unnamed: 0")
# df_target_data_2 = df_target_data_2.head(100)
print(df_target_data_2.shape)
df_target_data_2.head()

(4033, 2)


Unnamed: 0,0,1
0,1,10930294
1,1,55016784
2,0,48129536
3,1,37805023
4,0,22798417


## 1.5 - Val set:

In [6]:
df_data_3 = pd.read_csv("../common/albertom/train_test_val_split/X_val_pca.csv", index_col="Unnamed: 0")
# df_data_3 = df_data_3.head(100)
print(df_data_3.shape)
df_data_3.head()

(3428, 15)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1.121868,-4.745601,-3.924026,2.321226,1.560214,0.136213,0.05843,-0.207562,0.129703,0.631257,-0.073079,-0.072595,-0.046537,-1.095496,0.02065
1,-18.892884,1.320438,2.465213,-0.32819,-2.543895,0.728472,-1.207503,1.209443,0.543729,0.205445,0.68,-0.282376,-0.080303,0.704908,0.989034
2,8.41348,-3.967846,-1.918874,0.249799,0.485784,0.726944,-0.507952,-0.138389,0.109414,0.208978,-0.092068,0.34134,-0.37666,-0.591348,0.472888
3,-10.221737,-0.903182,-1.343646,1.040087,2.489991,2.670366,-0.744474,2.248467,-0.269105,-0.410593,-1.574585,0.339117,-0.976271,0.889209,1.044498
4,-1.60157,0.028557,-10.087533,2.789941,-7.033029,-5.960131,0.69024,-2.786239,2.339409,5.522192,1.373146,0.145313,0.697186,0.479144,1.207805


## 1.6 - Val labels:

In [7]:
df_target_data_3 = pd.read_csv("../common/albertom/train_test_val_split/y_val.csv", index_col="Unnamed: 0")
# df_target_data_3 = df_target_data_3.head(100)
print(df_target_data_3.shape)
df_target_data_3.head()

(3428, 2)


Unnamed: 0,0,1
0,0,37382735
1,0,16291467
2,0,16215767
3,0,39563885
4,0,50610721


# 2 - Globals:

## 2.1 - Parameters 

In [8]:
n_folds = 3
num_steps = 3001

valid_size = df_data_3.shape[0]
test_size = df_data_2.shape[0]
batch_size = 128

distinct_labels = len(df_target_data_1["0"].unique())

train_dataset = df_data_1.values
val_dataset = df_data_3.values
test_dataset = df_data_2.values

split_labels = df_target_data_1["0"].values
train_labels = (np.arange(distinct_labels) == split_labels[:,None]).astype(np.float32)

test_labels = df_target_data_2["0"].values
test_labels = (np.arange(distinct_labels) == test_labels[:,None]).astype(np.float32)

val_labels = df_target_data_3["0"].values
val_labels = (np.arange(distinct_labels) == val_labels[:,None]).astype(np.float32)

num_features = train_dataset.shape[1]
num_examples = train_dataset.shape[0]

## 2.2 - Tensorflow graph: 

In [9]:
graph = tf.Graph()
with graph.as_default():

    # Input data.
    X = tf.placeholder(tf.float32, shape=(batch_size, num_features))
    t = tf.placeholder(tf.int32, shape=(batch_size, distinct_labels))

    L2_reg = tf.placeholder(tf.float32, shape=[])
    learning_rate = tf.placeholder(tf.float32, shape=[])
    
    X_val = tf.constant(val_dataset, tf.float32)
    X_test = tf.constant(test_dataset, tf.float32)

    # Variables.
    num_hidden1 = 10
    num_hidden2 = 7
    
    W1 = tf.Variable(tf.truncated_normal([num_features, num_hidden1]) )
    b1 = tf.Variable(tf.zeros([num_hidden1]))

    W2 = tf.Variable(tf.truncated_normal([num_hidden1, distinct_labels]))
    b2 = tf.Variable(tf.zeros([distinct_labels]))
    
    # Training.
    h1 = tf.nn.relu(tf.matmul(X, W1) + b1)
    logits = tf.matmul(h1, W2) + b2
    
    # Loss NO reg.
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, t))
    regularization = (tf.nn.l2_loss(W1) + tf.nn.l2_loss(b1) + tf.nn.l2_loss(W2) + tf.nn.l2_loss(b2))
    loss = loss + L2_reg * regularization

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)

    # Predictions for training, validation.
    train_predictions = tf.nn.softmax(logits)
    val_predictions = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(X_val, W1) + b1), W2) + b2)
    test_predictions = tf.nn.softmax(tf.matmul(tf.nn.relu(tf.matmul(X_test, W1) + b1), W2) + b2)

## 2.3 - Functions:

In [10]:
# Given a set of model parameters (learning rate, regularization penalty coefficient), find the optimal parameters with cross validation

def holdout_validation(set_of_learning_rates, set_of_regs, X_train, y_train):
    
    # Get train-validation set stratified (keeps the same distribution) splitter
    #skf = StratifiedKFold(n_splits=3)
    set_of_params = [(x,y) for x in set_of_learning_rates for y in set_of_regs]
    
    #print("Number of folds: " + str(n_folds))
    print("Number of parameters combinations: " + str(len(set_of_params)))
    print("We will train " + str(len(set_of_params)) + " neural networks for this cv task.")
    
    f1 = np.zeros(len(set_of_params))

    for param_idx, param in enumerate(set_of_params):

        print(str(param_idx) + ") Combination: [Parameters: " +  str(param) + " | Train model]")
        with tf.Session(graph=graph) as session:
            tf.global_variables_initializer().run()
                
            for step in np.arange(num_steps):

                offset = (step * batch_size) % (num_examples - batch_size)
                X_batch = X_train[offset:(offset + batch_size), :]
                t_batch = y_train[offset:(offset + batch_size)]
                feed_dict = {
                    X : X_batch,
                    t : t_batch,
                    L2_reg : param[1],
                    learning_rate : param[0]
                }
                _, l, pred_batch = session.run( [optimizer, loss, train_predictions], feed_dict=feed_dict)
                
                if (step % 500 == 0):
                    print("> Minibatch loss at step %d: %f" % (step, l))
                    print("> Training f1: %.1f%%" % score(pred_batch, t_batch))
                    print("> Validation f1: %.1f%%" % score(val_predictions.eval(), val_labels))
                
            f1_cv = score(val_predictions.eval(), val_labels)
            print("|> Final Validation f1: %.1f%%" % f1_cv)
            f1[param_idx] = f1[param_idx] + f1_cv

    f1 = f1 / n_folds # Calculate average error
    

    print(">>>>>> Best [learning rate - f1] = ", set_of_params[np.argmax(f1)][1])
    print(">>>>>> BEST [reg - f1] = ", set_of_params[np.argmax(f1)][0])

    return set_of_params[np.argmax(f1)]

In [11]:
def score(predictions, labels):
    return f1_score(np.argmax(predictions, 1), np.argmax(labels, 1))

# 3 - Training: 

## 3.1 - Cross validation for optimal hyperparameters:

In [None]:
set_of_learning_rates = np.arange(0.01, 0.03, step=0.01)
set_of_L2_regs = np.arange(0.01, 0.03, step=0.01)

print("Holdout validation on TRAINING-SET for [learning rate, L2_reg] ...")

best_params = cross_validation(set_of_learning_rates, set_of_L2_regs, train_dataset, train_labels)

best_learning_rate = best_params[0]
best_L2_reg = best_params[1]

print("Best parameters: [" + str(best_learning_rate) + ", " + str(best_L2_reg) + "] ...")

Holdout validation on TRAINING-SET for [learning rate, L2_reg] ...
Number of parameters combinations: 4
We will train 4 neural networks for this cv task.
0) Combination: [Parameters: (0.01, 0.01) | Train model]
> Minibatch loss at step 0: 4.703604
> Training f1: 0.4%
> Validation f1: 0.3%
> Minibatch loss at step 500: 1.082022
> Training f1: 0.3%
> Validation f1: 0.0%
> Minibatch loss at step 1000: 0.945582
> Training f1: 0.1%
> Validation f1: 0.0%
> Minibatch loss at step 1500: 0.941324
> Training f1: 0.1%
> Validation f1: 0.1%
> Minibatch loss at step 2000: 0.924558
> Training f1: 0.1%
> Validation f1: 0.0%
> Minibatch loss at step 2500: 0.870820
> Training f1: 0.2%
> Validation f1: 0.1%
> Minibatch loss at step 3000: 0.851166
> Training f1: 0.1%
> Validation f1: 0.1%
|> Final Validation f1: 0.1%
1) Combination: [Parameters: (0.01, 0.02) | Train model]
> Minibatch loss at step 0: 19.101748
> Training f1: 0.4%
> Validation f1: 0.3%
> Minibatch loss at step 500: 1.559120
> Training f1:

## 3.2 - Prediction using optimal hyperparameters:

In [None]:
print("Prediction for TEST-SET using best parameters ...")

with tf.Session(graph=graph) as session:
    
    print("Combination: [Parameters: " +  str(lr) + ", " + str(L2_reg) + " | Train model]")
    for step in np.arange(num_steps):
                        
        offset = (step * batch_size) % (num_examples - batch_size)
        X_batch = train_dataset[offset:(offset + batch_size), :]
        t_batch = train_labels[offset:(offset + batch_size), :]
        feed_dict = {
            X : X_batch,
            t : t_batch,
            L2_reg : best_L2_reg,
            learning_rate : best_learning_rate
        }
        
        _, l, pred_batch = session.run( [optimizer, loss, train_predictions], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("> Minibatch loss at step %d: %f" % (step, l))
            print("> Training f1: %.1f%%" % score(pred_batch, t_batch))
            print("> Validation f1: %.1f%%" % score(val_predictions.eval(), val_labels))
                
    f1_test = score(test_predictions.eval(), test_labels)
    print("|> Test f1: %.1f%%" % f1_test)