This is the final Kaggle version.\
This notebook has only the final model. To see why we decided to run this model, please see the other notebooks.

In [7]:
import os
import pickle
import random
import re
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import log_loss
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
# Instantly make your loops show a smart progress meter - just wrap any iterable with tqdm(iterable), and you're done!
from tqdm import tqdm

In [8]:
# Load the data.
# On Kaggle the competition files are mounted under ../input/lish-moa; when that
# directory is absent (e.g. a local run) the CSVs are read from the working
# directory instead, so the same cell runs unchanged in both environments.
KAGGLE_DATA_DIR = '../input/lish-moa'
DATA_DIR = KAGGLE_DATA_DIR if os.path.isdir(KAGGLE_DATA_DIR) else '.'

train_features_set = pd.read_csv(os.path.join(DATA_DIR, 'train_features.csv'))
train_labels_set = pd.read_csv(os.path.join(DATA_DIR, 'train_targets_scored.csv'))
test_features = pd.read_csv(os.path.join(DATA_DIR, 'test_features.csv'))

In [9]:
# One seed shared by every random number generator the notebook touches.
SEED = 1

# Seed Python, NumPy and PyTorch (CPU, current CUDA device, all CUDA devices), in that order.
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(SEED)

# Ask cuDNN for deterministic kernels and switch off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [10]:
# Build the model inputs from the raw frames.
# Control observations ('ctl_vehicle') are removed from the training data only;
# no train/validation split is performed in this cell.

# Remove control observations.
# The mask is built from train_features_set (the frame loaded above) and applied
# positionally to the labels, which are assumed to be row-aligned with the features.
is_treated = (train_features_set['cp_type'] != 'ctl_vehicle').to_numpy()
train_features_without_control = train_features_set.loc[is_treated].reset_index(drop = True)
train_labels_without_control = train_labels_set.loc[is_treated].reset_index(drop = True)

# Make the row-alignment assumption explicit: features and labels must describe the same samples.
assert (train_features_without_control['sig_id'] == train_labels_without_control['sig_id']).all(), \
    "train features and labels are not aligned on sig_id"

# For the test dataset, we will remove just the column 'cp_type'.
# However, we will not delete the observations.
# Instead, we will create a separate data frame with 'cp_type' so we can set all MoA equal to zero for these observations.
test_features_without_control = test_features.drop(['cp_type'], axis = 1).copy()
test_features_control = test_features[['sig_id', 'cp_type']].copy()

# Drop the identifier (and, for train, the now-constant-free 'cp_type') so only model features remain.
train_x = train_features_without_control.drop(['sig_id', 'cp_type'], axis = 1).copy()
test_x = test_features_without_control.drop(['sig_id'], axis = 1).copy()
test_x_control = test_features_control.copy()

train_y = train_labels_without_control.drop(['sig_id'], axis = 1).copy()

# Encode 'cp_dose' numerically: keep the digit 1 or 2 found in the string and store it as int8.
# expand = False yields a Series, so the assignment targets a single column unambiguously.
train_x['cp_dose'] = train_x['cp_dose'].str.extract(r"([1-2])", expand = False).astype(np.int8)
test_x['cp_dose'] = test_x['cp_dose'].str.extract(r"([1-2])", expand = False).astype(np.int8)

In [None]:
# Auxiliary functions for our model.
# Important: the loss function is F.binary_cross_entropy; both an SGD and an Adam
# training routine are defined below.
def log_loss_mean(y_real, predictions):
    """Return the mean, over target columns, of the per-column log loss.

    Args:
        y_real: 2-D array-like or DataFrame of true labels, one column per target.
        predictions: 2-D array-like or DataFrame of predicted probabilities with
            the same column keys as ``y_real`` once both are viewed as DataFrames.

    Returns:
        The arithmetic mean of ``log_loss`` computed column by column.
    """
    y_real_db = pd.DataFrame(data = y_real)
    # Normalize predictions the same way as the labels, so a plain ndarray is
    # indexed by column (not by row) in the loop below.
    predictions_db = pd.DataFrame(data = predictions)

    # labels = [0, 1] keeps log_loss defined for columns that contain a single class.
    per_target_loss = [
        log_loss(y_real_db[col].astype(float), predictions_db[col].astype(float), labels = [0, 1])
        for col in y_real_db
    ]

    return np.mean(per_target_loss)

# Background, from: https://github.com/Varal7/ml-tutorial/blob/master/Part2.ipynb
#
# Most ML applications use mini-batch stochastic gradient descent rather than pure
# stochastic gradient descent. Mini-batch SGD sits between full gradient descent and
# SGD: it averages the gradient over a small number of examples.
#
# Given n examples:
#   Full GD:        dL/dw = average over all n examples.       One step per n examples.
#   SGD:            dL/dw = point estimate over one example.   n steps per n examples.
#   Mini-batch SGD: dL/dw = average over m << n examples.      n / m steps per n examples.
#
# Advantages of mini-batch SGD include a more stable gradient estimate and
# computational efficiency on modern hardware (exploiting parallelism gives
# sub-linear to constant time complexity, especially on GPU).
#
# In PyTorch, batched tensors are represented as just another dimension. Most deep
# learning modules assume batched tensors as input (even if the batch size is just 1).
#
# Code from: MITx: 6.86x (Project 3)
def batchify_data(x_data, y_data, batch_size):
    """Group data points and labels into a list of equally sized batches.

    Each batch is a dict with a float32 tensor under 'x' and a long tensor under 'y'.
    Only complete batches are produced; trailing rows that do not fill a batch are dropped.
    """
    full_batches = int(len(x_data) / batch_size)
    offsets = (k * batch_size for k in range(full_batches))
    return [
        {
            'x': torch.tensor(x_data[lo:lo + batch_size], dtype=torch.float32),
            'y': torch.tensor(y_data[lo:lo + batch_size], dtype=torch.long),
        }
        for lo in offsets
    ]

def compute_accuracy(predictions, y):
    """Return the fraction of entries where predictions equal the gold labels y."""
    predicted, expected = (tensor.detach().numpy() for tensor in (predictions, y))
    return np.equal(predicted, expected).mean()


# Training Procedure
def _run_training(train_data, dev_data, model, optimizer, n_epochs, save_model):
    """Epoch loop shared by train_model_SGD and train_model_Adam."""
    # Validation is skipped when no dev batches are supplied (None or an empty sequence).
    has_dev_data = dev_data is not None and (not hasattr(dev_data, '__len__') or len(dev_data) > 0)

    # Epochs are numbered 1..n_epochs inclusive, so exactly n_epochs passes are made.
    for epoch in range(1, n_epochs + 1):
        print("-------------\nEpoch {}:\n".format(epoch))

        # Run **training**
        train_loss, train_acc, _ = run_epoch(train_data, model.train(), optimizer)
        print('Train loss: {:.6f} | Train accuracy: {:.6f}'.format(train_loss, train_acc))

        # Run **validation**
        # The model is switched to eval mode before saving whether or not validation runs.
        model.eval()
        if has_dev_data:
            # No parameter update happens here, so autograd bookkeeping is not needed.
            with torch.no_grad():
                dev_loss, dev_acc, _ = run_epoch(dev_data, model, optimizer)
            print('Val loss:   {:.6f} | Val accuracy:   {:.6f}'.format(dev_loss, dev_acc))

        # Save model (overwrites the same file every epoch)
        torch.save(model, save_model + '.pt')


def train_model_SGD(train_data, dev_data, model, lr=0.01, momentum=0.9, nesterov = False, n_epochs=30, save_model = 'FNN_SGD'):  # If we use SGD.
    """Train a model with SGD for n_epochs epochs given data and hyper-params.

    Args:
        train_data: iterable of batches (dicts with 'x' and 'y') used for the updates.
        dev_data: iterable of batches evaluated after every epoch; None or an empty
            sequence skips validation.
        model: the torch module to optimize in place.
        lr, momentum, nesterov: forwarded to torch.optim.SGD.
        n_epochs: number of passes over train_data.
        save_model: file name stem; the model is saved to '<save_model>.pt' after each epoch.

    Returns:
        None. The model is updated in place and written to disk.
    """
    # We optimize with SGD
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum, nesterov=nesterov)
    _run_training(train_data, dev_data, model, optimizer, n_epochs, save_model)
    return


def train_model_Adam(train_data, dev_data, model, lr = 0.01, weight_decay = 1e-5, n_epochs=30, save_model = 'FNN_Adam'):  # If we use Adam.
    """Train a model with Adam for n_epochs epochs given data and hyper-params.

    Args:
        train_data: iterable of batches (dicts with 'x' and 'y') used for the updates.
        dev_data: iterable of batches evaluated after every epoch; None or an empty
            sequence skips validation.
        model: the torch module to optimize in place.
        lr, weight_decay: forwarded to torch.optim.Adam.
        n_epochs: number of passes over train_data.
        save_model: file name stem; the model is saved to '<save_model>.pt' after each epoch.

    Returns:
        None. The model is updated in place and written to disk.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr = lr, weight_decay = weight_decay)
    _run_training(train_data, dev_data, model, optimizer, n_epochs, save_model)
    return

def run_epoch(data, model, optimizer):
    """Run one pass over `data` and return (mean loss, mean accuracy, last output).

    The model is updated with `optimizer` only when it is in training mode
    (`model.training` is True); otherwise the pass is evaluation-only and no
    gradients are tracked.

    Args:
        data: iterable of batches, each a dict with tensors under 'x' and 'y'.
        model: torch module whose outputs are probabilities (they are fed to
            F.binary_cross_entropy and rounded to obtain predictions).
        optimizer: optimizer stepping the model's parameters; unused in eval mode.

    Returns:
        Tuple of the mean batch loss, the mean batch accuracy and the model output
        for the last batch. For empty `data` this is (nan, nan, None).
    """
    # Gather losses
    losses = []
    batch_accuracies = []
    out = None  # stays None if `data` yields no batches

    # If model is in train mode, use optimizer.
    is_training = model.training

    # Iterate through batches
    for batch in tqdm(data):
        # Grab x and y
        x, y = batch['x'], batch['y']

        # Track gradients only when an update will follow.
        with torch.set_grad_enabled(is_training):
            # Get output predictions
            out = model(x)
            y = y.type_as(out)

            # Compute loss
            loss = F.binary_cross_entropy(out, y)

        # Predict (round probabilities to 0/1) and store accuracy
        predictions = torch.round(out)
        batch_accuracies.append(compute_accuracy(predictions, y))
        losses.append(loss.item())

        # If training, do an update.
        if is_training:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Nothing to average for an empty pass.
    if not losses:
        return float('nan'), float('nan'), out

    # Calculate epoch level scores
    avg_loss = np.mean(losses)
    avg_accuracy = np.mean(batch_accuracies)
    return avg_loss, avg_accuracy, out


In [None]:
# Final model: 1 hidden layer, trained with SGD
start = time.time()

# Hyperparameters
n_epochs = 290
batch_size = 32
lr = 0.1
momentum = 0
validation_fraction = 0.2    # share of the labelled rows held out to monitor training

# train_model_SGD needs labelled validation batches and the test set has no labels,
# so the validation data is held out from the labelled training data.
X_train, X_val, Y_train, Y_val = train_test_split(
    train_x.to_numpy(copy = True),
    train_y.to_numpy(copy = True),
    test_size = validation_fraction,
    random_state = SEED,
)
X_dev = test_x.to_numpy(copy = True)    # unlabelled test features, used only for the submission

train_batches = batchify_data(X_train, Y_train, batch_size)
val_batches = batchify_data(X_val, Y_val, batch_size)

# Full (un-batched) tensors for the final predictions.
train_batches_full_base = torch.tensor(X_train, dtype=torch.float32)
val_batches_full_base = torch.tensor(X_val, dtype=torch.float32)
dev_batches_full_base = torch.tensor(X_dev, dtype=torch.float32)


#################################
## Model specification
n_features = X_train.shape[1]
n_hidden = int(0.5*n_features)
n_targets = Y_train.shape[1]    # one output per target column
model = nn.Sequential(
                    nn.Linear(n_features, n_hidden),
                    nn.ReLU6(),
                    nn.Linear(n_hidden, n_targets),
                    nn.Sigmoid(),  #So we have probabilities at the end
                )
##################################

#Train model
train_model_SGD(train_batches, val_batches, model, lr = lr, momentum = momentum, nesterov = False, n_epochs = n_epochs, save_model = 'FNN_layer_1')

# Get output.
# Inference only: eval mode and no autograd bookkeeping.
model.eval()
with torch.no_grad():
    out_train = model(train_batches_full_base).numpy()
    out_val = model(val_batches_full_base).numpy()
    out_dev = model(dev_batches_full_base).numpy()

out_train = pd.DataFrame(data = out_train)
out_val = pd.DataFrame(data = out_val)

# Adjust out_dev for the control group:
# rows whose cp_type is not 'trt_cp' get every prediction set to zero.
treatment_flag = (test_x_control['cp_type'] == 'trt_cp').to_numpy()[:, np.newaxis]
out_dev = pd.DataFrame(data = out_dev*treatment_flag)

print()
print('Log loss train:', log_loss_mean(Y_train, out_train))
print('Log loss dev:', log_loss_mean(Y_val, out_val))

#Add columns name
out_dev.columns = train_y.columns
#Add column 'sig_id'
out_dev = test_x_control[['sig_id']].join(out_dev)
#Submit
out_dev.to_csv('submission.csv', index = False)

# Scores recorded from an earlier run:
# Log loss train: 0.013825678791504065
# Log loss dev: 0.01545215945048286    (Considering 20% of the train database)
# Log loss LB: 0.01972

end = time.time()
elapsed = end - start
print()
print('elapsed time:',elapsed)