# Packages

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis 
from tqdm import tqdm_notebook as tqdm
from copy import deepcopy
from time import time

import sys

import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset,DataLoader
import torch.optim as optim
from google.colab import files
import os 
from torchvision import transforms

import xgboost as xgb
import sklearn.metrics 
from sklearn.model_selection import train_test_split
import warnings
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint, uniform

from collections import OrderedDict

In [0]:
# Device configuration: flip GPU to False to force CPU execution.
GPU = True
device_idx = 0
use_cuda = True

# Use the requested CUDA device only when GPU mode is on AND CUDA is present;
# every other combination falls back to the CPU.
device = torch.device(
    "cuda:" + str(device_idx) if GPU and torch.cuda.is_available() else "cpu"
)
print(device)

cuda:0


# Dataset

In [0]:
# Mount Google Drive so the project files under "gdrive/My Drive/Earthquakes"
# (used by the cells below) become visible to this runtime.
from google.colab import drive
drive.mount('/content/gdrive')

# NOTE(review): the sys.path tweak below is commented out, so LayerGenerator must
# already be importable from the current working directory / sys.path — confirm.
#sys.path.insert(0, "gdrive/My Drive/Earthquakes/LayerGenerator.py")
# NOTE(review): wildcard import — which names this notebook takes from
# LayerGenerator cannot be told from here; prefer explicit imports once known.
from LayerGenerator import *

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Handle on the full training CSV.
path = "gdrive/My Drive/Earthquakes/train.csv"
file = open(path,"r")

# The seven split parts: build the paths once, then open one handle per part.
paths = ["gdrive/My Drive/Earthquakes/Split_Files/tempfile.part.0"+str(part_no) for part_no in range(7)]
files = [open(part_path,"r") for part_path in paths]

# Presumably the total number of rows in train.csv -- TODO confirm.
rows = 629145480

# CNN Architecture

In [0]:
# import model class
from CNN_model import CNN_model

In [0]:
# Initialise the model and push a dummy batch through it as a quick bug check:
# 5 single-channel sequences of 150000 random integer-valued samples in [-15, 15).
x = torch.randint(low=-15, high=15, size=(5, 1, 150000)).to(torch.float32)
model = CNN_model()
print(model(x))

tensor([[0.0000],
        [0.0634],
        [0.0542],
        [0.0000],
        [0.2513]], grad_fn=<ReluBackward0>)


# Training Set-up

## Data Loading

In [0]:
def load_data(d, chunk_rows=50000000):
    """Read one tranche of the training CSV into a DataFrame.

    Parameters
    ----------
    d : dict
        ``d["file"]`` is a path or open file object accepted by ``pd.read_csv``;
        ``d["start"]`` is the number of leading lines to skip.
    chunk_rows : int, optional
        Maximum number of rows to read when ``d["start"]`` is 0 or 1 (the
        first tranche of a file). For any other start the file is read to
        its end.

    Returns
    -------
    pandas.DataFrame
        Two columns named ``"X"`` and ``"y"``.
    """
    print("Loading training data... ", end="")
    start_time = time()

    # Only a tranche starting at the top of the file is capped; a later
    # tranche simply runs to the end of the file.
    nrows = chunk_rows if d["start"] in (0, 1) else None
    data = pd.read_csv(d["file"], skiprows=d["start"], nrows=nrows, names=["X", "y"])

    print("Done! (time: {:.1f}s)".format(time()-start_time))
    return data

def sample_batch(data, batch_size, unit=150000):
    """Draw ``batch_size`` random contiguous windows from ``data``.

    Each window is ``unit`` consecutive rows. The ``X`` column of the window is
    standardised (window mean subtracted, divided by the window's sample
    standard deviation) and the target is the ``y`` value of the window's
    last row.

    Parameters
    ----------
    data : pandas.DataFrame
        Must expose columns ``X`` and ``y`` and contain at least ``unit`` rows.
    batch_size : int
        Number of windows to sample.
    unit : int, optional
        Window length in rows (default 150000).

    Returns
    -------
    (torch.FloatTensor, torch.FloatTensor)
        Inputs of shape ``(batch_size, unit)`` and targets of shape
        ``(batch_size, 1)``.

    Raises
    ------
    ValueError
        If ``data`` has fewer than ``unit`` rows.
    """
    n = data.shape[0]
    if n < unit:
        raise ValueError(
            "data has {} rows but a window needs {}".format(n, unit)
        )

    batches_X = np.empty((batch_size, unit))
    batches_y = np.empty((batch_size, 1))

    for i in range(batch_size):
        # randint's upper bound is exclusive, so n - unit + 1 makes every valid
        # start (0 .. n - unit) reachable, including the final window.
        start = np.random.randint(0, n - unit + 1)
        obs = data.iloc[start:start + unit]

        X = obs.X
        mean = X.mean(axis=0)
        std = X.std(axis=0)
        # A constant window has std == 0 (and a one-row window has NaN std);
        # dividing by it would fill the batch with NaN/inf, so fall back to 1.
        if not std > 0:
            std = 1.0

        batches_X[i, :] = (X - mean) / std
        batches_y[i, 0] = obs.y.iloc[-1]

    X = torch.from_numpy(batches_X).float()
    y = torch.from_numpy(batches_y).float()
    return X, y

## Training function

In [0]:
def train(inputs, targets, model, optimizer):
    """Run one optimisation step on a single batch and return its L1 loss.

    Parameters
    ----------
    inputs : torch.Tensor
        Batch of sequences; reshaped to ``(-1, 1, inputs.shape[-1])`` so the
        last dimension is kept as the sequence length and a single channel
        dimension is inserted.
    targets : torch.Tensor
        Regression targets, compared against the model output with L1 loss.
    model : torch.nn.Module
        Network to update; must have at least one parameter.
    optimizer : torch.optim.Optimizer
        Optimizer bound to ``model``'s parameters.

    Returns
    -------
    torch.Tensor
        Scalar loss for this batch, detached from the autograd graph.
    """
    # Reset gradients accumulated by the previous step.
    optimizer.zero_grad()

    # Put the batch wherever the model lives, and take the sequence length
    # from the batch itself, so the step does not rely on notebook-level
    # globals being defined before it is called.
    model_device = next(model.parameters()).device
    inputs = inputs.view(-1, 1, inputs.shape[-1]).to(model_device)
    targets = targets.to(model_device)

    # Forward pass.
    outputs = model(inputs)

    # Mean absolute error over the batch (already a scalar).
    loss = F.l1_loss(outputs, targets)

    # Backward pass.
    loss.backward()

    # Update the model parameters from the computed gradients.
    optimizer.step()

    torch.cuda.empty_cache()

    # Detach so callers that keep the value do not hold the graph alive.
    return loss.detach()

In [0]:
def validate(model):
    """Evaluate ``model`` on a held-out slice and return a summed L1 loss divided by 10.

    NOTE(review): this function does not match the ``load_data`` defined in this
    notebook. That function takes a single dict argument and returns one
    DataFrame, whereas the call below passes five positional arguments and
    unpacks three return values. It looks like it targets an older
    ``load_data`` signature and would raise ``TypeError`` as written -- confirm
    and rework before relying on validation results.

    Parameters
    ----------
    model : callable
        Called on ``X.view(-1, 1, 150000)``; presumably the CNN being trained.

    Returns
    -------
    float
        Sum of absolute errors over the evaluated segments, divided by 10.
    """

    print("Loading validation data... ", end="")
    
    # Reads the module-level ``file`` handle; the meaning of the numeric
    # arguments cannot be told from the current load_data -- TODO confirm.
    X, y, _ = load_data(file, 6000000, 6000000+1500001, 1500000, 0)
    # Keep only the target at the last position of each 150000-long segment
    # (indices 149999, 299999, ...).
    y_idx = np.array(range(150000-1,y.shape[0],150000))
    y = y[y_idx]
    
    print("Done!")
    val_loss = 0.0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        # NOTE(review): X and y are not moved to ``device`` here, unlike in
        # ``train`` -- confirm this works when the model is on the GPU.
        outputs = model(X.view(-1,1,150000))
        val_loss += F.l1_loss(outputs.view(-1), y, reduction="sum").item()

    # Presumably averages over 10 segments (1500000 / 150000) -- TODO confirm.
    val_loss /= 10

    print("Validation set: Average loss: {:.4f}\n".format(val_loss))
    return val_loss

In [0]:
def train_round(d_load,model,optimizer,batch_size,batches,interval):
    """Train on one tranche of data and return the per-batch losses.

    Loads the tranche described by ``d_load`` via ``load_data``, then for each
    of ``batches`` iterations samples a random batch with ``sample_batch`` and
    performs one ``train`` step. After every ``interval`` batches the mean loss
    and elapsed time for that window are printed.

    Returns a list with one float loss per batch.
    """
    data = load_data(d_load)

    losses = []

    # Running statistics for the current reporting window.
    window_total = 0
    window_count = 0
    window_began = time()
    first_in_window = 0

    for batch_id in range(batches):
        inputs, targets = sample_batch(data, batch_size)
        batch_loss = train(inputs, targets, model, optimizer).item()

        losses.append(batch_loss)
        window_total += batch_loss
        window_count += 1

        # Report only when a full window of `interval` batches has completed.
        if (batch_id + 1) % interval != 0:
            continue

        print("Batches: {:0>3d} to {:0>3d} -- Loss: {:.4f} -- Time: {:.2f}s".format(
            first_in_window + 1,
            batch_id + 1,
            round(window_total / window_count, 4),
            time() - window_began,
        ))
        window_total = window_count = 0
        window_began = time()
        first_in_window = batch_id + 1

    # Drop the reference to the tranche before returning.
    del data

    return losses

# Training Process

Pseudo-code:
1. Load 10% of data to RAM.
2. Total random batches = enough to cover all data (balance between batch size and batches)
Train - accepts batch, does training round, returns loss
3. Re-mount to drive

## Training set-up

In [0]:
# Sampling schedule: windows per batch, batches per tranche, and how often
# (in batches) progress is reported.
batch_size, batches, interval = 16, 128, 16

# Checkpoint location on the mounted drive.
model_save_name = "CNN v3.pt"
path = F"gdrive/My Drive/Earthquakes/{model_save_name}"

# Fresh model; its initial weights are written out immediately.
model = CNN_model().to(device)
torch.save(model.state_dict(),path)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.1)

params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total number of parameters is: {:,}".format(params))

# 13 tranche descriptors: tranche i lives in file i // 2 and starts at line
# offset (i % 2) * 50000000.
ds = [
    {"file_no": file_no, "start": half * 50000000}
    for file_no, half in (divmod(tranche, 2) for tranche in range(13))
]

# The very first tranche skips one leading line (presumably the CSV header).
ds[0]["start"]=1

Total number of parameters is: 16,843,361


In [0]:
# Samples per training segment.
unit = 150000
# One list of per-batch losses per training round.
losses = []
for epoch in range(10):
    # Start each epoch from the most recently saved checkpoint.
    model.load_state_dict(torch.load(path))
    for i, d_load in enumerate(ds):
        print("\n~~~~~~~~ Training round {}.{} ~~~~~~~~".format(epoch+1,i+1))
        # Open only the part file this round needs and close it as soon as the
        # round is done, instead of opening all seven parts every round and
        # leaving the handles open.
        part_path = "gdrive/My Drive/Earthquakes/Split_Files/tempfile.part.0"+str(d_load["file_no"])
        with open(part_path,"r") as part_file:
            # Pass a copy carrying the handle so the shared descriptors in
            # `ds` never end up holding a closed file.
            l = train_round(dict(d_load, file=part_file),model,optimizer,batch_size,batches,interval)
        losses.append(l)
    # Checkpoint after every full pass over the tranches.
    torch.save(model.state_dict(),path)


~~~~~~~~ Training round 1.1 ~~~~~~~~
Loading training data... Done! (time: 13.4s)
Batches: 001 to 016 -- Loss: 5.0000 -- Time: 11.74s
Batches: 017 to 032 -- Loss: 4.4933 -- Time: 11.82s
Batches: 033 to 048 -- Loss: 4.1864 -- Time: 11.81s
Batches: 049 to 064 -- Loss: 3.9123 -- Time: 11.84s
Batches: 065 to 080 -- Loss: 3.6099 -- Time: 11.70s
Batches: 081 to 096 -- Loss: 3.4740 -- Time: 11.71s
Batches: 097 to 112 -- Loss: 3.1198 -- Time: 11.65s
Batches: 113 to 128 -- Loss: 3.4828 -- Time: 11.61s

~~~~~~~~ Training round 1.2 ~~~~~~~~
Loading training data... Done! (time: 15.9s)
Batches: 001 to 016 -- Loss: 4.7239 -- Time: 11.71s
Batches: 017 to 032 -- Loss: 4.0131 -- Time: 11.77s
Batches: 033 to 048 -- Loss: 3.7858 -- Time: 11.85s
Batches: 049 to 064 -- Loss: 3.1187 -- Time: 12.02s
Batches: 065 to 080 -- Loss: 3.1535 -- Time: 11.75s
Batches: 081 to 096 -- Loss: 2.6458 -- Time: 11.67s
Batches: 097 to 112 -- Loss: 3.1060 -- Time: 11.64s
Batches: 113 to 128 -- Loss: 2.5523 -- Time: 11.66s

~

KeyboardInterrupt: ignored

## Comments and results

The model works reasonably well, but there are some issues that need to be addressed: 

* Due to the extremely high volume of data and the import methodology, after being done with training on a tranche of the data the model seems to overfit on it - when moving to the next tranche the loss usually starts higher before dropping back to the long-term mean. This is a non-trivial issue and means another methodology of streaming the data to the model needs to be devised.

* On some tranches the model does significantly better than on others, up to 3 times better. We need to visualise the data and model output to form an idea of why this is.

* Validation is not yet implemented properly - while at this stage we're more interested in identifying a model with the explanatory capacity required to achieve the desired level of accuracy (even if it overfits), we will still need to implement proper validation.

* The model would score somewhere in the middle of the Kaggle ranking with the results achieved here. We have implemented and trained better models (which will be added here after the competition ends); however, the major challenge remains identifying a model with the capacity to model the process sufficiently well while still running on the limited processing power we have available using free resources.