<a href="https://colab.research.google.com/github/Machine-Learning-Tokyo/Kaggle/blob/master/Earthquake_Prediction/data_downloader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Code to download the data

In [0]:
# Install (or upgrade, -U) the Kaggle command-line client; -q keeps pip quiet.
!pip install -U -q kaggle
# Create the folders that hold the API token: `.kaggle` relative to the current
# working directory and `~/.kaggle` in the home directory.
# NOTE(review): a later cell writes to /content/.kaggle, so this assumes the
# working directory is /content — confirm when running outside Colab.
!mkdir -p .kaggle
!mkdir -p ~/.kaggle

In [0]:
import json
token = {"username":"dkatsios","key":"684d38a2e65bba3b1fcaecf462caae75"}
with open('/content/.kaggle/kaggle.json', 'w') as file:
    json.dump(token, file)
!cp /content/.kaggle/kaggle.json ~/.kaggle/kaggle.json

In [0]:
%%capture
!kaggle config set -n path -v{/content}
!chmod 600 /root/.kaggle/kaggle.json

In [0]:
# Download the files of the LANL-Earthquake-Prediction competition (-c) into /content (-p).
!kaggle competitions download -c LANL-Earthquake-Prediction -p /content

In [0]:
# List /content, the folder the download cell targets.
!ls /content/

In [0]:
# Extract the training archive next to itself and the test archive into `test/`.
# `&&` deletes each archive only when its extraction succeeded.
# NOTE(review): the paths are relative, so this assumes the working directory is
# the download folder (/content) — confirm when running outside Colab.
!unzip train.csv.zip && rm train.csv.zip
!unzip test.zip -d test && rm test.zip

## Code after downloading the data

In [0]:
import os
from os import listdir
from glob import glob
import csv
import pandas as pd
import numpy as np

import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Input, Conv1D, Dense, MaxPool1D, BatchNormalization, Dropout, ReLU
import keras.backend as K

In [0]:
# --- Paths -------------------------------------------------------------------
train_csv_path = 'train.csv'
test_folder = 'test/'
model_path = 'model_0.h5'
sub_csv_path = 'results_0.csv'

# --- Model hyper-parameters ---------------------------------------------------
k_size = 5                # Conv1D kernel size
n_kernels = 4             # base filter count; scaled by 2**layer_index per block
max_n_kernels = 2**11     # upper bound on the filter count of any block
n_layers = 9              # number of conv blocks
# Window length fed to the network; tied to n_layers as 2**n_layers.
chunk_size = 2**n_layers

# --- Training schedule --------------------------------------------------------
# NOTE(review): presumably the line count of train.csv — verify against the file.
total_rows = 629145481
batch_size = 256
epochs = 1
# One training step consumes `batch_size` windows of `chunk_size` rows each, so
# the number of steps available in one pass over the file is
# total_rows // (batch_size * chunk_size). The previous formula divided by
# batch_size only, which requested roughly chunk_size times more steps than the
# file can supply and made training run off the end of the CSV.
# The `- 2` margin of the original formula is retained; presumably it keeps the
# reader clear of the short final chunk — TODO confirm.
rows_per_step = batch_size * chunk_size
steps_per_epoch = (total_rows // rows_per_step) - 2

### Define the data loader objects (generators)

In [0]:
class CSVReader:
  """Iterator that streams a two-column CSV as training batches.

  The file is read lazily in chunks of `chunk_size` rows. The first column of
  each chunk provides the input values (`vs`), the second column the targets
  (`ts`). One batch stacks `batch_size` consecutive chunks.

  Args:
    csv_path: path of the CSV file to stream.
    chunk_size: number of rows per chunk (the window length of one sample).
    batch_size: number of chunks stacked into one batch.
    only_last_t: when True the target of a chunk is the second-column value of
      its last row; when False the whole second column is used.
  """

  def __init__(self, csv_path, chunk_size, batch_size, only_last_t=True):
    self.only_last_t = only_last_t
    self.chunk_size = chunk_size
    self.batch_size = batch_size

    # Lazy chunked reader; each next() on it yields the following chunk_size rows.
    self.datareader = pd.read_csv(csv_path, chunksize=chunk_size)

  def __iter__(self):
    return self

  def get_chunk(self):
    """Return `(inputs, target)` for the next chunk of the file."""
    rows = next(self.datareader).values
    signal = rows[:, 0]
    times = rows[:, 1]
    if self.only_last_t:
      return signal, times[-1]
    return signal, times

  def __next__(self):
    """Return one batch as `(vs, ts)`, each with a trailing axis of size 1.

    `vs` has shape (batch_size, chunk_size, 1); `ts` has shape
    (batch_size, 1, 1) when `only_last_t` is set, otherwise
    (batch_size, chunk_size, 1).
    """
    target_width = 1 if self.only_last_t else self.chunk_size
    vs = np.zeros((self.batch_size, self.chunk_size))
    ts = np.zeros((self.batch_size, target_width))
    for row in range(self.batch_size):
      vs[row], ts[row] = self.get_chunk()
    return vs[..., np.newaxis], ts[..., np.newaxis]
  
  
class CSVTester:
  """Iterator that streams a single-column CSV as inference batches.

  The file is read lazily in chunks of `chunk_size` rows; each complete chunk
  becomes one sample of shape (chunk_size, 1). Up to `batch_size` samples are
  stacked per batch.

  Unlike the previous implementation, the complete chunks at the end of the
  file are no longer thrown away when they do not fill a whole batch: the last
  batch is simply smaller. A trailing chunk with fewer than `chunk_size` rows
  still cannot be used (the sample length is fixed) and is dropped.

  Args:
    csv_path: path of the CSV file to stream.
    chunk_size: number of rows per chunk (the window length of one sample).
    batch_size: maximum number of chunks stacked into one batch.
  """

  def __init__(self, csv_path, chunk_size, batch_size):
    self.chunk_size = chunk_size
    self.batch_size = batch_size

    # Lazy chunked reader; each next() on it yields the following chunk_size rows.
    self.datareader = pd.read_csv(csv_path, chunksize=chunk_size)
    # Set once the end of the file is reached, so the underlying pandas reader
    # is never touched again after it has been exhausted.
    self._exhausted = False

  def __iter__(self):
    return self

  def get_chunk(self):
    """Return the next chunk of the file as a 2-D array (rows, columns)."""
    chunk = next(self.datareader).values
    return chunk

  def __next__(self):
    """Return the next batch, shape (n, chunk_size, 1) with 1 <= n <= batch_size.

    Raises:
      StopIteration: when no complete chunk is left in the file.
      ValueError: when a full-length chunk does not fit the (chunk_size, 1)
        sample slot, e.g. because the file has more than one column.
    """
    if self._exhausted:
      raise StopIteration

    batch = np.zeros((self.batch_size, self.chunk_size, 1))
    n_filled = 0
    while n_filled < self.batch_size:
      try:
        chunk = self.get_chunk()
      except StopIteration:
        self._exhausted = True
        break
      if len(chunk) != self.chunk_size:
        # pandas only yields a short chunk at the very end of the file; it
        # cannot fill a fixed-length sample, so stop here.
        self._exhausted = True
        break
      batch[n_filled] = chunk
      n_filled += 1

    if n_filled == 0:
      raise StopIteration
    return batch[:n_filled]

### Define the model

In [0]:
def get_model(chunk_size, n_kernels, max_n_kernels, k_size, n_layers):
  """Build the (uncompiled) 1-D convolutional regression model.

  The network takes inputs of shape (chunk_size, 1) and applies `n_layers`
  blocks, numbered 1..n_layers, each made of
  Conv1D -> BatchNormalization -> ReLU -> Dropout(0.5) -> MaxPool1D.
  Dropout is skipped in the last block. Block `i` uses
  min(max_n_kernels, n_kernels * 2**i) filters of size `k_size` with 'same'
  padding. A final Dense(1) layer produces the output.

  Args:
    chunk_size: length of the input window.
    n_kernels: base number of convolution filters.
    max_n_kernels: upper bound on the number of filters in any block.
    k_size: convolution kernel size.
    n_layers: number of convolutional blocks.

  Returns:
    A Keras `Model` mapping the input tensor to the Dense(1) output.
  """

  def my_block(tensor, layer_idx, use_dropout):
    # The filter count doubles with every block until it hits max_n_kernels.
    n_filters = min(max_n_kernels, n_kernels * 2 ** layer_idx)
    tensor = Conv1D(n_filters, k_size, padding='same')(tensor)
    tensor = BatchNormalization()(tensor)
    tensor = ReLU()(tensor)
    if use_dropout:
      tensor = Dropout(0.5)(tensor)
    return MaxPool1D()(tensor)

  inputs = Input((chunk_size, 1))
  features = inputs

  for layer_idx in range(1, n_layers + 1):
    # Every block except the last one uses dropout.
    features = my_block(features, layer_idx, layer_idx < n_layers)

  outputs = Dense(1)(features)
  return Model(inputs, outputs)

### Define the inference and results generation functions

In [0]:
def get_pred_for_csv(model, csv_path, chunk_size, batch_size):
  """Predict a single value for one CSV file.

  The file is streamed through `CSVTester`; the model is run on every batch
  the reader yields and the mean of all per-window predictions is returned.

  Only errors raised while *reading* a batch end the loop. Previously the
  `except ValueError` wrapped the whole loop, so a genuine ValueError coming
  from `model.predict_on_batch` (e.g. a shape mismatch) was silently swallowed
  as well.

  Args:
    model: object exposing `predict_on_batch(batch)`.
    csv_path: path of the CSV file to predict for.
    chunk_size: window length passed to `CSVTester`.
    batch_size: batch size passed to `CSVTester`.

  Returns:
    The mean prediction, or NaN (with a warning) when the file did not yield
    a single batch.
  """
  import warnings

  batches = iter(CSVTester(csv_path, chunk_size, batch_size))
  preds = []
  while True:
    try:
      batch = next(batches)
    except (StopIteration, ValueError):
      # StopIteration: the file is exhausted.
      # ValueError: the reader could not fit a chunk into its fixed-size batch
      # (e.g. a trailing chunk shorter than chunk_size), which also marks the
      # end of the usable data.
      break
    preds.extend(model.predict_on_batch(batch))

  if not preds:
    # Make the "no data" case visible instead of relying on np.mean([]).
    warnings.warn('No complete batch could be read from %s; returning NaN.' % csv_path)
    return float('nan')
  return np.mean(preds)


def produce_sub_file(test_folder, sub_csv_path, model, chunk_size, batch_size):
  """Write the submission CSV with one prediction per file in `test_folder`.

  The output starts with the header `seg_id,time_to_failure`, followed by one
  `<id>,<prediction>` row per test file, where the id is the file name without
  its `.csv` extension. Progress is shown on a single, continuously
  overwritten console line.

  Args:
    test_folder: folder containing the test CSV files.
    sub_csv_path: path of the submission file to create (overwritten).
    model: model passed on to `get_pred_for_csv`.
    chunk_size: window length passed on to `get_pred_for_csv`.
    batch_size: batch size passed on to `get_pred_for_csv`.
  """
  # os.path.join works with or without a trailing separator on test_folder;
  # sorting makes the row order reproducible (glob order is arbitrary).
  test_csv_paths = sorted(glob(os.path.join(test_folder, '*')))
  n_files = len(test_csv_paths)

  with open(sub_csv_path, 'w') as f:
    f.write('seg_id,time_to_failure')

    for i, test_csv_path in enumerate(test_csv_paths, 1):
      test_id = os.path.basename(test_csv_path)
      # Strip only a trailing '.csv'; str.replace would also remove the
      # substring from the middle of a name.
      if test_id.endswith('.csv'):
        test_id = test_id[:-len('.csv')]
      pred = get_pred_for_csv(model, test_csv_path, chunk_size, batch_size)
      row = '%s,%f' % (test_id, pred)
      # The progress line used to be built with line[2:], which cut off the
      # first character of the id ('\n' is a single character).
      print('\r%s (%d/%d)' % (row, i, n_files), end='')
      f.write('\n' + row)

### Test the data loader and batch shapes

In [0]:
# Smoke test: open the training CSV, pull a single batch and display the shapes
# of the input and target arrays.
csv_reader = CSVReader(train_csv_path, chunk_size, batch_size)
vs, ts = next(csv_reader)
vs.shape, ts.shape

### Build and compile the model and the data generator (this is a very simplistic approach, so no validation generator is used)

In [0]:
# Reset the Keras backend session before building, so re-running this cell
# starts from a clean state.
K.clear_session()
model = get_model(chunk_size, n_kernels, max_n_kernels, k_size, n_layers)
# Print the layer-by-layer overview of the model.
model.summary()

In [0]:
# Adam optimizer with a mean-squared-error loss. The tracked metric used to be
# 'acc' (classification accuracy), which carries no information for a
# continuous regression target; mean absolute error is reported instead.
model.compile('adam', 'mse', metrics=['mae'])
# Fresh reader, so training starts at the beginning of the file (the reader
# created for the shape check has already consumed one batch).
csv_reader = CSVReader(train_csv_path, chunk_size, batch_size)

### Train the model

In [0]:
# Train from the streaming reader for `epochs` epochs of `steps_per_epoch`
# batches each, then store the trained model at `model_path`.
# NOTE(review): the reader does not restart at the end of the file, so more than
# one pass over the data is not supported as written.
# NOTE(review): fit_generator is deprecated in recent Keras/TensorFlow releases
# in favour of fit — confirm against the installed version.
model.fit_generator(csv_reader, steps_per_epoch=steps_per_epoch, epochs=epochs)
model.save(model_path)

### Produce and download the output file for submission

In [0]:
# Predict every file in `test_folder` and write the submission CSV to `sub_csv_path`.
produce_sub_file(test_folder, sub_csv_path, model, chunk_size, batch_size)

In [0]:
# Colab-only helper module: this import fails outside Google Colab, which is why
# it lives in this cell rather than with the other imports.
from google.colab import files
# Send the submission file to the browser as a download.
files.download(sub_csv_path) 