In [1]:
"""Colab Drive Connection"""

from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [2]:
import warnings
import os 

from collections import defaultdict
from copy import deepcopy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

import networkx as nx
import scipy.cluster.hierarchy as sch
from scipy.cluster.hierarchy import fcluster

import random

from tqdm import tqdm_notebook

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F

# install datatable
!pip install datatable
import datatable as dt

from numba import njit

import gc

warnings.simplefilter(action="ignore")

# project_home = "/kaggle/input/jane-street-market-prediction"

project_home = "/gdrive/MyDrive/colab/jane-street-market-prediction"
data_home = os.path.join(project_home, "input/data")
model_home = os.path.join(project_home, "output/model")



In [3]:
entire_seed = 1029

def seed_torch(seed=1029):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed) # if you are using multi-GPU.
    torch.backends.cudnn.benchmark = False
#     torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.deterministic = False
    
seed_torch(entire_seed)

In [4]:
train_file = os.path.join(data_home,'train.csv')
features_file = os.path.join(data_home,'features.csv')
example_test_file = os.path.join(data_home,'example_test.csv')
example_sample_submission_file = os.path.join(data_home,'example_sample_submission.csv')

train_data_datatable = dt.fread(train_file)

df_train = train_data_datatable.to_pandas()
df_features = pd.read_csv(features_file)
df_example_test = pd.read_csv(example_test_file)
df_example_sample_submission = pd.read_csv(example_sample_submission_file)

In [5]:
features = [ col for col in df_train.columns if "feature" in col ]
resps = [ col for col in df_train.columns if "resp" in col ]
target_resp = [resp_ for resp_ in resps if "_" not in resp_]
target = ["weight"] + target_resp + features 

In [6]:
"""
Reduce Memory Usage by 75%
https://www.kaggle.com/tomwarrens/nan-values-depending-on-time-of-day
"""

## Reduce Memory

def reduce_memory_usage(df):
    
    start_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_memory} MB")
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != 'object':
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            
            else:
#                 reducing float16 for calculating numpy.nanmean
#                 if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
#                     df[col] = df[col].astype(np.float16)
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    pass
        else:
            df[col] = df[col].astype('category')
    
    end_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    print(f"Reduced by {100 * (start_memory - end_memory) / start_memory} % ")
    return df

df_train = reduce_memory_usage(df_train)
df_train.info()

Memory usage of dataframe is 2489.4869804382324 MB
Memory usage of dataframe after reduction 1247.0233011245728 MB
Reduced by 49.908422461199 % 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2390491 entries, 0 to 2390490
Columns: 138 entries, date to ts_id
dtypes: float32(135), int16(1), int32(1), int8(1)
memory usage: 1.2 GB


In [7]:
# drop before 85days
df_train = df_train.loc[df_train.date>85]
# drop weight 0 for training
df_train = df_train.loc[df_train.weight > 0]

df_labels = df_train[['date','weight','resp_1','resp_2','resp_3','resp_4','resp']]

df_train = df_train.drop(df_labels.columns,axis=1)

In [8]:
"""
The codes from 'Optimise Speed of Filling-NaN Function'
https://www.kaggle.com/gogo827jz/optimise-speed-of-filling-nan-function
"""

def for_loop(method, matrix, values):
    for i in range(matrix.shape[0]):
        matrix[i] = method(matrix[i], values)
    return matrix

def for_loop_ffill(method, matrix):
    tmp = np.zeros(matrix.shape[1],dtype=np.float32)
    for i in range(matrix.shape[0]):
        matrix[i] = method(matrix[i], tmp)
        tmp = matrix[i]
    return matrix

@njit
def fillna_npwhere_njit(array, values):
    if np.isnan(array.sum()):
        array = np.where(np.isnan(array), values, array)
    return array

In [9]:
# converting numpy for efficient calcualtion.
# ft 1~129
np_train = df_train.loc[:,features[1:]].values
np_train.shape

# ft 0
np_train_ft0 = df_train.loc[:,features[0]].values

In [10]:
# nead pre-calculate 1.2GB per action
f_mean = np.nanmean(np_train,axis=0)

In [11]:
print('fillna_npwhere_njit (mean-filling):')
np_mf_train = for_loop(fillna_npwhere_njit, np_train, f_mean)

fillna_npwhere_njit (mean-filling):


In [12]:
np_train = np.concatenate([np_train_ft0.reshape(-1,1),np_mf_train],axis=1)
# resp_{1~4}, resp 모두를 고려; 각각을 0과 1로 분류하는 개별적인 Binary Classification 문제로 간주
# ['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp'] 순서
np_targets = np.stack([(df_labels[c] > 0).astype('int') for c in resps]).T

In [13]:
class JaneDataset(Dataset):
    def __init__(self, np_X, np_y):
        super(JaneDataset,self).__init__()
        self.X = np_X
        self.y = np_y
        
    def __len__(self):
        return self.X.shape[0]
    
    def __getitem__(self, index):
        X = torch.tensor(self.X[index,:],dtype=torch.float)
        y = torch.tensor(self.y[index],dtype=torch.float)
        return X,y

In [14]:
dataset = JaneDataset(np_train, np_targets)

In [15]:
train_size = int(len(dataset) * 0.8)
valid_size = len(dataset) - train_size

train_dataset, valid_dataset = torch.utils.data.random_split(dataset, [train_size, valid_size], generator=torch.Generator().manual_seed(entire_seed))

* 1d-cnn

https://wikidocs.net/80437

https://ratsgo.github.io/deep%20learning/2017/10/09/CNNs/

https://medium.com/@Rehan_Sayyad/how-to-use-convolutional-neural-networks-for-time-series-classification-80575131a474

https://arxiv.org/abs/1905.03554

https://www.kaggle.com/pyoungkangkim/1dcnn-pytorch-jstreet

https://www.kaggle.com/a763337092/pytorch-resnet-starter-training


dense1 => 130 -> 512

512 -> 16ch, 32

cnn1 => 16ch,32 => 32ch, 16

dense2 => 32,16 -> 512 -> 5


In [16]:
class Model_1DCNN(nn.Module):
    def __init__(self, num_features, num_targets, hidden_size):
        super(Model_1DCNN, self).__init__()
        
        self.hidden_size = hidden_size
        # num_channel
        self.ch_input = 16
        self.ch_output =  32
        self.points = int(self.hidden_size / self.ch_input) 
        
        # feature_size to hidden_size
        self.bn_dense1 = nn.BatchNorm1d(num_features)
        self.dropout_dense1 = nn.Dropout(0.2)
        self.dense1 = nn.Linear(num_features, hidden_size)
        
        # reshaped hidden_size [input_channel, data] to [input_channel, output_channel] 
        self.bn_c1 = nn.BatchNorm1d(self.ch_output)
        self.dropout_c1 = nn.Dropout(0.2)
        self.conv1 = nn.Conv1d(self.ch_input, self.ch_output, padding=1, kernel_size=3, stride=1)
        self.avg_pool_c1 = nn.MaxPool1d(kernel_size=5,stride=2,padding=2)
        
        self.flatten = nn.Flatten()
        
        self.dense2 = nn.Linear(512,num_targets)

    def forward(self, x):
        x = self.bn_dense1(x)
        x = self.dropout_dense1(x)
        x = self.dense1(x)
        
        x = x.reshape(x.size(0), self.ch_input, self.points)
        
        x = self.conv1(x)
        x = F.relu(self.bn_c1(x))
        x = self.dropout_c1(x)
        x = self.avg_pool_c1(x)
       
        x = self.flatten(x)

        x = self.dense2(x)
        x = F.sigmoid(x)

        return x

In [17]:
epochs = 10
batch_size = 4096
learning_rate = 0.001

model = Model_1DCNN(num_features=130, num_targets=5, hidden_size=512)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [18]:
# model(torch.tensor(dummy))

In [19]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False) 
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

In [20]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [21]:
class EarlyStopping:
    def __init__(self, patience=7, mode="max", delta=0.001):
        self.patience = patience
        self.counter = 0
        self.mode = mode
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        if self.mode == "min":
            self.val_score = np.Inf
        else:
            self.val_score = -np.Inf

    def __call__(self, epoch_score, model, model_path):

        if self.mode == "min":
            score = -1.0 * epoch_score
        else:
            score = np.copy(epoch_score)

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
        elif score < self.best_score: #  + self.delta
            self.counter += 1
            # print('EarlyStopping counter: {} out of {}'.format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            # ema.apply_shadow()
            self.save_checkpoint(epoch_score, model, model_path)
            # ema.restore()
            self.counter = 0

    def save_checkpoint(self, epoch_score, model, model_path):
        if epoch_score not in [-np.inf, np.inf, -np.nan, np.nan]:
            print('Validation score improved ({} --> {}). Saving model!'.format(self.val_score:.4f, epoch_score:.4f))
            # if not DEBUG:
            torch.save(model.state_dict(), model_path)
        self.val_score = epoch_score

In [22]:
EARLYSTOP_NUM = 7
CACHE_PATH = model_home

model = model.to(device)

es = EarlyStopping(EARLYSTOP_NUM, mode="max")
for epoch in tqdm_notebook(range(epochs)):

    running_loss = 0.0
    running_acc = 0.0
    running_auc = 0.0
    model.train()
    
    for idx, (inputs, labels) in enumerate(train_dataloader):
    
        optimizer.zero_grad()

        inputs = inputs.to(device)
        labels = labels.to(device)
        
        outputs = model(inputs)
        
        true = labels.detach().cpu().numpy()[:,-1]
        target = np.array(list(map(lambda x: 1 if x > 0.5 else 0, outputs.detach().cpu().numpy()[:,-1])),dtype=np.float)
        
        acc = (true == target).sum() / outputs.shape[0]
        auc = roc_auc_score(true, outputs.detach().cpu().numpy()[:,-1])
 
        running_acc += acc
        running_auc += auc

        loss = criterion(outputs,labels)
        running_loss += loss.detach().item() * inputs.size(0)
        loss.backward()
        optimizer.step()
        
    epoch_loss = running_loss / len(train_dataloader.dataset)
    epoch_acc = running_acc / len(train_dataloader)
    epoch_auc = running_auc / len(train_dataloader)

    with torch.no_grad():
        model.eval()
        running_loss = 0.0
        running_acc = 0.0
        running_auc = 0.0
        for idx, (inputs, labels) in enumerate(valid_dataloader):

            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)

            true = labels.detach().cpu().numpy()[:,-1]
            target = np.array(list(map(lambda x: 1 if x > 0.5 else 0, outputs.detach().cpu().numpy()[:,-1])),dtype=np.float)
            
            acc = (true == target).sum() / outputs.shape[0]
            auc = roc_auc_score(true, outputs.detach().cpu().numpy()[:,-1])

            running_acc += acc
            running_auc += auc

            loss = criterion(outputs, labels)
            running_loss += loss.detach().item() * inputs.size(0)
            
        valid_loss = running_loss / len(valid_dataloader.dataset)
        valid_acc = running_acc / len(valid_dataloader)
        valid_auc = running_auc / len(valid_dataloader)

    print(f"EPOCH:{epoch+1}|{epochs}; loss(train/valid):{epoch_loss:.4f}/{valid_loss:.4f}; acc(train/valid):{epoch_acc:.4f}/{valid_acc:.4f}; auc(train/valid):{epoch_auc:.4f}/{valid_auc:.4f}")
    
    model_weights = os.path.join(model_home,f"online_model.pth")
    es(valid_auc, model, model_path=model_weights)
    if es.early_stop:
      print("Early stopping")
      break

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Validation score improved (-inf --> 0.5261804774011822). Saving model!
Validation score improved (0.5261804774011822 --> 0.5292934903823296). Saving model!
Validation score improved (0.5292934903823296 --> 0.530568250100935). Saving model!
Validation score improved (0.530568250100935 --> 0.5315578757069889). Saving model!
Validation score improved (0.5315578757069889 --> 0.5324267436539989). Saving model!
Validation score improved (0.5324267436539989 --> 0.5332396648633788). Saving model!
Validation score improved (0.5332396648633788 --> 0.5339353862784003). Saving model!
Validation score improved (0.5339353862784003 --> 0.5344012815033952). Saving model!
Validation score improved (0.5344012815033952 --> 0.5349396695231505). Saving model!
Validation score improved (0.5349396695231505 --> 0.5352470403368991). Saving model!



In [23]:
from tqdm import tqdm_notebook

import janestreet
env = janestreet.make_env()

# learn.model.eval()
preds = []
for (test_df, pred_df) in tqdm_notebook(env.iter_test()):
    if test_df['weight'].item() > 0:
        test_np = test_df.loc[:, features].values
        test_np[:, 1:] = for_loop(fillna_npwhere_njit, test_np[:, 1:], f_mean)
        pred = torch.mean(model(torch.tensor(test_np, dtype=torch.float).cuda(device))).item()
        preds.append(pred)
        action = 1 if pred >= .5 else 0
        pred_df.action = action
    else:
        pred_df.action = 0
    env.predict(pred_df)

ModuleNotFoundError: ignored

In [None]:
preds = np.array(preds)
preds.mean(), preds.std(), sum(preds >= .5), sum(preds < 5)