In [1]:
"""Colab Drive Connection"""

from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
import warnings
import os 

from collections import defaultdict
from copy import deepcopy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

import networkx as nx
import scipy.cluster.hierarchy as sch
from scipy.cluster.hierarchy import fcluster

import random

from tqdm import tqdm_notebook

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F

# install datatable
!pip install datatable
import datatable as dt

from numba import njit

import gc

warnings.simplefilter(action="ignore")

# project_home = "/kaggle/input/jane-street-market-prediction"

project_home = "/gdrive/MyDrive/colab/jane-street-market-prediction"
data_home = os.path.join(project_home, "input/data")
model_home = os.path.join(project_home, "output/model")

Collecting datatable
[?25l  Downloading https://files.pythonhosted.org/packages/80/cb/21810c43b687a19d194c372192049f535fba28c55ce76d37e7e407159c52/datatable-0.11.1-cp36-cp36m-manylinux2010_x86_64.whl (83.7MB)
[K     |████████████████████████████████| 83.7MB 65kB/s 
[?25hInstalling collected packages: datatable
Successfully installed datatable-0.11.1


In [3]:
entire_seed = 1029

def seed_torch(seed=1029):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed) # if you are using multi-GPU.
    torch.backends.cudnn.benchmark = False
#     torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.deterministic = False
    
seed_torch(entire_seed)

In [4]:
train_file = os.path.join(data_home,'train.csv')
features_file = os.path.join(data_home,'features.csv')
example_test_file = os.path.join(data_home,'example_test.csv')
example_sample_submission_file = os.path.join(data_home,'example_sample_submission.csv')

train_data_datatable = dt.fread(train_file)

df_train = train_data_datatable.to_pandas()
df_features = pd.read_csv(features_file)
df_example_test = pd.read_csv(example_test_file)
df_example_sample_submission = pd.read_csv(example_sample_submission_file)

In [5]:
features = [ col for col in df_train.columns if "feature" in col ]
resps = [ col for col in df_train.columns if "resp" in col ]
target_resp = [resp_ for resp_ in resps if "_" not in resp_]
target = ["weight"] + target_resp + features 

In [6]:
"""
Reduce Memory Usage by 75%
https://www.kaggle.com/tomwarrens/nan-values-depending-on-time-of-day
"""

## Reduce Memory

def reduce_memory_usage(df):
    
    start_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_memory} MB")
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != 'object':
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            
            else:
#                 reducing float16 for calculating numpy.nanmean
#                 if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
#                     df[col] = df[col].astype(np.float16)
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    pass
        else:
            df[col] = df[col].astype('category')
    
    end_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    print(f"Reduced by {100 * (start_memory - end_memory) / start_memory} % ")
    return df

df_train = reduce_memory_usage(df_train)
df_train.info()

Memory usage of dataframe is 2489.4869804382324 MB
Memory usage of dataframe after reduction 1247.0233011245728 MB
Reduced by 49.908422461199 % 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2390491 entries, 0 to 2390490
Columns: 138 entries, date to ts_id
dtypes: float32(135), int16(1), int32(1), int8(1)
memory usage: 1.2 GB


In [7]:
# drop before 85days
df_train = df_train.loc[df_train.date>85]
# drop weight 0 for training
df_train = df_train.loc[df_train.weight > 0]

df_labels = df_train[['date','weight','resp_1','resp_2','resp_3','resp_4','resp']]

df_train = df_train.drop(df_labels.columns,axis=1)

In [8]:
"""
The codes from 'Optimise Speed of Filling-NaN Function'
https://www.kaggle.com/gogo827jz/optimise-speed-of-filling-nan-function
"""

def for_loop(method, matrix, values):
    for i in range(matrix.shape[0]):
        matrix[i] = method(matrix[i], values)
    return matrix

def for_loop_ffill(method, matrix):
    tmp = np.zeros(matrix.shape[1],dtype=np.float32)
    for i in range(matrix.shape[0]):
        matrix[i] = method(matrix[i], tmp)
        tmp = matrix[i]
    return matrix

@njit
def fillna_npwhere_njit(array, values):
    if np.isnan(array.sum()):
        array = np.where(np.isnan(array), values, array)
    return array

In [9]:
# converting numpy for efficient calcualtion.
# ft 1~129
np_train = df_train.loc[:,features[1:]].values
np_train.shape

# ft 0
np_train_ft0 = df_train.loc[:,features[0]].values

In [10]:
data_home

'/gdrive/MyDrive/colab/jane-street-market-prediction/input/data'

In [11]:
# nead pre-calculate 1.2GB per action
# f_mean = np.nanmean(np_train,axis=0)
# create f_mean.npy
# np.save(os.path.join(data_home,"f_mean"), f_mean)

f_mean_path = os.path.join(data_home, "f_mean.npy")
f_mean = np.load(f_mean_path)

In [12]:
print('fillna_npwhere_njit (mean-filling):')
np_mf_train = for_loop(fillna_npwhere_njit, np_train, f_mean)

fillna_npwhere_njit (mean-filling):


In [13]:
np_train = np.concatenate([np_train_ft0.reshape(-1,1),np_mf_train],axis=1)
# resp_{1~4}, resp 모두를 고려; 각각을 0과 1로 분류하는 개별적인 Binary Classification 문제로 간주
# ['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp'] 순서
np_targets = np.stack([(df_labels[c] > 0).astype('int') for c in resps]).T

In [14]:
class JaneDataset(Dataset):
    def __init__(self, np_X, np_y):
        super(JaneDataset,self).__init__()
        self.X = np_X
        self.y = np_y
        
    def __len__(self):
        return self.X.shape[0]
    
    def __getitem__(self, index):
        X = torch.tensor(self.X[index,:],dtype=torch.float)
        y = torch.tensor(self.y[index],dtype=torch.float)
        return X,y

In [15]:
dataset = JaneDataset(np_train, np_targets)

In [16]:
train_size = int(len(dataset) * 0.8)
valid_size = len(dataset) - train_size

train_dataset, valid_dataset = torch.utils.data.random_split(dataset, [train_size, valid_size], generator=torch.Generator().manual_seed(entire_seed))

## CNN(Convolutional Neural Network)

<div align="center">
<img src="imgs/cnn-arch.png">
<p>Basic CNN architecture for image classification</p>
</div>

### 1. 합성곱층

일반적으로 합성곱은 그림과 같이 연속적인 은닉층을 거쳐 특징을 조합하고 압축한다. 첫번째 은닉층에서는 작은 저수준 특성에 집중하고, 다음 은닉층에는 더 큰 고수준 톡성으로 조합해나가도록 한다. 이런 계층적 구조가 CNN이 이미지 인식에 잘 작동하는 이유 중에 하나이다.

합성곱층은 2차원 이미지를 기준으로 hxw크기를 가진 수용장(kernel)을 임의의 간격 stride를 두어 특성맵(feature map)을 생성한다. feature map을 생성함에 있어 stride값을 크게 주어 차원을 축소한 특성맵을 얻을 수도 있다. 

이러한 수용장(filter)를 이동하여 얻은 특성맵은 output filter로 정의한 개수만큼 filter를 가지고, 각각의 feature map은 2차원의 데이터이기 때문에 실질적으로 합성곱층을 거친 데이터는 3차원의 데이터로 표현된다. 

하나의 특성맵안에서는 모든 뉴런이 같은 파라미터를 공유하고, 다른 특성맵 간에는 다른 파라미터를 사용한다. 다시 말해 하나의 합성곱층이 입력에 여러 필터를 동시에 적용하여 입력에 있는 여러 특성을 감지하는 것이다.

### 2. 풀링층

풀링층의 목적은 계산량과 메모리 사용량 그리고 파라미터의 수를 줄이기 위해 부표본(축소본)을 만드는 것이다. 합성곱이 특정 feature map의 하나의 포인트(뉴런)를 얻는 것과 같이 이전 layer의 수용장에 있는 출력값과 연결되어 있다. 그러나 풀링 뉴런은 가중치가 없으므로, 합산함수를 사용하여 입력값을 더하는 것이 전부이다.

* 최대 풀링층
* 평균 풀링층
* 전역 평균 풀링층: 각 특성 맵의 평균을 계산함으로써 특성 맵의 대부부분의 정보를 잃게되지만 출력층에는 유용하다.

### 3. CNN의 기본 구조

1. 합성곱층을 몇개 쌓고(각각에 활성화함수추가)
2. 풀링층
3. 추가적인 연속적인 합성곱층
4. 풀링층

이를 통해 네트워크를 거칠 수록 이미지는 점점 작아지지만, 합성곱 층 때문에 일반적으로 점점 더 깊어진다(많은 특성 맵을 가진다). 

첫번째 층은 필터가 적더라도 그 크기를 크게 잡는다. (kernel_size)

이후의 층은 필터의 수를 점차적으로 늘이고, 그에 상응하여 크기를 줄인다.

* 1d-cnn

https://wikidocs.net/80437

https://ratsgo.github.io/deep%20learning/2017/10/09/CNNs/

https://medium.com/@Rehan_Sayyad/how-to-use-convolutional-neural-networks-for-time-series-classification-80575131a474

https://arxiv.org/abs/1905.03554

https://www.kaggle.com/pyoungkangkim/1dcnn-pytorch-jstreet

https://www.kaggle.com/a763337092/pytorch-resnet-starter-training

https://pulsar-kkaturi.tistory.com/entry/%ED%8C%8C%EC%9D%B4%ED%86%A0%EC%B9%98-%EC%B2%AB%EA%B1%B8%EC%9D%8C-5-%ED%95%A9%EC%84%B1%EA%B3%B1-%EC%8B%A0%EA%B2%BD%EB%A7%9D


In [17]:
# class Model_1DCNN(nn.Module):
#     def __init__(self, num_features, num_targets, hidden_size):
#         super(Model_1DCNN, self).__init__()
        
#         self.hidden_size = hidden_size
#         # num_channel
#         self.ch_input = 16
#         self.ch_output =  32
#         self.points = int(self.hidden_size / self.ch_input) 
        
#         # feature_size to hidden_size
#         self.bn_dense1 = nn.BatchNorm1d(num_features)
#         self.dropout_dense1 = nn.Dropout(0.2)
#         self.dense1 = nn.Linear(num_features, hidden_size)
        
#         # reshaped hidden_size [input_channel, data] to [input_channel, output_channel] 
#         self.bn_c1 = nn.BatchNorm1d(self.ch_output)
#         self.dropout_c1 = nn.Dropout(0.2)
#         self.conv1 = nn.Conv1d(self.ch_input, self.ch_output, kernel_size=5, padding=2, stride=2)
#         self.max_pool_c1 = nn.MaxPool1d(kernel_size=2)
        
#         self.bn_c2 = nn.BatchNorm1d(self.ch_output*2)
#         self.dropout_c2 = nn.Dropout(0.2)
#         self.conv2 = nn.Conv1d(self.ch_output, self.ch_output*2, kernel_size=3, padding=1, stride=1)
        
#         self.bn_c2_1 = nn.BatchNorm1d(self.ch_output*2)
#         self.dropout_c2_1 = nn.Dropout(0.2)
#         self.conv2_1 = nn.Conv1d(self.ch_output*2, self.ch_output*2, kernel_size=3, padding=1, stride=1)
        
#         self.max_pool_c2 = nn.MaxPool1d(kernel_size=2)

#         self.flatten = nn.Flatten()
        
#         self.bn_dense2 = nn.BatchNorm1d(256)
#         self.dropout_dense2 = nn.Dropout(0.2)
#         self.dense2 = nn.Linear(512,256)

#         self.bn_dense3 = nn.BatchNorm1d(num_targets)
#         self.dropout_dense3 = nn.Dropout(0.2)
#         self.dense3 = nn.Linear(256,num_targets)

#     def forward(self, x):
#         x = self.bn_dense1(x)
#         x = self.dropout_dense1(x)
#         x = self.dense1(x)
        
#         x = x.reshape(x.size(0), self.ch_input, self.points)
#         # print(x.shape)

#         x = self.conv1(x)
#         x = F.relu(self.bn_c1(x))
#         # print(x.shape)
#         x = self.dropout_c1(x)
#         x = self.max_pool_c1(x)
#         # print(x.shape)
        
#         x = self.conv2(x)
#         x = F.relu(self.bn_c2(x))
#         x = self.dropout_c2(x)
#         # print(x.shape)
        
#         x = self.conv2_1(x)
#         x = F.relu(self.bn_c2_1(x))
#         x = self.dropout_c2_1(x)
#         # print(x.shape)

#         x = self.max_pool_c2(x)
#         # print(x.shape)

#         x = self.flatten(x)
#         # print(x.shape)

#         x = self.dense2(x)
#         x = self.bn_dense2(x)
#         x = self.dropout_dense2(x)

#         x = self.dense3(x)
#         x = self.bn_dense3(x)
#         x = self.dropout_dense3(x)
        
#         x = F.sigmoid(x)

#         return x

In [18]:
# class Model_1DCNN(nn.Module):
#     def __init__(self, num_features, num_targets, hidden_size):
#         super(Model_1DCNN, self).__init__()
        
#         self.hidden_size = hidden_size
#         # num_channel
#         self.ch_input = 16
#         self.ch_output =  32
#         self.points = int(self.hidden_size / self.ch_input) 
        
#         # feature_size to hidden_size
#         self.bn_dense1 = nn.BatchNorm1d(num_features)
#         self.dropout_dense1 = nn.Dropout(0.2)
#         self.dense1 = nn.Linear(num_features, hidden_size)
        
#         # reshaped hidden_size [input_channel, data] to [input_channel, output_channel]
#         self.c1 = self.make_layers(self.ch_input,self.ch_output, 5, 2, 2, 0.2)
#         self.max_pool_c1 = nn.MaxPool1d(kernel_size=2)

#         self.c2 = self.make_layers(self.ch_output, self.ch_output*2, 3, 2, 1, 0.2)
#         self.c2_1 = self.make_layers(self.ch_output*2, self.ch_output*2, 3, 1, 1, 0.2)
#         self.c2_2 = self.make_layers(self.ch_output*2, self.ch_output*2, 3, 1, 1, 0.2)

#         self.max_pool_c2 = nn.MaxPool1d(kernel_size=2)

#         self.flatten = nn.Flatten()
        
#         self.bn_dense2 = nn.BatchNorm1d(64)
#         self.dropout_dense2 = nn.Dropout(0.2)
#         self.dense2 = nn.Linear(128,64)

#         self.bn_dense3 = nn.BatchNorm1d(num_targets)
#         self.dropout_dense3 = nn.Dropout(0.2)
#         self.dense3 = nn.Linear(64,num_targets)

#     def make_layers(self, ch_in, ch_out, kernel_size, stride, padding, dropout):
#         cnn_module = nn.Sequential(
#                         nn.Conv1d(ch_in,ch_out, kernel_size, stride, padding),
#                         nn.BatchNorm1d(ch_out),
#                         nn.ReLU(),
#                         nn.Dropout(dropout)
#                     )
        
#         return cnn_module

#     def forward(self, x):
#         x = self.bn_dense1(x)
#         x = self.dropout_dense1(x)
#         x = self.dense1(x)
        
#         x = x.reshape(x.size(0), self.ch_input, self.points)
#         print(x.shape)

#         x = self.c1(x)
#         print(x.shape)
#         x = self.max_pool_c1(x)
#         print(x.shape)
        
#         x = self.c2(x)
#         print(x.shape)
#         x = self.c2_1(x)
#         print(x.shape)
#         x = self.c2_2(x)
#         print(x.shape)
#         x = self.max_pool_c2(x)
#         print(x.shape)

#         x = self.flatten(x)
#         print(x.shape)

#         x = self.dense2(x)
#         x = self.bn_dense2(x)
#         x = self.dropout_dense2(x)

#         x = self.dense3(x)
#         x = self.bn_dense3(x)
#         x = self.dropout_dense3(x)
        
#         x = F.sigmoid(x)

#         return x

In [19]:
# class Model_1DCNN(nn.Module):
#     def __init__(self, num_features, num_targets, hidden_size):
#         super(Model_1DCNN, self).__init__()
        
#         self.hidden_size = hidden_size
#         # num_channel
#         self.ch_input = 16
#         self.ch_output =  32
#         self.points = int(self.hidden_size / self.ch_input) 
        
#         # feature_size to hidden_size
#         self.bn_dense1 = nn.BatchNorm1d(num_features)
#         self.dropout_dense1 = nn.Dropout(0.2)
#         self.dense1 = nn.Linear(num_features, hidden_size)
        
#         # reshaped hidden_size [input_channel, data] to [input_channel, output_channel]
#         # self.c1 = self.make_layers(self.ch_input,self.ch_output, 5, 2, 2, 0.2)
#         # self.max_pool_c1 = nn.MaxPool1d(kernel_size=2)

#         # self.c2 = self.make_layers(self.ch_output, self.ch_output*2, 3, 2, 1, 0.2)
#         # self.c2_1 = self.make_layers(self.ch_output*2, self.ch_output*2, 3, 1, 1, 0.2)
#         # self.c2_2 = self.make_layers(self.ch_output*2, self.ch_output*2, 3, 1, 1, 0.2)

#         # self.max_pool_c2 = nn.MaxPool1d(kernel_size=2)

#         # self.flatten = nn.Flatten()
        
#         # self.bn_dense2 = nn.BatchNorm1d(64)
#         # self.dropout_dense2 = nn.Dropout(0.2)
#         # self.dense2 = nn.Linear(128,64)

#         # self.bn_dense3 = nn.BatchNorm1d(num_targets)
#         # self.dropout_dense3 = nn.Dropout(0.2)
#         # self.dense3 = nn.Linear(64,num_targets)

#         self.c1 = self.make_layers(self.ch_input,self.ch_output, 5,2,2,0.2)

#         self.c2 = self.make_layers(self.ch_input,self.ch_output, 3,2,1,0.2)
#         self.c2_1 = self.make_layers(self.ch_input,self.ch_output, 3,1,1,0.2)

#     def make_layers(self, ch_in, ch_out, kernel_size, stride, padding, dropout):
#         cnn_module = nn.Sequential(
#                         nn.Conv1d(ch_in,ch_out, kernel_size, stride, padding),
#                         nn.BatchNorm1d(ch_out),
#                         nn.ReLU(),
#                         nn.Dropout(dropout),
#                         nn.Conv1d(ch_out, ch_out, 3, 1, 1),
#                         nn.BatchNorm1d(ch_out)
#                     )
        
#         return cnn_module

#     def forward(self, x):
#         x = self.bn_dense1(x)
#         x = self.dropout_dense1(x)
#         x = self.dense1(x)
        
#         x = x.reshape(x.size(0), self.ch_input, self.points)
#         shortcut = x

#         x = self.c1(x)
#         bat = nn.BatchNorm1d(32)
#         cut = nn.Conv1d(16,32,1,2)

#         s_x = bat(cut(shortcut))
#         print(s_x.shape)
#         print(x.shape)

#         x += s_x
#         print(x.shape)
#         # x = self.c2(x)

#         # x = self.c2_1(x)





#         # x = self.max_pool_c1(x)
#         # print(x.shape)
        
#         # x = self.c2(x)
#         # print(x.shape)
#         # x = self.c2_1(x)
#         # print(x.shape)
#         # x = self.c2_2(x)
#         # print(x.shape)
#         # x = self.max_pool_c2(x)
#         # print(x.shape)

#         # x = self.flatten(x)
#         # print(x.shape)

#         # x = self.dense2(x)
#         # x = self.bn_dense2(x)
#         # x = self.dropout_dense2(x)

#         # x = self.dense3(x)
#         # x = self.bn_dense3(x)
#         # x = self.dropout_dense3(x)
        
#         # x = F.sigmoid(x)

#         return x

In [85]:
class ResidualBlock(nn.Module):
    expansion: int=1
    def __init__(self, inplane, plane, stride=1, dilation=1, dropout=0.2, downsample=None):
        super(ResidualBlock,self).__init__()
        
        self.conv1 = self.conv3x3(inplane, plane, stride, dilation)
        self.bn1 = nn.BatchNorm1d(plane)
        self.dropout1 = nn.Dropout(dropout)
        self.conv2 = self.conv3x3(plane, plane)
        self.bn2 = nn.BatchNorm1d(plane)
        
        # Inplace means in Activation Func
        # https://discuss.pytorch.org/t/whats-the-difference-between-nn-relu-and-nn-relu-inplace-true/948
        self.relu = nn.ReLU(inplace=False)
        
        self.downsample = downsample
        
    def conv3x3(self, in_planes, out_planes, stride=1, dilation=1):
        return nn.Conv1d(in_planes, out_planes, 3, stride, padding=dilation, bias=False)
        
    def forward(self, x):
        shortcut = x
        
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.dropout1(out)
        
        out = self.conv2(out)
        out = self.bn2(out)
        
        if self.downsample is not None:
            shortcut = self.downsample(x)
            
        out += shortcut
        out += self.relu(out)
        
        return out

In [94]:
from typing import Type, Any, Callable, Union, List, Optional

class CustomResNet(nn.Module):
    def __init__(self, block: Type[ResidualBlock], layers: List[int], dropout=0.2, num_feature=130, hidden_layers=[512,256], num_classes: int = 5):
        super(CustomResNet, self).__init__()
        self.inplanes = 64
        self.block = block
        self.dropout = dropout
        self.num_feature = num_feature
        self.h1, self.h2 = hidden_layers
        self.num_classes = num_classes
        self.reshaped_dim = int(self.h1 / self.inplanes)

        self.relu = nn.ReLU(inplace=False)
        
        self.bn_d0 = nn.BatchNorm1d(self.num_feature)
        self.dropout_d0 = nn.Dropout(self.dropout)
        
        self.dense1 = nn.Linear(self.num_feature, self.h1)
        self.bn_d1 = nn.BatchNorm1d(self.h1)
        self.dropout_d1 = nn.Dropout(self.dropout)

        self.layer1 = self.make_layers(self.block,64, layers[0], stride=1)
        self.layer2 = self.make_layers(self.block,128,layers[1], stride=2)
        self.layer3 = self.make_layers(self.block,256,layers[2], stride=2)
        self.layer4 = self.make_layers(self.block,512,layers[3], stride=2)
 
        self.avgpool = nn.AvgPool1d(2)
        self.flt = nn.Flatten()

        self.dense2 = nn.Linear(int(self.h1/2), self.h2)
        self.bn_d2 = nn.BatchNorm1d(self.h2)
        self.dropout_d2 = nn.Dropout(self.dropout)
    
        self.dense3 = nn.Linear(self.h2, self.num_classes)
        

    def make_layers(self, block, planes, layer, stride=1):
        downsample = None
        
        if stride > 1:
            downsample = nn.Sequential(
                self.conv1x1(self.inplanes, planes * block.expansion, stride),
                nn.BatchNorm1d(planes * block.expansion),
            )
            
        layers = []
        layers.append(block(self.inplanes, planes, stride, dilation=1,dropout=self.dropout, downsample=downsample))
        self.inplanes = planes * block.expansion

        for i in range(layer-1):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)
            
 
    def conv1x1(self, in_planes, out_planes, stride=1):
        return nn.Conv1d(in_planes, out_planes, 1, stride=stride, bias=False)

    def forward(self, x):
        
        # 130
        x = self.bn_d0(x)
        x = self.dropout_d0(x)
        
        # 1024
        x = self.dense1(x)
        x = self.bn_d1(x)
        x = self.relu(x)
        x = self.dropout_d1(x)
        
        # 1024
        print(x.shape)
        x = x.reshape(x.size(0), 64, self.reshaped_dim)
        # 64, 16
        print(x.shape)
        x = self.layer1(x)
        # 64, 16
        print(x.shape)
        x = self.layer2(x)
        # 128, 8
        print(x.shape)
        x = self.layer3(x)
        # 256, 4
        print(x.shape)
        x = self.layer4(x)
        # 512, 2
        print(x.shape)


        x = self.avgpool(x)
        print(x.shape)
        x = self.flt(x)
        print(x.shape)
        
        x = self.dense2(x)
        x = self.bn_d2(x)
        x = self.relu(x)
        x = self.dropout_d2(x)

        x = self.dense3(x)

        return x
        

In [95]:
block = ResidualBlock
# model = CustomResNet(block=block, layers=[5,5,5,5], hidden_layers=[1024,256])

In [96]:
# model

https://dnddnjs.github.io/cifar10/2018/10/09/resnet/

In [97]:
epochs = 100
batch_size = 4096
learning_rate = 0.001

# model = ResNet(num_layers=[1,3,5], kernel_sizes=[(5,5),(3,3),(3,3)], strides=[2,2,2], paddings=[4,1,1], block=block, num_features=130, hidden_size=512, num_targets=5)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.1)

In [98]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False) 
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

In [99]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [100]:
class EarlyStopping:
    def __init__(self, patience=7, mode="max", delta=0.001):
        self.patience = patience
        self.counter = 0
        self.mode = mode
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        if self.mode == "min":
            self.val_score = np.Inf
        else:
            self.val_score = -np.Inf

    def __call__(self, epoch_score, model, model_path):

        if self.mode == "min":
            score = -1.0 * epoch_score
        else:
            score = np.copy(epoch_score)

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
        elif score < self.best_score: #  + self.delta
            self.counter += 1
            # print('EarlyStopping counter: {} out of {}'.format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            # ema.apply_shadow()
            self.save_checkpoint(epoch_score, model, model_path)
            # ema.restore()
            self.counter = 0

    def save_checkpoint(self, epoch_score, model, model_path):
        if epoch_score not in [-np.inf, np.inf, -np.nan, np.nan]:
            print(f"Validation score improved ({self.val_score:.4f} --> {epoch_score:.4f}). Saving model!")
            # if not DEBUG:
            torch.save(model.state_dict(), model_path)
        self.val_score = epoch_score

In [102]:
NFOLDS = 5
EARLYSTOP_NUM = 7
CACHE_PATH = model_home

for _fold in range(NFOLDS):
    print(f'Fold{_fold}:')
    seed_torch(seed=entire_seed+_fold)
    torch.cuda.empty_cache()
    model = CustomResNet(block=block, layers=[5,5,5,5], hidden_layers=[1024,256])
    model = model.to(device)

    es = EarlyStopping(EARLYSTOP_NUM, mode="max")
    for epoch in tqdm_notebook(range(epochs)):

        running_loss = 0.0
        running_acc = 0.0
        running_auc = 0.0
        model.train()
        
        for idx, (inputs, labels) in enumerate(train_dataloader):
        
            optimizer.zero_grad()

            inputs = inputs.to(device)
            labels = labels.to(device)
            
            outputs = model(inputs)
            
            true = labels.detach().cpu().numpy()[:,-1]
            target = np.array(list(map(lambda x: 1 if x > 0.5 else 0, outputs.detach().cpu().numpy()[:,-1])),dtype=np.float)
            
            acc = (true == target).sum() / outputs.shape[0]
            auc = roc_auc_score(true, outputs.detach().cpu().numpy()[:,-1])
    
            running_acc += acc
            running_auc += auc

            loss = criterion(outputs,labels)
            print(outputs)
            print(loss)
            running_loss += loss.detach().item() * inputs.size(0)
            loss.backward()
            optimizer.step()
            
        epoch_loss = running_loss / len(train_dataloader.dataset)
        epoch_acc = running_acc / len(train_dataloader)
        epoch_auc = running_auc / len(train_dataloader)

        with torch.no_grad():
            model.eval()
            running_loss = 0.0
            running_acc = 0.0
            running_auc = 0.0
            for idx, (inputs, labels) in enumerate(valid_dataloader):

                inputs = inputs.to(device)
                labels = labels.to(device)

                outputs = model(inputs)

                true = labels.detach().cpu().numpy()[:,-1]
                target = np.array(list(map(lambda x: 1 if x > 0.5 else 0, outputs.detach().cpu().numpy()[:,-1])),dtype=np.float)
                
                acc = (true == target).sum() / outputs.shape[0]
                auc = roc_auc_score(true, outputs.detach().cpu().numpy()[:,-1])

                running_acc += acc
                running_auc += auc

                loss = criterion(outputs, labels)
                running_loss += loss.detach().item() * inputs.size(0)
                
            valid_loss = running_loss / len(valid_dataloader.dataset)
            valid_acc = running_acc / len(valid_dataloader)
            valid_auc = running_auc / len(valid_dataloader)

        print(f"EPOCH:{epoch+1}|{epochs}; loss(train/valid):{epoch_loss:.4f}/{valid_loss:.4f}; acc(train/valid):{epoch_acc:.4f}/{valid_acc:.4f}; auc(train/valid):{epoch_auc:.4f}/{valid_auc:.4f}")
        
        model_weights = os.path.join(model_home,f"online_model_{_fold}.pth")
        es(valid_auc, model, model_path=model_weights)
        if es.early_stop:
          print("Early stopping")
          break

Fold0:


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

torch.Size([4096, 1024])
torch.Size([4096, 64, 16])
torch.Size([4096, 64, 16])
torch.Size([4096, 128, 8])
torch.Size([4096, 256, 4])
torch.Size([4096, 512, 2])
torch.Size([4096, 512, 1])
torch.Size([4096, 512])
tensor([[-0.1138, -0.0725, -0.4713, -0.3764, -0.3333],
        [ 0.0156, -0.3070, -0.3076, -0.1792, -0.2712],
        [-0.3998,  0.2552, -0.2037, -0.1469,  0.3872],
        ...,
        [-0.8133,  0.6118, -0.6493,  0.3717, -0.0791],
        [-0.4607,  0.1901, -0.1577, -0.0470, -0.3716],
        [-0.2299, -0.1175, -0.2071, -0.2636,  0.3940]], device='cuda:0',
       grad_fn=<AddmmBackward>)
tensor(0.7205, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)


RuntimeError: ignored

In [None]:
from tqdm import tqdm_notebook

import janestreet
env = janestreet.make_env()

# learn.model.eval()
preds = []
for (test_df, pred_df) in tqdm_notebook(env.iter_test()):
    if test_df['weight'].item() > 0:
        test_np = test_df.loc[:, features].values
        test_np[:, 1:] = for_loop(fillna_npwhere_njit, test_np[:, 1:], f_mean)
        pred = torch.mean(model(torch.tensor(test_np, dtype=torch.float).cuda(device))).item()
        preds.append(pred)
        action = 1 if pred >= .5 else 0
        pred_df.action = action
    else:
        pred_df.action = 0
    env.predict(pred_df)

In [None]:
preds = np.array(preds)
preds.mean(), preds.std(), sum(preds >= .5), sum(preds < 5)