In [2]:
import warnings
import os 

from collections import defaultdict
from copy import deepcopy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

import networkx as nx
import scipy.cluster.hierarchy as sch
from scipy.cluster.hierarchy import fcluster

import random

from tqdm import tqdm_notebook

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F

# install datatable
!pip install datatable
import datatable as dt

from numba import njit

import gc

warnings.simplefilter(action="ignore")

project_home = "/kaggle/input/jane-street-market-prediction"
data_home = project_home

# project_home = "/gdrive/MyDrive/colab/jane-street-market-prediction"
# data_home = os.path.join(project_home, "input/data")
# model_home = os.path.join(project_home, "output/model")



In [3]:
entire_seed = 1029

def seed_torch(seed=1029):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed) # if you are using multi-GPU.
    torch.backends.cudnn.benchmark = False
#     torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.deterministic = False
    
seed_torch(entire_seed)

In [4]:
train_file = os.path.join(data_home,'train.csv')
features_file = os.path.join(data_home,'features.csv')
example_test_file = os.path.join(data_home,'example_test.csv')
example_sample_submission_file = os.path.join(data_home,'example_sample_submission.csv')

train_data_datatable = dt.fread(train_file)

df_train = train_data_datatable.to_pandas()
df_features = pd.read_csv(features_file)
df_example_test = pd.read_csv(example_test_file)
df_example_sample_submission = pd.read_csv(example_sample_submission_file)

In [5]:
features = [ col for col in df_train.columns if "feature" in col ]
resps = [ col for col in df_train.columns if "resp" in col ]
target_resp = [resp_ for resp_ in resps if "_" not in resp_]
target = ["weight"] + target_resp + features 

In [6]:
"""
Reduce Memory Usage by 75%
https://www.kaggle.com/tomwarrens/nan-values-depending-on-time-of-day
"""

## Reduce Memory

def reduce_memory_usage(df):
    
    start_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_memory} MB")
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != 'object':
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            
            else:
#                 reducing float16 for calculating numpy.nanmean
#                 if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
#                     df[col] = df[col].astype(np.float16)
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    pass
        else:
            df[col] = df[col].astype('category')
    
    end_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    print(f"Reduced by {100 * (start_memory - end_memory) / start_memory} % ")
    return df

df_train = reduce_memory_usage(df_train)
df_train.info()

Memory usage of dataframe is 2489.4869804382324 MB
Memory usage of dataframe after reduction 1247.0233011245728 MB
Reduced by 49.908422461199 % 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2390491 entries, 0 to 2390490
Columns: 138 entries, date to ts_id
dtypes: float32(135), int16(1), int32(1), int8(1)
memory usage: 1.2 GB


In [7]:
# drop before 85days
df_train = df_train.loc[df_train.date>85]
# drop weight 0 for training
df_train = df_train.loc[df_train.weight > 0]

# df_labels = df_train[['date','weight','resp_1','resp_2','resp_3','resp_4','resp']]

# df_train = df_train.drop(df_labels.columns,axis=1)

In [8]:
"""
The codes from 'Optimise Speed of Filling-NaN Function'
https://www.kaggle.com/gogo827jz/optimise-speed-of-filling-nan-function
"""

def for_loop(method, matrix, values):
    for i in range(matrix.shape[0]):
        matrix[i] = method(matrix[i], values)
    return matrix

def for_loop_ffill(method, matrix):
    tmp = np.zeros(matrix.shape[1],dtype=np.float32)
    for i in range(matrix.shape[0]):
        matrix[i] = method(matrix[i], tmp)
        tmp = matrix[i]
    return matrix

@njit
def fillna_npwhere_njit(array, values):
    if np.isnan(array.sum()):
        array = np.where(np.isnan(array), values, array)
    return array

In [9]:
# converting numpy for efficient calcualtion.
# ft 1~129
np_ft_train = df_train.loc[:,features[1:]].values
np_ft_train.shape

# ft 0
# np_train_ft0 = df_train.loc[:,features[0]].values

(1571415, 129)

In [10]:
f_mean = np.nanmean(np_ft_train,axis=0)

In [12]:
np_train = df_train.values

In [13]:
print('fillna_npwhere_njit (mean-filling):')
np_train[:,8:-1] = for_loop(fillna_npwhere_njit, np_train[:,8:-1], f_mean)

fillna_npwhere_njit (mean-filling):


Embedding-NN

https://www.kaggle.com/sapthrishi007/pytorch-embeddingsnn-resnet-tensorflow


In [15]:
N_FEAT_TAGS = 29
device = "cuda:0" if torch.cuda.is_availables() else "cpu"
N_FEATURES = [130]
hidden_layers = [400,400,400]

class FFN(nn.Module):
    def __init__(self, num_features, num_classes, hidden_layers, dropout, f_act=nn.SiLU(), is_op_act =False):
        super(FFN,self).__init__()
        
        self.f_act = f_act
        self.dropout = nn.Dropout(dropout)
        self.bn_d0 = nn.BatchNorm1d(num_features)
        self.dense1 = nn.Linear(num_features, hidden_layers[0])
        self.bn_d1 = nn.BatchNorm1d(hidden_layers[0])
        self.dense2 = nn.Linear(hidden_layers[0], hidden_layers[1])
        self.bn_d2 = nn.BatchNorm1d(hidden_layers[1])
        self.dense3 = nn.Linear(hidden_layers[1], hidden_layers[2])
        self.bn_d3 = nn.BatchNorm1d(hidden_layers[2])
        self.dense4 = None
        
        if num_classes > 0:
            self.dense4 = nn.Linear(hidden_layers[2], num_classes)
            
        self.out_activ = None
        
        if is_op_act:
            if num_classes == 1 or num_classes == 2
                self.out_active = nn.Sigmoid():
            elif num_classes > 2:
                self.out_active = nn.Softmax(dim=-1):
        
    def forward(self, x):
        
        x = self.bn_d0(x)
        x = self.dropout(self.f_act(self.bn_d1(self.dense1(x))))
        x = self.dropout(self.f_act(self.bn_d2(self.dense2(x))))
        x = self.dropout(self.f_act(self.bn_d3(self.dense3(x))))
        if self.dense4:
            x = self.dense4(x)
        if self.out_active:
            x = self.out_active(x)
            
        return x
    
class Emb_NN_Model(nn.Module):
    def __init__(self, hidden_layers, embed_dim, df_features):
        super(Emb_NN_Model,self).__init__()
        
        N_FEAT_TAGS = 29
        
        
        

Unnamed: 0,feature,tag_0,tag_1,tag_2,tag_3,tag_4,tag_5,tag_6,tag_7,tag_8,...,tag_19,tag_20,tag_21,tag_22,tag_23,tag_24,tag_25,tag_26,tag_27,tag_28
0,feature_0,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,feature_1,False,False,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
2,feature_2,False,False,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
3,feature_3,False,False,False,False,False,False,True,False,True,...,False,False,False,False,False,False,False,False,False,False
4,feature_4,False,False,False,False,False,False,True,False,True,...,False,False,False,False,False,False,False,False,False,False


In [27]:
df_features["tag_29"] = np.array([1]+[0] * (df_features.shape[0]-1))

In [29]:
df_features.loc[:,df_features.columns[1:]].astype("int8")

ValueError: invalid literal for int() with base 10: 'feature_0'