# NN Model

In [1]:
import glob, re
import numpy as np
import pandas as pd
from datetime import datetime

from keras.layers import Embedding, Input, Dense
from keras.models import Model
import keras

import keras.backend as K

Using TensorFlow backend.


In [2]:
# Pre-built network inputs: the cells below index these archives with
# 'X_train' / 'Y_train' (training) and 'X_test' (prediction).
nn_train, nn_test = (
    np.load(npz_path)
    for npz_path in ('processed_input/train_features_nn.npz',
                     'processed_input/test_features_nn.npz')
)

# Attribute tables: `train` supplies the category counts used to size the
# embedding layers, `test` receives the predicted 'visitors' column.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('processed_input/train_attr_nn.csv.gz',
                     'processed_input/test_attr_nn.csv.gz')
)

The following function implements the Keras neural network model.

Basic structure:
- categorical columns get independent inputs, passed through embedding layer and then flattened.
- numeric columns are simply taken as float32 inputs
- the final tensors of categorical and numerical are then concatenated together
- following the concatenated layer, a simple feed-forward neural network is implemented.
- output layer has 'ReLU' activation function

## Embedding for Categorical Features

*Key sentence:* Embeddings **capture richer relationships and complexities than the raw categories.**

*A key technique to making the most of deep learning for tabular data is to use embeddings for your categorical variables. This approach allows for relationships between categories to be captured. Perhaps Saturday and Sunday have similar behavior, and maybe Friday behaves like an average of a weekend and a weekday. Similarly, for zip codes, there may be patterns for zip codes that are geographically near each other, and for zip codes that are of similar socio-economic status.*

## Concatenation of Layers

We can concatenate different sub-networks. In this example, we concatenate embedding layers.

## Leaky ReLU (Activation Function)

ReLU             |  Leaky ReLU
:-------------------------:|:-------------------------:
 ![](appendix/relu.png) | ![](appendix/leaky_relu.png) 
 *Source: https://sefiks.com/wp-content/uploads/2017/08/relu.png* | *Source: https://sefiks.com/wp-content/uploads/2018/02/leaky-relu.png*

## Batch Normalization

*We normalize the input layer by adjusting and scaling the activations. For example, when we have features from 0 to 1 and some from 1 to 1000, we should normalize them to speed up learning. If the input layer is benefiting from it, why not do the same thing also for the values in the hidden layers, that are changing all the time, and get 10 times or more improvement in the training speed.*

## RMSProp

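*Gradient descent is one of the most popular algorithms to perform optimization and by far the most common way to optimize neural networks.* RMSProp is a gradient-descent variant that scales each parameter's step by a running average of its recent squared gradients; it is the optimizer this model is compiled with.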

![](appendix/rmsprop.gif)

*Source: http://ruder.io/content/images/2016/09/saddle_point_evaluation_optimizers.gif*

## Our Model

![](appendix/nn_model.png)

In [3]:
def _embedded_input(name, vocab_size, emb_dim):
    """Build one categorical branch: int32 input -> Embedding -> Flatten.

    The three layers are named ``<name>``, ``<name>_emb`` and
    ``<name>_emb_flatten``.

    Returns:
        (input_tensor, flattened_embedding_tensor)
    """
    inp = Input(shape=(1,), dtype='int32', name=name)
    emb = Embedding(vocab_size, emb_dim, input_shape=(1,), name=name + '_emb')(inp)
    emb = keras.layers.Flatten(name=name + '_emb_flatten')(emb)
    return inp, emb


def _float_input(name):
    """Build one scalar float32 input named ``name``."""
    return Input(shape=(1,), dtype='float32', name=name)


def get_nn_complete_model(train, hidden1_neurons=35, hidden2_neurons=15):
    """Build and compile the embedding + feed-forward regression network.

    Args:
        train: DataFrame used only to size the embedding tables. It must
            contain the columns 'air_store_id2', 'air_area_name',
            'air_genre_name', 'air_area_name0'..'air_area_name6' and
            'air_genre_name0'..'air_genre_name4'.
        hidden1_neurons: width of the first hidden layer ('hidden1').
        hidden2_neurons: width of the second hidden layer ('hidden2').

    Returns:
        A compiled ``keras.models.Model`` (loss 'mse', optimizer 'rmsprop').
        Its inputs are ordered as: the 21 numeric features starting with
        'holiday_flg', then dow, year, month, air_store_id, air_area_code,
        air_genre_code, then the 7 area-name levels and the 5 genre-name
        levels.
    """
    # Reset Keras global state so repeated calls start from a clean slate.
    K.clear_session()

    def n_levels(column):
        # Number of distinct values observed in the training frame.
        return len(train[column].unique())

    # --- Categorical embeddings -------------------------------------------
    # NOTE(review): only the store embedding reserves one extra row; the
    # others assume codes lie in [0, n_levels) - confirm against the
    # preprocessing step.
    air_store_id, air_store_id_emb = _embedded_input(
        'air_store_id', n_levels('air_store_id2') + 1, 15)
    dow, dow_emb = _embedded_input('dow', 8, 3)
    month, month_emb = _embedded_input('month', 13, 3)

    # One small embedding per level of the split area name (7 levels) ...
    air_area_name, air_area_name_emb = [], []
    for i in range(7):
        col = 'air_area_name' + str(i)
        inp, emb = _embedded_input(col, n_levels(col), 3)
        air_area_name.append(inp)
        air_area_name_emb.append(emb)

    # ... and per level of the split genre name (5 levels).
    air_genre_name, air_genre_name_emb = [], []
    for i in range(5):
        col = 'air_genre_name' + str(i)
        inp, emb = _embedded_input(col, n_levels(col), 3)
        air_genre_name.append(inp)
        air_genre_name_emb.append(emb)

    # Squash each group of level embeddings into a 4-dimensional summary.
    air_genre_name_emb = keras.layers.concatenate(air_genre_name_emb)
    air_genre_name_emb = Dense(4, activation='sigmoid', name='final_air_genre_emb')(air_genre_name_emb)

    air_area_name_emb = keras.layers.concatenate(air_area_name_emb)
    air_area_name_emb = Dense(4, activation='sigmoid', name='final_air_area_emb')(air_area_name_emb)

    # Whole-string area / genre codes get their own, wider embeddings.
    air_area_code, air_area_code_emb = _embedded_input(
        'air_area_code', n_levels('air_area_name'), 8)
    air_genre_code, air_genre_code_emb = _embedded_input(
        'air_genre_code', n_levels('air_genre_name'), 5)

    # --- Float attributes -------------------------------------------------
    holiday_flg = _float_input('holiday_flg')
    year = _float_input('year')  # only feeds the date sub-network below
    numeric_inputs = [holiday_flg] + [
        _float_input(name) for name in (
            'min_visitors', 'mean_visitors', 'median_visitors', 'max_visitors',
            'count_observations',
            'rs1_x', 'rv1_x', 'rs2_x', 'rv2_x',
            'rs1_y', 'rv1_y', 'rs2_y', 'rv2_y',
            'total_reserv_sum', 'total_reserv_mean', 'total_reserv_dt_diff_mean',
            'date_int', 'var_max_lat', 'var_max_long', 'lon_plus_lat',
        )
    ]

    # --- Date sub-network -------------------------------------------------
    date_emb = keras.layers.concatenate([dow_emb, month_emb, year, holiday_flg])
    date_emb = Dense(5, activation='sigmoid', name='date_merged_emb')(date_emb)

    # --- Concatenate all branches ----------------------------------------
    cat_layer = keras.layers.concatenate(numeric_inputs + [
        date_emb, air_area_name_emb, air_genre_name_emb,
        air_area_code_emb, air_genre_code_emb, air_store_id_emb,
    ])

    # --- Feed-forward head ------------------------------------------------
    m = Dense(hidden1_neurons, name='hidden1',
              kernel_initializer=keras.initializers.RandomNormal(
                  mean=0.0, stddev=0.05, seed=None))(cat_layer)
    m = keras.layers.LeakyReLU(alpha=0.2)(m)
    m = keras.layers.BatchNormalization()(m)

    m1 = Dense(hidden2_neurons, name='hidden2')(m)
    m1 = keras.layers.LeakyReLU(alpha=0.2)(m1)
    # ReLU keeps the single regression output non-negative.
    m = Dense(1, activation='relu')(m1)

    # Input order must match the order of arrays passed to fit/predict.
    inp_ten = numeric_inputs + [
        dow, year, month, air_store_id, air_area_code, air_genre_code,
    ]
    inp_ten += air_area_name
    inp_ten += air_genre_name

    model = Model(inp_ten, m)
    # Classification accuracy is meaningless for a continuous target, so
    # report mean absolute error next to the MSE loss instead.
    model.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])

    return model

In [4]:
model = get_nn_complete_model(train, hidden1_neurons=45)

# An .npz archive is re-read on every key access, so pull the training
# arrays out once and reuse them for all fit() calls.
X_train = nn_train['X_train'].tolist()
Y_train = nn_train['Y_train']
fit_kwargs = dict(batch_size=512, shuffle=True)

# Five rounds of: 8 silent epochs on all rows, then 3 logged epochs with the
# last 15% of rows held out for monitoring.
# NOTE(review): the held-out rows are also trained on in the silent epochs,
# so the reported validation loss is optimistic.
for _ in range(5):
    model.fit(X_train, Y_train, epochs=8, verbose=0, **fit_kwargs)
    model.fit(X_train, Y_train, epochs=3, verbose=1, validation_split=0.15, **fit_kwargs)

# Final pass over the full training set.
model.fit(X_train, Y_train, epochs=4, verbose=0, **fit_kwargs)
print("Model trained")

# Raw network outputs, bounded to [0, 6.8] before being mapped through expm1.
preds = np.clip(model.predict(nn_test['X_test'].tolist()).reshape(-1), 0, 6.8)

# preds >= 0 after clipping, so expm1(preds) is already non-negative.
test['visitors'] = np.expm1(preds)
sub1 = test[['id', 'visitors']].copy()
sub1['preds'] = preds  # keep the pre-expm1 prediction alongside
print("Model predictions done.")

Train on 212897 samples, validate on 37571 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 212897 samples, validate on 37571 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 212897 samples, validate on 37571 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 212897 samples, validate on 37571 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train on 212897 samples, validate on 37571 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Model trained
Model predictions done.


In [5]:
# from hklee
# https://www.kaggle.com/zeemeen/weighted-mean-comparisons-lb-0-497-1st/code
#
# Weighted-mean baseline: for each (store, day of week, holiday flag) take a
# mean of log1p(visitors) weighted towards recent dates, then look that value
# up for every row of the test set. The result is `sub2`.

# Load every raw competition table, keyed by its file name without extension.
dfs = {re.search(r'/([^/\.]*)\.csv.gz', fn).group(1): pd.read_csv(fn)
       for fn in glob.glob('../input/*.csv.gz')}

# Expose each raw table as a notebook-level variable named after its file.
globals().update(dfs)

# The processed versions replace any raw tables with the same names.
date_info = pd.read_csv('processed_input/date_info_nn.csv.gz')
air_visit_data = pd.read_csv('processed_input/train_nn.csv.gz')
sample_submission = pd.read_csv('processed_input/test_nn.csv.gz')

# A holiday that falls on a weekend is treated as an ordinary weekend day.
wkend_holidays = (date_info.day_of_week.isin(['Saturday', 'Sunday'])
                  & (date_info.holiday_flg == 1))
date_info.loc[wkend_holidays, 'holiday_flg'] = 0
# Later rows of date_info get steeply larger weights (5th power of rank).
date_info['weight'] = ((date_info.index + 1) / len(date_info)) ** 5

visit_data = (air_visit_data
              .merge(date_info, on='visit_date', how='left')
              .drop('visit_date', axis=1))
# pd.np was removed from pandas; use numpy directly.
visit_data['visitors'] = np.log1p(visit_data.visitors)


def wmean(group):
    """Weight-averaged log-visitors of one group."""
    return (group.weight * group.visitors).sum() / group.weight.sum()


visitors = (visit_data
            .groupby(['air_store_id', 'day_of_week', 'holiday_flg'])
            .apply(wmean)
            .reset_index()
            .rename(columns={0: 'visitors'}))

# Recover the store id (everything before the last '_') from the row id.
sample_submission['air_store_id'] = sample_submission.id.map(lambda x: '_'.join(x.split('_')[:-1]))
sample_submission['calendar_date'] = sample_submission.id.map(lambda x: x.split('_')[2])
sample_submission = sample_submission.drop('visitors', axis=1)
# NOTE(review): assumes the processed test file already has 'visit_date'.
sample_submission = sample_submission.merge(date_info, on='visit_date', how='left')
sample_submission = sample_submission.merge(
    visitors, on=['air_store_id', 'day_of_week', 'holiday_flg'], how='left')

# Fallback: where no (store, dow, holiday) match exists, use the store's
# non-holiday value for the same day of week.
missings = sample_submission.visitors.isnull()
sample_submission.loc[missings, 'visitors'] = sample_submission[missings].merge(
    visitors[visitors.holiday_flg == 0], on=['air_store_id', 'day_of_week'],
    how='left')['visitors_y'].values

# Back from log space to visitor counts.
sample_submission['visitors'] = np.expm1(sample_submission.visitors)
sub2 = sample_submission[['id', 'visitors']].copy()
sub2 = sub2.fillna(-1) # for the unfound values

## Preparing Submission & Submitting with Kaggle API

In [6]:
def final_visitors(x, alt=False):
    """Blend the two predictions held in one merged row.

    Args:
        x: mapping/row with 'visitors_x' and 'visitors_y' entries.
        alt: accepted for call compatibility; not used.

    Returns:
        'visitors_x' unchanged when 'visitors_y' is the -1 placeholder,
        otherwise 0.7 * visitors_x + 0.3 * visitors_y * 1.3.
    """
    primary = x['visitors_x']
    secondary = x['visitors_y']
    if secondary != -1:
        # Fixed 70/30 blend with a 1.3 uplift on the second prediction.
        return 0.7 * primary + 0.3 * secondary * 1.3
    # Placeholder present: nothing to blend with.
    return primary

# Pair the two prediction tables by id: columns from sub1 get the '_x'
# suffix and columns from sub2 the '_y' suffix where names collide.
sub_merge = sub1.merge(sub2, how='inner', on='id')
sub_merge['visitors'] = sub_merge.apply(final_visitors, axis=1)
print("Done")

# Only the id and the blended prediction go into the submission file.
submission_columns = ['id', 'visitors']
sub_merge[submission_columns].to_csv(
    'submissions/nn_submission.csv.gz',
    compression='gzip',
    index=False,
)

Done


In [7]:
# !kaggle competitions submit -c recruit-restaurant-visitor-forecasting -f submissions/nn_submission.csv.gz -m "Final NN" 

## Saving Model Weights

In [8]:
# Persist the trained weights to 'nn_model_weights.h5' in the working directory.
# NOTE(review): this stores weights only; presumably the network must be
# rebuilt with get_nn_complete_model (same arguments) before loading them.
model.save_weights('nn_model_weights.h5')

## References
- https://www.fast.ai/2018/04/29/categorical-embeddings/
- https://towardsdatascience.com/activation-functions-neural-networks-1cbd9f8d91d6
- https://towardsdatascience.com/batch-normalization-in-neural-networks-1ac91516821c
- http://ruder.io/optimizing-gradient-descent/