In [1]:
# Import and setup
import numpy as np
import pandas as pd
pd.options.display.max_columns = None
import matplotlib.pyplot as plt
import gc

In [2]:
# Path of the plain-text progress log. It is opened in append mode, so text
# already in the file is kept and a start marker is added at the end.
filename_output = 'tune_nn.output'
with open(filename_output, mode='a') as log_file:
    log_file.write('%s starts\n' % 'PUBG_NN_Tune')

In [3]:
# For debug: read the already-processed training tables straight from disk.
df_train = pd.read_csv('df_train.csv')
df_train_meta = pd.read_csv('df_train_meta.csv')

# Only the 'weight_train' column is kept, as a NumPy array. The weight frame
# itself is never bound to a name, and df_train_weight ends up as None.
weight_train = pd.read_csv('df_train_weight.csv')['weight_train'].values
df_train_weight = None

In [4]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    * ``object`` columns are converted to ``category``.
    * Integer columns (signed or unsigned) are cast to the smallest signed
      integer type that is narrower than the current one and can hold the
      column's min and max (type limits are inclusive).
    * Float columns are cast to the smallest narrower float type whose range
      covers the column's min and max.
    * Every other dtype (bool, datetime, category, extension dtypes, ...) is
      left untouched, so the function can safely be applied to its own output.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default True
        Whether ``float16`` may be chosen for float columns. Half precision
        keeps only about three significant decimal digits; pass ``False`` to
        stop at ``float32``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    bytes_per_mb = 1024 ** 2

    # memory_usage() reports bytes; convert so the printed unit is really MB.
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / float columns are downcast. Anything else
        # has no meaningful min/max-based target type here.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        if col_type.kind == 'f':
            candidates, limits_of = float_candidates, np.finfo
        else:
            candidates, limits_of = int_candidates, np.iinfo

        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            # Candidates are ordered by width: once one is no narrower than
            # the current dtype there is nothing left to gain.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the type's min/max still fits.
            # NaN min/max (empty or all-NaN column) fails both comparisons, so
            # such a column keeps its dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

# df_train = reduce_mem_usage(df_train)

In [5]:
# Get X and y
y_train = df_train['winPlacePerc'].values
X_train = df_train.drop(columns='winPlacePerc').values

# Keep only the predictor names so that feature_name[i] labels column i of
# X_train; the target column is excluded because it is not part of X_train.
feature_name = df_train.columns.drop('winPlacePerc')
assert len(feature_name) == X_train.shape[1]

# Drop the reference to the frame so its memory can be reclaimed. Note that
# this cell cannot be re-run without reloading df_train first.
df_train = None

print(X_train.shape)
gc.collect()

(2026744, 548)


7

In [6]:
# Standardize
from sklearn.preprocessing import StandardScaler

# Fit the scaler on X_train and replace X_train with its scaled version in a
# single call; the fitted object stays available as `scaler`.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [7]:
# Define method to build model
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import optimizers
from keras.callbacks import EarlyStopping, LearningRateScheduler
from keras.regularizers import l2

def build_model(input_dim, hidden_layers, reg_strength, dropout_rate, lr):
    """Assemble and compile a fully connected network with one linear output.

    Parameters
    ----------
    input_dim : int
        Passed as ``input_dim`` to the first hidden ``Dense`` layer.
    hidden_layers : sequence of int
        Unit count of each hidden ``Dense`` layer, in order; all use ReLU.
    reg_strength : float
        L2 factor for the hidden layers' kernels; values below ``1e-6``
        disable the regularizer.
    dropout_rate : float
        Rate of the ``Dropout`` layer added after every hidden layer; values
        not above ``1e-6`` disable dropout.
    lr : float
        First positional argument of the Adam optimizer.

    Returns
    -------
    The compiled ``Sequential`` model (loss ``'mse'``, weighted metric
    ``'mae'``).
    """
    net = Sequential()

    for position, width in enumerate(hidden_layers):
        dense_kwargs = {
            'activation': 'relu',
            'kernel_regularizer': None if reg_strength < 1e-6 else l2(reg_strength),
        }
        # Only the first layer is given the input dimension.
        if position == 0:
            dense_kwargs['input_dim'] = input_dim
        net.add(Dense(width, **dense_kwargs))

        if dropout_rate > 1e-6:
            net.add(Dropout(dropout_rate))

    # Single linear unit for the scalar prediction.
    net.add(Dense(1, activation='linear'))
    net.compile(optimizers.Adam(lr), loss='mse', weighted_metrics=['mae'])

    return net

Using TensorFlow backend.


In [8]:
# Define method to fit model with callbacks
def fit_model(model, X, y, weight, epochs=100, early_stop_patience=None, lr=None,
              decay_factor=None, step_size=None, validation_data=None, batch_size=20000):
    """Fit ``model`` with optional early stopping and step-wise learning-rate decay.

    Parameters
    ----------
    model : object exposing a Keras-style ``fit`` method
    X, y, weight : training inputs, targets and per-sample weights
    epochs : int
        Maximum number of epochs.
    early_stop_patience : int or None
        When given, an ``EarlyStopping`` callback with this patience is added.
        It monitors the validation weighted MAE if ``validation_data`` is
        supplied, otherwise the training weighted MAE.
    lr : float or None
        Base learning rate of the decay schedule. Required whenever both
        ``decay_factor`` and ``step_size`` are given.
    decay_factor, step_size : float / int or None
        When both are given, the learning rate at epoch ``e`` is
        ``lr * decay_factor ** (e // step_size)``.
    validation_data : tuple or None
        Forwarded unchanged to ``model.fit``.
    batch_size : int
        Forwarded to ``model.fit``.

    Returns
    -------
    Whatever ``model.fit`` returns.

    Raises
    ------
    ValueError
        If a decay schedule is requested without ``lr``.
    """
    callbacks = []

    if early_stop_patience is not None:
        # A 'val_*' metric only exists when validation data is passed to fit;
        # without it, fall back to the training metric so that the callback
        # has something to monitor.
        # NOTE(review): metric names depend on the Keras version - confirm
        # they match the keys of History.history in this environment.
        if validation_data is not None:
            monitor = 'val_weighted_mean_absolute_error'
        else:
            monitor = 'weighted_mean_absolute_error'
        callbacks.append(EarlyStopping(monitor=monitor, min_delta=1e-5, mode='min',
                                       patience=early_stop_patience, restore_best_weights=True))

    if decay_factor is not None and step_size is not None:
        if lr is None:
            # Fail early with a clear message instead of a TypeError raised
            # from inside the scheduler during training.
            raise ValueError('lr must be provided when decay_factor and step_size are set')
        callbacks.append(LearningRateScheduler(
            lambda epoch, curr_lr: float(lr * decay_factor ** (epoch // step_size)),
            verbose=0))

    return model.fit(X, y, sample_weight=weight, callbacks=callbacks,
                     validation_data=validation_data,
                     batch_size=batch_size, epochs=epochs, verbose=0)

In [9]:
# Define method to search parameters by holdout
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_absolute_error

def search_nn_params(X, y, weight, l_hidden_layers, l_reg_strength,
                     l_dropout_rate, l_lr, epochs, holdout_itr=1, holdout_ratio=0.2,
                     decay_factor=0.7, step_size=10, random_state=None):
    """Grid-search network hyper-parameters using repeated hold-out validation.

    Every combination of ``l_hidden_layers`` x ``l_reg_strength`` x
    ``l_dropout_rate`` x ``l_lr`` is trained on the same train/validation
    splits and scored by the sample-weighted MAE on the validation part,
    averaged over the splits. One result line per combination is appended to
    ``filename_output`` and printed.

    Parameters
    ----------
    X, y, weight : numpy arrays indexed by sample along the first axis
    l_hidden_layers, l_reg_strength, l_dropout_rate, l_lr : iterables
        Candidate values for the corresponding ``build_model`` arguments.
    epochs : int
        Number of epochs for each fit.
    holdout_itr : int
        Number of random train/validation splits.
    holdout_ratio : float
        Fraction of the samples held out for validation in each split.
    decay_factor, step_size :
        Learning-rate decay settings forwarded to ``fit_model``.
    random_state : int or None
        Seed for the splits; set it to make the search reproducible.

    Returns
    -------
    tuple
        ``(best_hidden_layers, best_reg_strength, best_dropout_rate, best_lr,
        best_mae_valid)``; all ``None`` if the grid is empty.
    """
    from itertools import product
    from keras import backend as K

    # Draw the splits once so that every configuration is evaluated on the
    # same validation samples and their scores are directly comparable.
    splitter = ShuffleSplit(n_splits=holdout_itr, test_size=holdout_ratio,
                            random_state=random_state)
    splits = list(splitter.split(X))

    # One format for both the log file and the console, so the two agree.
    report_fmt = 'hidden_layers = %s, reg_strength = %.4f, dropout_rate = %.3f, lr = %.4f, MAE = %.4f'

    best_mae_valid = None
    best_hidden_layers = None
    best_reg_strength = None
    best_dropout_rate = None
    best_lr = None

    grid = product(l_hidden_layers, l_reg_strength, l_dropout_rate, l_lr)
    for hidden_layers, reg_strength, dropout_rate, lr in grid:
        maes = []
        for idx_fit, idx_valid in splits:
            model = build_model(X.shape[1], hidden_layers, reg_strength, dropout_rate, lr)
            fit_model(model, X[idx_fit], y[idx_fit], weight[idx_fit], epochs=epochs,
                      early_stop_patience=None, lr=lr,
                      decay_factor=decay_factor, step_size=step_size)

            # Flatten the model output so it has the same shape as y.
            pred_valid = np.asarray(model.predict(X[idx_valid])).ravel()
            maes.append(mean_absolute_error(y[idx_valid], pred_valid,
                                            sample_weight=weight[idx_valid]))

            # Release the finished model and the backend state it created;
            # otherwise memory grows with every grid point.
            del model
            K.clear_session()
            gc.collect()

        mae = float(np.mean(maes))
        report = report_fmt % (hidden_layers, reg_strength, dropout_rate, lr, mae)

        # Re-open in append mode each time so results survive an interruption.
        with open(filename_output, 'a') as f:
            f.write(report + '\n')
        print(report)

        if best_mae_valid is None or mae < best_mae_valid:
            best_mae_valid = mae
            best_hidden_layers = hidden_layers
            best_reg_strength = reg_strength
            best_dropout_rate = dropout_rate
            best_lr = lr

    return best_hidden_layers, best_reg_strength, best_dropout_rate, best_lr, best_mae_valid

In [None]:
# Search params: candidate values for each dimension of the grid
l_hidden_layers = [
    [512, 256, 128, 64, 32],
    [512, 256, 128, 64],
    [256, 128, 64, 32],
    [512, 256, 128],
]
l_reg_strength = [0, 0.0005, 0.001]
l_dropout_rate = [0, 0.05]
l_lr = [0.001]

# params is (hidden_layers, reg_strength, dropout_rate, lr, MAE) of the best run
params = search_nn_params(
    X=X_train,
    y=y_train,
    weight=weight_train,
    l_hidden_layers=l_hidden_layers,
    l_reg_strength=l_reg_strength,
    l_dropout_rate=l_dropout_rate,
    l_lr=l_lr,
    epochs=20,
    holdout_itr=1,
    holdout_ratio=0.2,
)

hidden_layers = [512, 256, 128, 64, 32], reg_strength = 0.000, dropout_rate = 0.000, lr = 0.0010, MAE = 0.0305


In [None]:
# Append the first five entries of `params` to the log as the best configuration.
with open(filename_output, mode='a') as log_file:
    log_file.write(
        'best parameters: hidden_layers = %s, reg_strength = %.4f, dropout_rate = %.3f, lr = %.4f, MAE = %.4f\n'
        % tuple(params[:5]))