In [9]:
# Import and setup
import numpy as np
import pandas as pd
pd.options.display.max_columns = None
import matplotlib.pyplot as plt
import gc
%matplotlib inline

In [10]:
# For debug: load processed data from saved file directly
df_train = pd.read_csv('df_train.csv')
df_train_meta = pd.read_csv('df_train_meta.csv')
df_train_weight = pd.read_csv('df_train_weight.csv')
weight_train = df_train_weight['weight_train'].values
df_train_weight = None

In [11]:
# Get X and y
y_train = df_train['winPlacePerc'].values
X_train = df_train.drop(columns='winPlacePerc').values

feature_name = df_train.columns
df_train = None

print(X_train.shape)
gc.collect()

(2026744, 548)


25

In [12]:
# Standardize
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

In [19]:
# Load models
from joblib import load
from sklearn.linear_model import LinearRegression
import lightgbm as lgb
from keras.models import load_model

lr = load('LR.joblib')
lg = lgb.Booster(model_file='LightGBM_Model.txt')
nn = load_model('NN_Model.h5')

models = [lr, lg, nn]

In [16]:
# define method to build new dataset for stacking-level model
def build_stacking_dataset(models, X, df_meta, is_train=True):
    # Load and preprocess original training set
    df = None
    if is_train:
        df = pd.read_csv('train_V2.csv')
    else:
        df = pd.read_csv('test_V2.csv')
    df = df.dropna()
    # Fill out None values
    mean_rankpoints = 1494.34089
    mean_killpoints = 1253.6821744
    mean_winpoints = 1505.542888
    df.loc[df['rankPoints'] < 1e-4, 'rankPoints'] = mean_rankpoints
    df.loc[df['killPoints'] < 1e-4, 'killPoints'] = mean_killpoints
    df.loc[df['winPoints'] < 1e-4, 'winPoints'] = mean_winpoints
    
    # Add predictions to original training set
    for idx, model in enumerate(models):
        df_intermediate = df_meta.copy()
        df_intermediate['pred_' + str(idx)] = model.predict(X)
        df = df.merge(df_intermediate, on=['matchId', 'groupId'], how='left')
    
    if is_train:
        y = df['winPlacePerc'].values
        df = df.drop(columns=['Id', 'groupId', 'matchId', 'matchType', 'winPlacePerc'])
        return df.values, df.columns, y
    else:
        ids = df['Id']
        df = df.drop(columns=['Id', 'groupId', 'matchId', 'matchType'])
        return df.values, df.columns, ids

In [17]:
# Build stacking dataset
X_train, feature_name, y_train = build_stacking_dataset(models, X_train, df_train_meta, is_train=True)

In [20]:
# Prepare validation set
ratio_valid = 0.05

idx_shuffle = np.arange(X_train.shape[0])
np.random.shuffle(idx_shuffle)
idx_split = int(X_train.shape[0] * ratio_valid)
idx_valid = idx_shuffle[:idx_split]
idx_train = idx_shuffle[idx_split:]

X_valid = X_train[idx_valid]
y_valid = y_train[idx_valid]
lgb_data_valid = lgb.Dataset(X_valid, label=y_valid, free_raw_data=True)

X_train = X_train[idx_train]
y_train = y_train[idx_train]
lgb_data_train = lgb.Dataset(X_train, label=y_train, free_raw_data=True)

gc.collect()

10671

In [21]:
# Train LightGBM on stacking dataset

# Define model parameters
lgb_params = {"objective" : "regression", "metric" : "mae", 'n_estimators':1000, 'early_stopping_rounds':50,
              "num_leaves" : 31, "learning_rate" : 0.1, "bagging_fraction" : 0.7,
               "bagging_seed" : 0, "num_threads" : 4,"colsample_bytree" : 0.7
             }

# Train model
model = lgb.train(lgb_params, lgb_data_train,
                  valid_sets=[lgb_data_train, lgb_data_valid],
                  verbose_eval=200)



Training until validation scores don't improve for 50 rounds.
[200]	training's l1: 0.016868	valid_1's l1: 0.0169399
[400]	training's l1: 0.0166869	valid_1's l1: 0.0167757
[600]	training's l1: 0.0165951	valid_1's l1: 0.0167043
[800]	training's l1: 0.0165285	valid_1's l1: 0.0166591
[1000]	training's l1: 0.0164775	valid_1's l1: 0.0166294
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 0.0164775	valid_1's l1: 0.0166294


In [22]:
# Save model
model.save_model('Stacking_Model.txt')

<lightgbm.basic.Booster at 0x7f72d8263e48>