In [1]:
# Import and setup
import numpy as np
import pandas as pd
pd.options.display.max_columns = None
import matplotlib.pyplot as plt
import gc
%matplotlib inline

In [2]:
# Plain-text log shared by every tuning cell; opened in append mode so that
# results from earlier runs are kept.
filename_output = 'tune_lg.output'
with open(filename_output, 'a') as log_file:
    # Mark the beginning of this tuning session in the log.
    log_file.write('%s starts\n' % ('PUBG_LightGBM_Tune'))

In [3]:
# Debug shortcut: read the already-processed training tables straight from
# the CSV files written by an earlier run instead of rebuilding them.
df_train = pd.read_csv('df_train.csv')
df_train_meta = pd.read_csv('df_train_meta.csv')

# Only the per-row weights are needed as an array, so the weight frame is
# read, reduced to its 'weight_train' column and not kept around.
weight_train = pd.read_csv('df_train_weight.csv')['weight_train'].values
df_train_weight = None

In [4]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory footprint.

    Each column is handled according to its dtype:

    * ``object`` columns are converted to ``category``.
    * Signed integer, unsigned integer and float columns are cast to the
      smallest NumPy dtype of the same family whose range contains the
      column's minimum and maximum (bounds are inclusive). Float columns
      whose range cannot be determined (e.g. all-NaN, or containing
      infinities) are cast to ``float64``.
    * Every other dtype (bool, datetime, category, pandas extension dtypes)
      is left untouched.

    Note that the float selection looks only at the value *range*; a cast to
    ``float16``/``float32`` can therefore lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    bytes_per_mb = 1024 ** 2

    # memory_usage() reports bytes, so convert before labelling the value "MB".
    # deep=True also counts the Python strings held by object columns.
    start_mem = df.memory_usage(deep=True).sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # dtype kind -> (range-info function, candidate dtypes from small to large)
    candidates_by_kind = {
        'i': (np.iinfo, (np.int8, np.int16, np.int32, np.int64)),
        'u': (np.iinfo, (np.uint8, np.uint16, np.uint32, np.uint64)),
        'f': (np.finfo, (np.float16, np.float32, np.float64)),
    }

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy numeric dtypes are downcast; anything else
        # (bool, datetime, category, extension dtypes) is kept as is.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in candidates_by_kind:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        range_info, candidates = candidates_by_kind[kind]

        for candidate in candidates:
            limits = range_info(candidate)
            # Inclusive bounds: a value equal to the dtype limit still fits.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break
        else:
            # No candidate matched (NaN/inf bounds). Floats fall back to the
            # widest float type; integers keep their current dtype.
            if kind == 'f':
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage(deep=True).sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

# df_train = reduce_mem_usage(df_train)

In [5]:
# Get X and y: the target is 'winPlacePerc', every other column is a feature.
y_train = df_train['winPlacePerc'].values
X_train = df_train.drop(columns='winPlacePerc').values

# Feature names must line up one-to-one with the columns of X_train, so the
# target column is excluded here as well.
feature_name = df_train.columns.drop('winPlacePerc')

# Drop the reference to the frame so its memory can be reclaimed.
df_train = None

print(X_train.shape)
gc.collect()

(2026744, 548)


11

In [6]:
# Define method to search parameters by holdout
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb

def search_lg_params(X, y, weight, params, l_num_leaves, l_lr, l_bagging_fraction,
                     holdout_itr=1, holdout_ratio=0.2, random_state=None):
    """Grid-search LightGBM hyper-parameters using repeated holdout validation.

    Every combination of ``num_leaves``, ``learning_rate`` and
    ``bagging_fraction`` is trained with ``lgb.train`` on the training part of
    each holdout split and scored by weighted mean absolute error on the
    validation part. The mean MAE of each combination is appended to the file
    named by the module-level ``filename_output``.

    The holdout splits are drawn once and reused for every combination, so
    all candidates are compared on exactly the same data.

    Note: according to the LightGBM parameter documentation,
    ``bagging_fraction`` only takes effect when ``bagging_freq`` is non-zero;
    the caller is expected to set it in ``params``.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, indexed by row with integer index arrays.
    y : numpy.ndarray
        Target values, one per row of ``X``.
    weight : numpy.ndarray
        Sample weights, one per row of ``X``; used both for training and for
        the validation MAE.
    params : dict
        Base LightGBM parameters. The dict is not modified.
    l_num_leaves, l_lr, l_bagging_fraction : iterable
        Candidate values for ``num_leaves``, ``learning_rate`` and
        ``bagging_fraction``.
    holdout_itr : int, default 1
        Number of random holdout splits per candidate.
    holdout_ratio : float, default 0.2
        Fraction of the rows used for validation in each split.
    random_state : int or None, default None
        Seed forwarded to ``ShuffleSplit``; set it for reproducible splits.

    Returns
    -------
    tuple
        ``(best_num_leaves, best_lr, best_bagging_fraction, best_mae_valid)``.
        All four are ``None`` when any candidate list is empty.
    """
    # Draw the splits up front so every candidate sees identical holdouts.
    splitter = ShuffleSplit(n_splits=holdout_itr, test_size=holdout_ratio,
                            random_state=random_state)
    splits = list(splitter.split(X))

    best_mae_valid = None
    best_num_leaves = None
    best_lr = None
    best_bagging_fraction = None

    for num_leaves in l_num_leaves:
        for lr in l_lr:
            for bagging_fraction in l_bagging_fraction:
                # Work on a copy so the caller's parameter dict stays intact.
                trial_params = dict(params)
                trial_params['num_leaves'] = num_leaves
                trial_params['learning_rate'] = lr
                trial_params['bagging_fraction'] = bagging_fraction

                maes = []
                for idx_train, idx_valid in splits:
                    lgb_data_train = lgb.Dataset(X[idx_train], label=y[idx_train],
                                                 weight=weight[idx_train],
                                                 free_raw_data=True)
                    model = lgb.train(trial_params, lgb_data_train)
                    maes.append(mean_absolute_error(y[idx_valid],
                                                    model.predict(X[idx_valid]),
                                                    sample_weight=weight[idx_valid]))

                mae = np.array(maes).mean()

                with open(filename_output, 'a') as f:
                    f.write('num_leaves = %d, learning_rate = %.3f, bagging_fraction = %.2f, MAE = %.4f\n'
                            % (num_leaves, lr, bagging_fraction, mae))

                if best_mae_valid is None or mae < best_mae_valid:
                    best_mae_valid = mae
                    best_num_leaves = num_leaves
                    best_lr = lr
                    best_bagging_fraction = bagging_fraction

    return best_num_leaves, best_lr, best_bagging_fraction, best_mae_valid

In [None]:
# Search params: candidate values for the three tuned hyper-parameters.
l_num_leaves = [31, 40]
l_lr = [0.05, 0.1]
l_bagging_fraction = [0.7, 0.9]

# Parameters shared by every candidate.
# "bagging_freq" is set to a non-zero value because LightGBM only applies
# "bagging_fraction" when bagging is enabled through "bagging_freq"; without
# it the bagging_fraction grid above would have no effect on the models.
base_params = {"objective" : "regression", "metric" : "mae", 'n_estimators':2000,
               "bagging_freq" : 1, "bagging_seed" : 0, "num_threads" : 4,
               "colsample_bytree" : 0.7
         }

# Returns (num_leaves, learning_rate, bagging_fraction, MAE) of the best candidate.
best_params = search_lg_params(X_train, y_train, weight_train, base_params,
                               l_num_leaves, l_lr, l_bagging_fraction,
                               holdout_itr=1, holdout_ratio=0.2)



In [None]:
# Append the winning combination (num_leaves, learning_rate, bagging_fraction
# and its validation MAE, in that order) to the tuning log.
with open(filename_output, 'a') as log_file:
    log_file.write(
        'best parameters: num_leaves = %d, learning_rate = %.3f, bagging_fraction = %.2f, MAE = %.4f\n'
        % tuple(best_params[:4])
    )