## PUBG Modelling

플레이어들이 전장에서 벌이는 모든 행위를 분류하여 승리를 예측하기 위한 프로젝트

## Column 별 정보
- ASSISTS : 자신에 의해 사살되지는 못했지만 자신의 데미지 지분율이 가장 높은 적의 수
- BOOSTS : 도핑 아이템 사용 횟수(에너지 드링크, 진통제, 아드레날린 주사기)
- DAMAGEDEALT : 적에게 가한 전체 데미지
- DBNOS : 기절시킨 적의 수
- GROUPID : 경기 내 그룹 식별 ID
- HEADSHOTKILLS : 헤드샷으로 죽인 적의 수
- HEALS : 회복 아이템 사용 횟수(붕대, 구급상자, 의료용 키트)
- ID : 해당 데이터 ID
- KILLPLACE : 경기 내 적을 죽인 적 수의 순위
- KILLPOINTS : 유저의 ELO레이팅(죽인 적 수 기반)
- KILLS : 경기 내 적을 죽인 수
- KILLSTREAKS : 짧은 시간 내에 연속으로 적을 죽인 수
- LONGESTKILL : 사살한 적까지의 거리의 최대값
- MATCHDURATION : 경기가 진행된 시간(단위:초)
- MATCHID : 경기 식별 ID
- MATCHTYPE : 경기 모드( ex: 솔로, 듀오, 스쿼드 )
- MAXPLACE : 경기 내 총 인원 수
- NUMGROUP : 경기 내 실제 참여 인원 수
- RANKPOINTS : 유저의 ELO 레이팅
- REVIVES : 유저가 팀원을 부활시킨 횟수
- RIDEDISTANCE : 이동수단을 통해 이동한 거리(단위:m)
- ROADKILLS : 이동수단으로 살해한 적의 수
- SWIMDISTANCE : 수영으로 이동한 거리(단위:m)
- TEAMKILLS : 같은 팀원을 살해한 수
- VEHICLEDESTROYS : 파괴한 이동수단의 수
- WALKDISTANCE : 도보로 이동한 거리(단위:m)
- WEAPONSACQUIRED : 획득한 무기의 수
- WINPOINTS : 유저의 ELO 레이팅(승리 횟수 기반)
- WINPLACEPERC : 현재 경기에서의 백분위 기반 유저의 순위(종속변수)

## 라이브러리 및 데이터 불러오기

In [1]:
# 데이터 분석
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# 학습을 위한 라이브러리 세팅
from sklearn.linear_model import LinearRegression   
from sklearn.linear_model import Lasso              
from sklearn.linear_model import Ridge             
from xgboost.sklearn import XGBRegressor            
from lightgbm.sklearn import LGBMRegressor 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from lightgbm.sklearn import LGBMClassifier        
from sklearn.metrics import mean_absolute_error
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from functools import partial

# VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Hyper parameter tuning
import optuna
import sklearn.metrics as metrics

# others 
import random
import scipy as sp
from scipy import stats
import sys
import gc
import os
print(os.listdir("./data/pubg-finish-placement-prediction/"))
import warnings                      
warnings.filterwarnings("ignore")

  from pandas import MultiIndex, Int64Index


['test_V2.csv', 'sample_submission_V2.csv', 'train_V2.csv']


In [2]:
train = pd.read_csv("./data/pubg-finish-placement-prediction/train_V2.csv")
test = pd.read_csv("./data/pubg-finish-placement-prediction/test_V2.csv")
submission = pd.read_csv("./data/pubg-finish-placement-prediction/sample_submission_V2.csv")

## 함수

In [3]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
#     start_mem = df.memory_usage().sum() / 1024**2
#     print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

#     end_mem = df.memory_usage().sum() / 1024**2
#     print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
#     print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [4]:
def sorted_corr(data, column) :
    df_corr = data.corr()
    df_corr = df_corr.apply(lambda x: round(x ,2))
    df_corr = df_corr.unstack()
    df_corr = pd.DataFrame(df_corr[column][df_corr[column]<1].sort_values(ascending=False), columns=['Correlation'])
    df_corr = df_corr.style.background_gradient(cmap='coolwarm_r')
    return df_corr

In [5]:
def toVIF(features): 
    return pd.DataFrame({
        "feature": features.columns,
        "VIF": [variance_inflation_factor(features.values, idx)
                for idx in range(features.shape[1])]
    })

In [6]:
def OLS_summary(features, target):
    sm_feature = sm.add_constant(features)
    model = sm.OLS(target, sm_feature).fit()
    print(model.summary2())

In [7]:
from sklearn.linear_model import LinearRegression   # 1. Linear Regression
from sklearn.linear_model import Lasso              # 2. Lasso
from sklearn.linear_model import Ridge              # 3. Ridge
from xgboost.sklearn import XGBRegressor            # 4. XGBoost
from lightgbm.sklearn import LGBMRegressor          # 5. LightGBM
from sklearn.metrics import mean_absolute_error


def Linear_training_mae(target, features):
    model = LinearRegression().fit(features,target)
    pred = model.predict(features)
    mae = np.round(mean_absolute_error(target, pred), 4)
    return mae
def Lasso_training_mae(target, features):
    model = Lasso().fit(features,target)
    pred = model.predict(features)
    mae = np.round(mean_absolute_error(target, pred), 4)
    return mae
def Ridge_training_mae(target, features):
    model = Ridge().fit(features,target)
    pred = model.predict(features)
    mae = np.round(mean_absolute_error(target, pred), 4)
    return mae
def XGB_training_mae(target, features):
    model = XGBRegressor().fit(features,target)
    pred = model.predict(features)
    mae = np.round(mean_absolute_error(target, pred), 4)
    return mae
def LGBM_training_mae(target, features):
    model = LGBMRegressor().fit(features,target)
    pred = model.predict(features)
    mae = np.round(mean_absolute_error(target, pred), 4)
    return mae
def trainings_mae(target, features):
    print("1. Linear Regression\t, train=%.4f" % Linear_training_mae(target, features))
    print("2. Lasso\t\t, train=%.4f" % Lasso_training_mae(target, features))        
    print("3. Ridge\t\t, train=%.4f" % Ridge_training_mae(target, features))
    print("4. XGBoost\t\t, train=%.4f" % XGB_training_mae(target, features))
    print("5. LightGBM\t\t, train=%.4f" % LGBM_training_mae(target, features))

In [8]:
def one_hot_encoding(data):
    train_OHE = pd.get_dummies(data, columns=["matchType"])
    train_OHE = reduce_mem_usage(train_OHE)
    return train_OHE

In [9]:
def ordinal_encoding(data):
    train_OE = data.copy()
    train_OE['matchType'] = train_OE['matchType'].map({
        'solo':1,
        'solo-fpp':2,
        'duo':3,
        'duo-fpp':4,
        'squad':5,
        'squad-fpp':6,
        'normal-duo':7,
        'normal-duo-fpp':8,
        'normal-solo':9,
        'normal-solo-fpp':10,
        'normal-squad':11,
        'normal-squad-fpp':12,
        'crashfpp':13,
        'crashtpp':14,
        'flarefpp':15,
        'flaretpp':16
        })
    train_OE = reduce_mem_usage(train_OE)
    return train_OE

In [10]:
# optuna RandomForest
def optimizer_RF(trial, X, y, K):
    n_estimators = trial.suggest_int("n_estimators", 50, 200)
    max_depth = trial.suggest_int("max_depth", 8, 30)
    max_features = trial.suggest_categorical("max_features", ['auto', 'sqrt', 'log2'])
    evaluation_metric = mean_absolute_error
    
    model = RandomForestRegressor(n_estimators=n_estimators,
                                  max_depth=max_depth,
                                  max_features=max_features,
                                  n_jobs=-1,
                                  random_state=0xC0FFEE)
    
    folds = KFold(n_splits=K)
    scores = []
    
    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]
        
        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        score = evaluation_metric(y_val, preds)
        scores.append(score)
        
    return np.mean(scores)

In [65]:
# Optuna LightGBM
def optimizer_LGBM(trial, X, y, K):
    import os
    param = {
        'objective': 'regression', 
        'verbose': 0,
        'max_depth': trial.suggest_int('max_depth', 8, 20),
        'learning_rate': trial.suggest_loguniform("learning_rate", 1e-8, 1e-2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        #'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        #'subsample': trial.suggest_loguniform('subsample', 0.4, 1)
        #"device" : 'gpu'
    }

    model = LGBMRegressor(**param, n_jobs=os.cpu_count())
    evaluation_metric = mean_absolute_error
    
    folds = KFold(n_splits=K)
    scores = []
    
    for train_idx, val_idx in folds.split(X, y):
        
        # X_train,X_val = X[train_idx],X[val_idx]
        # y_train,y_val = y[train_idx],y[val_idx]
        
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]
        
        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]
        
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=25)
        preds = model.predict(X_val)
        score = evaluation_metric(y_val, preds)
        scores.append(score)
    
    return np.mean(scores)

In [12]:
def dropOutlier (df):
    df = df.dropna(axis=0)
    def dropIdx(df, idx) :
        df.drop(index=idx, inplace=True)
        dropIdx.dpIdx_sum +=len(idx)
        return df
    dropIdx.dpIdx_sum = 0

    print("Pre-Processing...")
    for i in df.columns.to_list() :
        df.drop(index=df[df[i].isnull()==True].index, inplace=True)
        
    print("Droping Outliers...")

    vip_features = ["assists","boosts","DBNOs","heals","kills","killStreaks","walkDistance", "revives", "roadKills", "vehicleDestroys"]

    group = df.groupby('groupId').count()
    df = dropIdx(df, df[df.groupId.isin(group[group["Id"]>group["Id"].quantile(0.9999)].index)==True].index) #수치고려 가능

    for col in (vip_features + ["damageDealt","longestKill", "rideDistance", "swimDistance","weaponsAcquired", "matchDuration"]):
        df = dropIdx(df, df[df[col]>df[col].quantile(0.999)].index)
    
    for col in vip_features:
        df = dropIdx(df, df[df["walkDistance"]<df[col]].index)

    df = dropIdx(df, df[df.groupby('matchId')['kills'].transform('max')  > df.groupby('matchId')['Id'].transform('count')  ].index)
    df = dropIdx(df, df[(df['rideDistance']==0) & (df['roadKills']>0)  ].index)

    #edge case
    df.loc[(df.maxPlace>1)&(df.numGroups==1), "maxPlace"] = 1

    print(f"{dropIdx.dpIdx_sum} Columns has deleted!") 

    del vip_features, group      
    gc.collect()
    
    return df

In [13]:
def feature_enginnering(df):
    # create columns
    print("Making columns...")
    stat_feature = ["assists","boosts","DBNOs","heals","kills","killStreaks","walkDistance", "revives", "roadKills", "vehicleDestroys","damageDealt","longestKill", "rideDistance", "swimDistance","weaponsAcquired"]
    stat_list = ["max","mean","median","min"]
    for col in stat_feature :
        for stat in stat_list:
            df[f"{col}_{stat}"] = df.groupby("groupId")[col].transform(stat)
            df[f"{col}_{stat}_Place"] = df.groupby("matchId")[f"{col}_{stat}"].transform('rank', ascending=False)
    print(len(stat_feature)*len(stat_list)+1, f"columns Made! now {len(df.columns)}column in DF.")
    
    # create kiilRank using matchId, kills 
    df["killRank"] = df.groupby("matchId")["kills"].rank("dense", ascending=False)
    
    # drop unnecessary columns 
    print('Dropping columns...')
    df = df.drop(["Id", "groupId","matchId"], axis=1)
    df = ordinal_encoding(df)
    df = df.drop(['killPlace'], axis=1)
    df = df.drop(['damageDealt'], axis=1)
    df = df.drop(['numGroups'], axis=1)
    df = df.drop(['killPoints','rankPoints','winPoints'], axis=1)
    
    print(len(df.columns), "columns in df")    
    reduce_mem_usage(df)
    del stat_feature
    gc.collect()

    return df

In [22]:
def get_Output(df):
    df = dropOutlier(df)
    print("\n" + "-"*30 )
    df = feature_enginnering(df)
    return df 

In [15]:
def sampling(df,n):
    idx = sorted(np.random.permutation(len(df))[:n])
    return df.iloc[idx].copy()

In [29]:
def split_Target(df):
    target = df_sample['winPlacePerc']
    features = df_sample.drop(['winPlacePerc'], axis=1)
    return target, features

## 전처리

In [16]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

In [17]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Id               object 
 1   groupId          object 
 2   matchId          object 
 3   assists          int8   
 4   boosts           int8   
 5   damageDealt      float16
 6   DBNOs            int8   
 7   headshotKills    int8   
 8   heals            int8   
 9   killPlace        int8   
 10  killPoints       int16  
 11  kills            int8   
 12  killStreaks      int8   
 13  longestKill      float16
 14  matchDuration    int16  
 15  matchType        object 
 16  maxPlace         int8   
 17  numGroups        int8   
 18  rankPoints       int16  
 19  revives          int8   
 20  rideDistance     float16
 21  roadKills        int8   
 22  swimDistance     float16
 23  teamKills        int8   
 24  vehicleDestroys  int8   
 25  walkDistance     float16
 26  weaponsAcquired  int16  
 27  winPoints   

In [18]:
train.isna().sum()

Id                 0
groupId            0
matchId            0
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
matchDuration      0
matchType          0
maxPlace           0
numGroups          0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
winPlacePerc       1
dtype: int64

In [19]:
train = train.dropna(axis=0)

In [20]:
train.isna().sum()

Id                 0
groupId            0
matchId            0
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
matchDuration      0
matchType          0
maxPlace           0
numGroups          0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
winPlacePerc       0
dtype: int64

## Simple Test

In [23]:
n = len(train) // 10 
df_sample = sampling(train, n) 
df_sample = get_Output(train)
df_sample.shape

Pre-Processing...
Droping Outliers...
57412 Columns has deleted!

------------------------------
Making columns...
61 columns Made! now 149column in DF.
Dropping columns...
141 columns in df


In [59]:
n = len(df_sample) // 5
df_sample2 = sampling(df_sample, n) 
df_sample2.shape

(877910, 141)

In [66]:
n = len(df_sample2) // 2
df_sample3 = sampling(df_sample2, n) 
df_sample3.shape

(438955, 141)

In [34]:
target, features = split_Target(df_sample)
LGBM_training_mae(target,features)

0.0602

In [None]:
df = get_Output(train)

## Hyper parameter tuning

In [39]:
target, features = split_Target(df_sample)
X = features
y = target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0xC0FFEE)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=0xC0FFEE)

print(X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape)
gc.collect()

(2633731, 140) (877911, 140) (877911, 140) (2633731,) (877911,) (877911,)


10869

In [None]:
from sklearn.model_selection import GridSearchCV
model = LGBMRegressor()
param_grid = {
    "objective": "regression", 
    "metric" : "mae",
    "verbose" : [-1], 
    "max_depth" : [3, 4, -1],
    "n_estimators" : [50, 100],
    "learning_rate" : [0.01, 0.001, 0.0025],
}

gcv = GridSearchCV(estimator=model, param_grid=param_grid, cv=5,
                  n_jobs=-1, verbose=1)

gcv.fit(X_train, y_train)
print("Best Estimator : ", gcv.best_estimator_)

In [None]:
print("Prediction with Best Estimator")
gcv_pred_train = gcv.predict(X_train)
gcv_pred_test = gcv.predict(X_test)

gcv_train_score = evaluation_metric(y_train, gcv_pred_train)
gcv_test_score = evaluation_metric(y_test, gcv_pred_test)

print("Train MAE Score : %.4f" % gcv_train_score)
print("Test MAE Score : %.4f" % gcv_test_score)

In [None]:
print("Performance Gain")
print("in train : ", (train_score - gcv_train_score))
print("in test : ", (test_score - gcv_test_score))

## Optuna 

In [67]:
target, features = split_Target(df_sample3)
LGBM_training_mae(target,features)

0.0602

In [68]:
target, features = split_Target(df_sample3)
X = features
y = target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0xC0FFEE)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=0xC0FFEE)

print(X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape)
gc.collect()

(2633731, 140) (877911, 140) (877911, 140) (2633731,) (877911,) (877911,)


513

In [None]:
K = 5
opt_func = partial(optimizer_LGBM, X=X_train, y=y_train, K=K)

lgbm_study = optuna.create_study(study_name="LGBM", direction="maximize")
lgbm_study.optimize(opt_func, n_trials=3)

[32m[I 2022-06-16 15:18:33,386][0m A new study created in memory with name: LGBM[0m


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[1]	valid_0's l2: 0.0936421
Training until validation scores don't improve for 25 rounds
[2]	valid_0's l2: 0.0936394
[3]	valid_0's l2: 0.0936367
[4]	valid_0's l2: 0.0936341
[5]	valid_0's l2: 0.0936314
[6]	valid_0's l2: 0.0936287
[7]	valid_0's l2: 0.093626
[8]	valid_0's l2: 0.0936233
[9]	valid_0's l2: 0.0936206
[10]	valid_0's l2: 0.0936179
[11]	valid_0's l2: 0.0936152
[12]	valid_0's l2: 0.0936126
[13]	valid_0's l2: 0.0936099
[14]	valid_0's l2: 0.0936072
[15]	valid_0's l2: 0.0936045
[16]	valid_0's l2: 0.0936018
[17]	valid_0's l2: 0.0935991
[18]	valid_0's l2: 0.0935964
[19]	valid_0's l2: 0.0935938
[20]	valid_0's l2: 0.0935911
[21]	valid_0's l2: 0.0935884
[22]	valid_0's l2: 0.0935857
[23]	valid_0's l2: 0.093583
[24]	valid_0's l2: 0.0935803
[25]	valid_0's l2: 0.0935776
[26]	valid_0's l2: 0.093575
[27]	valid_0's l2: 0.0935723
[28]	valid_0's l2: 0.0935696
[29]	valid_0's l2

[256]	valid_0's l2: 0.0929595
[257]	valid_0's l2: 0.0929568
[258]	valid_0's l2: 0.0929542
[259]	valid_0's l2: 0.0929515
[260]	valid_0's l2: 0.0929488
[261]	valid_0's l2: 0.0929462
[262]	valid_0's l2: 0.0929435
[263]	valid_0's l2: 0.0929409
[264]	valid_0's l2: 0.0929382
[265]	valid_0's l2: 0.0929355
[266]	valid_0's l2: 0.0929329
[267]	valid_0's l2: 0.0929302
[268]	valid_0's l2: 0.0929275
[269]	valid_0's l2: 0.0929249
[270]	valid_0's l2: 0.0929222
[271]	valid_0's l2: 0.0929195
[272]	valid_0's l2: 0.0929169
[273]	valid_0's l2: 0.0929142
[274]	valid_0's l2: 0.0929115
[275]	valid_0's l2: 0.0929089
[276]	valid_0's l2: 0.0929062
[277]	valid_0's l2: 0.0929035
[278]	valid_0's l2: 0.0929009
[279]	valid_0's l2: 0.0928982
[280]	valid_0's l2: 0.0928956
[281]	valid_0's l2: 0.0928929
[282]	valid_0's l2: 0.0928902
[283]	valid_0's l2: 0.0928876
[284]	valid_0's l2: 0.0928849
[285]	valid_0's l2: 0.0928822
[286]	valid_0's l2: 0.0928796
[287]	valid_0's l2: 0.0928769
[288]	valid_0's l2: 0.0928742
[289]	vali

[531]	valid_0's l2: 0.0922296
[532]	valid_0's l2: 0.092227
[533]	valid_0's l2: 0.0922244
[534]	valid_0's l2: 0.0922217
[535]	valid_0's l2: 0.0922191
[536]	valid_0's l2: 0.0922164
[537]	valid_0's l2: 0.0922138
[538]	valid_0's l2: 0.0922112
[539]	valid_0's l2: 0.0922085
[540]	valid_0's l2: 0.0922059
[541]	valid_0's l2: 0.0922032
[542]	valid_0's l2: 0.0922006
[543]	valid_0's l2: 0.0921979
[544]	valid_0's l2: 0.0921953
[545]	valid_0's l2: 0.0921927
[546]	valid_0's l2: 0.09219
[547]	valid_0's l2: 0.0921874
[548]	valid_0's l2: 0.0921847
[549]	valid_0's l2: 0.0921821
[550]	valid_0's l2: 0.0921795
[551]	valid_0's l2: 0.0921768
[552]	valid_0's l2: 0.0921742
[553]	valid_0's l2: 0.0921715
[554]	valid_0's l2: 0.0921689
[555]	valid_0's l2: 0.0921663
[556]	valid_0's l2: 0.0921636
[557]	valid_0's l2: 0.092161
[558]	valid_0's l2: 0.0921583
[559]	valid_0's l2: 0.0921557
[560]	valid_0's l2: 0.0921531
[561]	valid_0's l2: 0.0921504
[562]	valid_0's l2: 0.0921478
[563]	valid_0's l2: 0.0921451
[564]	valid_0'

[185]	valid_0's l2: 0.0930696
[186]	valid_0's l2: 0.0930669
[187]	valid_0's l2: 0.0930642
[188]	valid_0's l2: 0.0930615
[189]	valid_0's l2: 0.0930589
[190]	valid_0's l2: 0.0930562
[191]	valid_0's l2: 0.0930535
[192]	valid_0's l2: 0.0930509
[193]	valid_0's l2: 0.0930482
[194]	valid_0's l2: 0.0930455
[195]	valid_0's l2: 0.0930429
[196]	valid_0's l2: 0.0930402
[197]	valid_0's l2: 0.0930375
[198]	valid_0's l2: 0.0930349
[199]	valid_0's l2: 0.0930322
[200]	valid_0's l2: 0.0930295
[201]	valid_0's l2: 0.0930269
[202]	valid_0's l2: 0.0930242
[203]	valid_0's l2: 0.0930215
[204]	valid_0's l2: 0.0930189
[205]	valid_0's l2: 0.0930162
[206]	valid_0's l2: 0.0930135
[207]	valid_0's l2: 0.0930109
[208]	valid_0's l2: 0.0930082
[209]	valid_0's l2: 0.0930055
[210]	valid_0's l2: 0.0930028
[211]	valid_0's l2: 0.0930002
[212]	valid_0's l2: 0.0929975
[213]	valid_0's l2: 0.0929948
[214]	valid_0's l2: 0.0929922
[215]	valid_0's l2: 0.0929895
[216]	valid_0's l2: 0.0929868
[217]	valid_0's l2: 0.0929842
[218]	vali

[460]	valid_0's l2: 0.0923388
[461]	valid_0's l2: 0.0923361
[462]	valid_0's l2: 0.0923335
[463]	valid_0's l2: 0.0923308
[464]	valid_0's l2: 0.0923282
[465]	valid_0's l2: 0.0923255
[466]	valid_0's l2: 0.0923229
[467]	valid_0's l2: 0.0923202
[468]	valid_0's l2: 0.0923176
[469]	valid_0's l2: 0.0923149
[470]	valid_0's l2: 0.0923123
[471]	valid_0's l2: 0.0923097
[472]	valid_0's l2: 0.092307
[473]	valid_0's l2: 0.0923044
[474]	valid_0's l2: 0.0923017
[475]	valid_0's l2: 0.0922991
[476]	valid_0's l2: 0.0922964
[477]	valid_0's l2: 0.0922938
[478]	valid_0's l2: 0.0922911
[479]	valid_0's l2: 0.0922885
[480]	valid_0's l2: 0.0922859
[481]	valid_0's l2: 0.0922832
[482]	valid_0's l2: 0.0922806
[483]	valid_0's l2: 0.0922779
[484]	valid_0's l2: 0.0922753
[485]	valid_0's l2: 0.0922726
[486]	valid_0's l2: 0.09227
[487]	valid_0's l2: 0.0922674
[488]	valid_0's l2: 0.0922647
[489]	valid_0's l2: 0.0922621
[490]	valid_0's l2: 0.0922594
[491]	valid_0's l2: 0.0922568
[492]	valid_0's l2: 0.0922541
[493]	valid_0

[114]	valid_0's l2: 0.0933738
[115]	valid_0's l2: 0.0933711
[116]	valid_0's l2: 0.0933685
[117]	valid_0's l2: 0.0933658
[118]	valid_0's l2: 0.0933631
[119]	valid_0's l2: 0.0933604
[120]	valid_0's l2: 0.0933577
[121]	valid_0's l2: 0.0933551
[122]	valid_0's l2: 0.0933524
[123]	valid_0's l2: 0.0933497
[124]	valid_0's l2: 0.093347
[125]	valid_0's l2: 0.0933444
[126]	valid_0's l2: 0.0933417
[127]	valid_0's l2: 0.093339
[128]	valid_0's l2: 0.0933363
[129]	valid_0's l2: 0.0933337
[130]	valid_0's l2: 0.093331
[131]	valid_0's l2: 0.0933283
[132]	valid_0's l2: 0.0933256
[133]	valid_0's l2: 0.093323
[134]	valid_0's l2: 0.0933203
[135]	valid_0's l2: 0.0933176
[136]	valid_0's l2: 0.0933149
[137]	valid_0's l2: 0.0933123
[138]	valid_0's l2: 0.0933096
[139]	valid_0's l2: 0.0933069
[140]	valid_0's l2: 0.0933042
[141]	valid_0's l2: 0.0933016
[142]	valid_0's l2: 0.0932989
[143]	valid_0's l2: 0.0932962
[144]	valid_0's l2: 0.0932935
[145]	valid_0's l2: 0.0932909
[146]	valid_0's l2: 0.0932882
[147]	valid_0'

[389]	valid_0's l2: 0.0926409
[390]	valid_0's l2: 0.0926383
[391]	valid_0's l2: 0.0926356
[392]	valid_0's l2: 0.092633
[393]	valid_0's l2: 0.0926303
[394]	valid_0's l2: 0.0926277
[395]	valid_0's l2: 0.092625
[396]	valid_0's l2: 0.0926224
[397]	valid_0's l2: 0.0926197
[398]	valid_0's l2: 0.0926171
[399]	valid_0's l2: 0.0926144
[400]	valid_0's l2: 0.0926118
[401]	valid_0's l2: 0.0926091
[402]	valid_0's l2: 0.0926065
[403]	valid_0's l2: 0.0926038
[404]	valid_0's l2: 0.0926012
[405]	valid_0's l2: 0.0925985
[406]	valid_0's l2: 0.0925959
[407]	valid_0's l2: 0.0925932
[408]	valid_0's l2: 0.0925906
[409]	valid_0's l2: 0.0925879
[410]	valid_0's l2: 0.0925853
[411]	valid_0's l2: 0.0925826
[412]	valid_0's l2: 0.0925799
[413]	valid_0's l2: 0.0925773
[414]	valid_0's l2: 0.0925746
[415]	valid_0's l2: 0.092572
[416]	valid_0's l2: 0.0925693
[417]	valid_0's l2: 0.0925667
[418]	valid_0's l2: 0.092564
[419]	valid_0's l2: 0.0925614
[420]	valid_0's l2: 0.0925587
[421]	valid_0's l2: 0.0925561
[422]	valid_0'

[41]	valid_0's l2: 0.0935321
[42]	valid_0's l2: 0.0935295
[43]	valid_0's l2: 0.0935268
[44]	valid_0's l2: 0.0935241
[45]	valid_0's l2: 0.0935214
[46]	valid_0's l2: 0.0935187
[47]	valid_0's l2: 0.0935161
[48]	valid_0's l2: 0.0935134
[49]	valid_0's l2: 0.0935107
[50]	valid_0's l2: 0.093508
[51]	valid_0's l2: 0.0935053
[52]	valid_0's l2: 0.0935026
[53]	valid_0's l2: 0.0935
[54]	valid_0's l2: 0.0934973
[55]	valid_0's l2: 0.0934946
[56]	valid_0's l2: 0.0934919
[57]	valid_0's l2: 0.0934892
[58]	valid_0's l2: 0.0934866
[59]	valid_0's l2: 0.0934839
[60]	valid_0's l2: 0.0934812
[61]	valid_0's l2: 0.0934785
[62]	valid_0's l2: 0.0934758
[63]	valid_0's l2: 0.0934732
[64]	valid_0's l2: 0.0934705
[65]	valid_0's l2: 0.0934678
[66]	valid_0's l2: 0.0934651
[67]	valid_0's l2: 0.0934625
[68]	valid_0's l2: 0.0934598
[69]	valid_0's l2: 0.0934571
[70]	valid_0's l2: 0.0934544
[71]	valid_0's l2: 0.0934517
[72]	valid_0's l2: 0.0934491
[73]	valid_0's l2: 0.0934464
[74]	valid_0's l2: 0.0934437
[75]	valid_0's l2:

[319]	valid_0's l2: 0.0927901
[320]	valid_0's l2: 0.0927874
[321]	valid_0's l2: 0.0927848
[322]	valid_0's l2: 0.0927821
[323]	valid_0's l2: 0.0927795
[324]	valid_0's l2: 0.0927768
[325]	valid_0's l2: 0.0927741
[326]	valid_0's l2: 0.0927715
[327]	valid_0's l2: 0.0927688
[328]	valid_0's l2: 0.0927662
[329]	valid_0's l2: 0.0927635
[330]	valid_0's l2: 0.0927609
[331]	valid_0's l2: 0.0927582
[332]	valid_0's l2: 0.0927555
[333]	valid_0's l2: 0.0927529
[334]	valid_0's l2: 0.0927502
[335]	valid_0's l2: 0.0927476
[336]	valid_0's l2: 0.0927449
[337]	valid_0's l2: 0.0927423
[338]	valid_0's l2: 0.0927396
[339]	valid_0's l2: 0.092737
[340]	valid_0's l2: 0.0927343
[341]	valid_0's l2: 0.0927316
[342]	valid_0's l2: 0.092729
[343]	valid_0's l2: 0.0927263
[344]	valid_0's l2: 0.0927237
[345]	valid_0's l2: 0.092721
[346]	valid_0's l2: 0.0927184
[347]	valid_0's l2: 0.0927157
[348]	valid_0's l2: 0.0927131
[349]	valid_0's l2: 0.0927104
[350]	valid_0's l2: 0.0927078
[351]	valid_0's l2: 0.0927051
[352]	valid_0

[595]	valid_0's l2: 0.0920598
[596]	valid_0's l2: 0.0920572
[597]	valid_0's l2: 0.0920545
[598]	valid_0's l2: 0.0920519
[599]	valid_0's l2: 0.0920493
[600]	valid_0's l2: 0.0920466
Did not meet early stopping. Best iteration is:
[600]	valid_0's l2: 0.0920466
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[1]	valid_0's l2: 0.0938154
Training until validation scores don't improve for 25 rounds
[2]	valid_0's l2: 0.0938127
[3]	valid_0's l2: 0.09381
[4]	valid_0's l2: 0.0938074
[5]	valid_0's l2: 0.0938047
[6]	valid_0's l2: 0.093802
[7]	valid_0's l2: 0.0937993
[8]	valid_0's l2: 0.0937966
[9]	valid_0's l2: 0.0937939
[10]	valid_0's l2: 0.0937912
[11]	valid_0's l2: 0.0937885
[12]	valid_0's l2: 0.0937858
[13]	valid_0's l2: 0.0937832
[14]	valid_0's l2: 0.0937805
[15]	valid_0's l2: 0.0937778
[16]	valid_0's l2: 0.0937751
[17]	valid_0's l2: 0.0937724
[18]	valid_0's l2: 0.0937697
[19]	valid_0's l2: 0.093767
[20]	valid_0's l2: 0.

[249]	valid_0's l2: 0.093151
[250]	valid_0's l2: 0.0931483
[251]	valid_0's l2: 0.0931456
[252]	valid_0's l2: 0.093143
[253]	valid_0's l2: 0.0931403
[254]	valid_0's l2: 0.0931376
[255]	valid_0's l2: 0.093135
[256]	valid_0's l2: 0.0931323
[257]	valid_0's l2: 0.0931296
[258]	valid_0's l2: 0.093127
[259]	valid_0's l2: 0.0931243
[260]	valid_0's l2: 0.0931216
[261]	valid_0's l2: 0.093119
[262]	valid_0's l2: 0.0931163
[263]	valid_0's l2: 0.0931136
[264]	valid_0's l2: 0.0931109
[265]	valid_0's l2: 0.0931083
[266]	valid_0's l2: 0.0931056
[267]	valid_0's l2: 0.0931029
[268]	valid_0's l2: 0.0931003
[269]	valid_0's l2: 0.0930976
[270]	valid_0's l2: 0.0930949
[271]	valid_0's l2: 0.0930923
[272]	valid_0's l2: 0.0930896
[273]	valid_0's l2: 0.0930869
[274]	valid_0's l2: 0.0930843
[275]	valid_0's l2: 0.0930816
[276]	valid_0's l2: 0.093079
[277]	valid_0's l2: 0.0930763
[278]	valid_0's l2: 0.0930736
[279]	valid_0's l2: 0.093071
[280]	valid_0's l2: 0.0930683
[281]	valid_0's l2: 0.0930656
[282]	valid_0's l

[524]	valid_0's l2: 0.0924204
[525]	valid_0's l2: 0.0924178
[526]	valid_0's l2: 0.0924151
[527]	valid_0's l2: 0.0924125
[528]	valid_0's l2: 0.0924099
[529]	valid_0's l2: 0.0924072
[530]	valid_0's l2: 0.0924046
[531]	valid_0's l2: 0.0924019
[532]	valid_0's l2: 0.0923993
[533]	valid_0's l2: 0.0923966
[534]	valid_0's l2: 0.092394
[535]	valid_0's l2: 0.0923913
[536]	valid_0's l2: 0.0923887
[537]	valid_0's l2: 0.0923861
[538]	valid_0's l2: 0.0923834
[539]	valid_0's l2: 0.0923808
[540]	valid_0's l2: 0.0923781
[541]	valid_0's l2: 0.0923755
[542]	valid_0's l2: 0.0923728
[543]	valid_0's l2: 0.0923702
[544]	valid_0's l2: 0.0923676
[545]	valid_0's l2: 0.0923649
[546]	valid_0's l2: 0.0923623
[547]	valid_0's l2: 0.0923596
[548]	valid_0's l2: 0.092357
[549]	valid_0's l2: 0.0923543
[550]	valid_0's l2: 0.0923517
[551]	valid_0's l2: 0.0923491
[552]	valid_0's l2: 0.0923464
[553]	valid_0's l2: 0.0923438
[554]	valid_0's l2: 0.0923411
[555]	valid_0's l2: 0.0923385
[556]	valid_0's l2: 0.0923358
[557]	valid_

[32m[I 2022-06-16 15:28:43,748][0m Trial 0 finished with value: 0.2641730427314201 and parameters: {'max_depth': 14, 'learning_rate': 1.62003175419883e-05, 'n_estimators': 600}. Best is trial 0 with value: 0.2641730427314201.[0m


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[1]	valid_0's l2: 0.0936448
Training until validation scores don't improve for 25 rounds
[2]	valid_0's l2: 0.0936448
[3]	valid_0's l2: 0.0936448
[4]	valid_0's l2: 0.0936448
[5]	valid_0's l2: 0.0936448
[6]	valid_0's l2: 0.0936448
[7]	valid_0's l2: 0.0936448
[8]	valid_0's l2: 0.0936448
[9]	valid_0's l2: 0.0936448
[10]	valid_0's l2: 0.0936448
[11]	valid_0's l2: 0.0936448
[12]	valid_0's l2: 0.0936448
[13]	valid_0's l2: 0.0936448
[14]	valid_0's l2: 0.0936448
[15]	valid_0's l2: 0.0936448
[16]	valid_0's l2: 0.0936448
[17]	valid_0's l2: 0.0936448
[18]	valid_0's l2: 0.0936448
[19]	valid_0's l2: 0.0936448
[20]	valid_0's l2: 0.0936448
[21]	valid_0's l2: 0.0936448
[22]	valid_0's l2: 0.0936448
[23]	valid_0's l2: 0.0936448
[24]	valid_0's l2: 0.0936447
[25]	valid_0's l2: 0.0936447
[26]	valid_0's l2: 0.0936447
[27]	valid_0's l2: 0.0936447
[28]	valid_0's l2: 0.0936447
[29]	valid_0's

In [None]:
# optuna가 시도했던 모든 실험 관련 데이터
lgbm_study.trials_dataframe()

In [None]:
print("Best Score: %.4f" % lgbm_study.best_value) # best score 출력
print("Best params: ", lgbm_study.best_trial.params) # best score일 때의 하이퍼파라미터들

In [None]:
# 실험 기록 시각화
optuna.visualization.plot_optimization_history(lgbm_study)

In [None]:
# hyper-parameter들의 중요도
optuna.visualization.3plot_param_importances(lgbm_study)

In [None]:
trial = lgbm_study.best_trial
trial_params = trial.params

final_lgb_model = LGBMRegressor(**trial_params)
final_lgb_model.fit(X, y) # finalize model

## 최종 모델

No hyper parameter tuning 

In [46]:
target, features = split_Target(df_sample)
X = features
y = target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0xC0FFEE)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=0xC0FFEE)

print(X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape)
gc.collect()

(2633731, 140) (877911, 140) (877911, 140) (2633731,) (877911,) (877911,)


16084

In [47]:
model = LGBMRegressor()
model.fit(X_train, y_train)
pred = model.predict(features)
mae = np.round(mean_absolute_error(target, pred), 4)
mae

After hyper parameter tuning

In [None]:
target, features = split_Target(df_sample3)
X = features
y = target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0xC0FFEE)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=0xC0FFEE)

print(X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape)
gc.collect()

In [None]:
'''
max_depth" : [3, 4, -1],
"n_estimators" : [50, 100],
"learning_rate" : [0.01, 0.001, 0.0025],
'''
param = {
    'objective': 'regression', 
    'metric' : 'mae',
    'verbose' : 0,
    'max_depth': 3, 
    'learning_rate': 0.05,
    'n_estimators': 200
    
}

model = LGBMRegressor(**param)
model.fit(X_train, y_train)
pred = model.predict(features)
mae = np.round(mean_absolute_error(target, pred), 4)
mae