In [174]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.ensemble import (
    RandomForestRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    VotingRegressor,
    AdaBoostRegressor,
    BaggingRegressor,
)
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import pycaret.regression as pyr


In [175]:
pubg_duoDF_ori: pd.DataFrame = pd.read_pickle("./DataFrame/pubg_duoDF.pkl")
pubg_duo_fppDF_ori: pd.DataFrame = pd.read_pickle("./DataFrame/pubg_duo_fppDF.pkl")


In [176]:
drop_features_low_connection = [
    "killPoints",
    "kills",
    "maxPlace",
    # "numGroups",
    "rideDistance",
    "roadKills",
    "swimDistance",
    "vehicleDestroys",
]


In [177]:
pubg_duoDF = pubg_duoDF_ori.drop(drop_features_low_connection, axis=1)
pubg_duo_fppDF = pubg_duo_fppDF_ori.drop(drop_features_low_connection, axis=1)


In [178]:
weaponsAcquired_mask = pubg_duoDF["weaponsAcquired"] < 25
weaponsAcquired_mask_fpp = pubg_duo_fppDF["weaponsAcquired"] < 25


In [179]:
pubg_duoDF_sample = pubg_duoDF[weaponsAcquired_mask].sample(n=12500)
pubg_duo_fppDF_sample = pubg_duo_fppDF[weaponsAcquired_mask_fpp].sample(n=12500)


In [180]:
featureDF = pubg_duoDF_sample.drop(["winPlacePerc"], axis=1)
targetSR = pubg_duoDF_sample["winPlacePerc"]

featureDF_fpp = pubg_duo_fppDF_sample.drop(["winPlacePerc"], axis=1)
targetSR_fpp = pubg_duo_fppDF_sample["winPlacePerc"]


In [181]:
X_train, X_test, y_train, y_test = train_test_split(
    featureDF, targetSR, test_size=0.2, random_state=42
)
X_train_fpp, X_test_fpp, y_train_fpp, y_test_fpp = train_test_split(
    featureDF_fpp, targetSR_fpp, test_size=0.2, random_state=42
)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print(X_train_fpp.shape, y_train_fpp.shape, X_test_fpp.shape, y_test_fpp.shape)


(10000, 16) (10000,) (2500, 16) (2500,)
(10000, 16) (10000,) (2500, 16) (2500,)


In [182]:
scalers = []
scalers.append(StandardScaler())
scalers.append(MinMaxScaler())
scalers.append(RobustScaler())


In [183]:
models = []
models.append(LinearRegression())
models.append(Lasso())
models.append(Ridge())
models.append(KNeighborsRegressor())
models.append(DecisionTreeRegressor())
models.append(RandomForestRegressor())
models.append(ExtraTreesRegressor())
models.append(GradientBoostingRegressor())
models.append(SGDRegressor())
models.append(SVR())
models.append(LinearSVR())
models.append(LGBMRegressor())
models.append(XGBRegressor())


In [201]:
def find_model(models, X_train, X_test, y_train, y_test):
    scoreDict = {}
    i = 0
    for scaler in scalers:
        for model in models:
            scaler.fit(X_train)
            scaled_X_train = scaler.transform(X_train)
            scale_X_test = scaler.transform(X_test)
            model.fit(scaled_X_train, y_train)
            train_score = model.score(scaled_X_train, y_train)
            test_score = model.score(scale_X_test, y_test)
            score_diff = abs(test_score - train_score)
            train_mae = mean_absolute_error(y_train, model.predict(scaled_X_train))
            test_mae = mean_absolute_error(y_test, model.predict(scale_X_test))
            mae_diff = abs(test_mae - train_mae)
            scoreDict[i] = {
                "scaler": scaler,
                "model": model.__class__.__name__,
                "train_score": train_score,
                "test_score": test_score,
                "score_diff": score_diff,
                "train_mae": train_mae,
                "test_mae": test_mae,
                "mae_diff": mae_diff,
            }
            i += 1
    return pd.DataFrame(scoreDict).T


In [185]:
scoreDF = find_model(models, X_train, X_test, y_train, y_test)
score_fppDF = find_model(models, X_train_fpp, X_test_fpp, y_train_fpp, y_test_fpp)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000579 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1511
[LightGBM] [Info] Number of data points in the train set: 10000, number of used features: 16
[LightGBM] [Info] Start training from score 0.481518
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000274 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1500
[LightGBM] [Info] Number of data points in the train set: 10000, number of used features: 16
[LightGBM] [Info] Start training from score 0.481518
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000427 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bi

In [186]:
scoreDF.sort_values(
    by=["test_mae", "mae_diff", "score_diff"], ascending=[True, True, True]
)


Unnamed: 0,scaler,model,train_score,test_score,score_diff,train_mae,test_mae,mae_diff
37,RobustScaler(),LGBMRegressor,0.95555,0.927852,0.027698,0.047587,0.057002,0.009416
11,StandardScaler(),LGBMRegressor,0.955631,0.927636,0.027995,0.047571,0.057008,0.009437
24,MinMaxScaler(),LGBMRegressor,0.95605,0.927147,0.028903,0.047456,0.057177,0.009721
19,MinMaxScaler(),ExtraTreesRegressor,1.0,0.925691,0.074309,0.0,0.057401,0.057401
6,StandardScaler(),ExtraTreesRegressor,1.0,0.925126,0.074874,0.0,0.057571,0.057571
32,RobustScaler(),ExtraTreesRegressor,1.0,0.925152,0.074848,0.0,0.057686,0.057686
5,StandardScaler(),RandomForestRegressor,0.989091,0.921682,0.067408,0.022457,0.059257,0.0368
31,RobustScaler(),RandomForestRegressor,0.989062,0.921097,0.067965,0.022447,0.059277,0.03683
18,MinMaxScaler(),RandomForestRegressor,0.988996,0.920364,0.068632,0.02243,0.059621,0.037191
12,StandardScaler(),XGBRegressor,0.977951,0.91843,0.059521,0.03305,0.060316,0.027266


In [187]:
score_fppDF.sort_values(
    by=["test_mae", "mae_diff", "score_diff"], ascending=[True, True, True]
)


Unnamed: 0,scaler,model,train_score,test_score,score_diff,train_mae,test_mae,mae_diff
11,StandardScaler(),LGBMRegressor,0.962036,0.936181,0.025855,0.043157,0.054042,0.010885
24,MinMaxScaler(),LGBMRegressor,0.962075,0.935948,0.026127,0.043214,0.054303,0.011089
37,RobustScaler(),LGBMRegressor,0.961763,0.935279,0.026483,0.043355,0.054312,0.010957
6,StandardScaler(),ExtraTreesRegressor,1.0,0.930082,0.069918,0.0,0.056235,0.056235
19,MinMaxScaler(),ExtraTreesRegressor,1.0,0.929987,0.070013,0.0,0.056282,0.056282
32,RobustScaler(),ExtraTreesRegressor,1.0,0.929685,0.070315,0.0,0.056453,0.056453
12,StandardScaler(),XGBRegressor,0.980505,0.932192,0.048313,0.030927,0.056491,0.025565
25,MinMaxScaler(),XGBRegressor,0.980505,0.932192,0.048313,0.030927,0.056491,0.025565
38,RobustScaler(),XGBRegressor,0.980505,0.932192,0.048313,0.030927,0.056491,0.025565
5,StandardScaler(),RandomForestRegressor,0.990604,0.92572,0.064884,0.020539,0.057865,0.037326


In [206]:
auto_ml = pyr.setup(
    pubg_duoDF_sample,
    target="winPlacePerc",
    train_size=0.8,
    normalize=True,
    normalize_method="robust",
)


Unnamed: 0,Description,Value
0,Session id,8107
1,Target,winPlacePerc
2,Target type,Regression
3,Original data shape,"(12500, 17)"
4,Transformed data shape,"(12500, 17)"
5,Transformed train set shape,"(10000, 17)"
6,Transformed test set shape,"(2500, 17)"
7,Numeric features,16
8,Preprocess,True
9,Imputation type,simple


In [209]:
best_models = pyr.compare_models(sort="MAE", fold=5, n_select=10)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,0.0573,0.0064,0.0802,0.9298,0.0522,0.1934,1.306
lightgbm,Light Gradient Boosting Machine,0.0586,0.0067,0.0818,0.9269,0.0534,0.1945,0.156
et,Extra Trees Regressor,0.0604,0.0072,0.0847,0.9218,0.0554,0.2058,0.688
xgboost,Extreme Gradient Boosting,0.0612,0.0073,0.0857,0.9199,0.0557,0.2035,0.094
rf,Random Forest Regressor,0.0617,0.0076,0.0869,0.9176,0.0566,0.2061,1.086
gbr,Gradient Boosting Regressor,0.0629,0.0077,0.0877,0.9161,0.057,0.2163,0.358
huber,Huber Regressor,0.0841,0.0138,0.1175,0.8493,0.0762,0.3303,0.036
lr,Linear Regression,0.085,0.0137,0.117,0.8506,0.0767,0.3466,0.926
br,Bayesian Ridge,0.0851,0.0137,0.117,0.8506,0.0767,0.347,0.026
ridge,Ridge Regression,0.0851,0.0137,0.117,0.8506,0.0767,0.347,0.018


In [210]:
auto_ml_fpp = pyr.setup(
    pubg_duo_fppDF_sample,
    target="winPlacePerc",
    train_size=0.8,
    normalize=True,
    normalize_method="robust",
)


Unnamed: 0,Description,Value
0,Session id,2793
1,Target,winPlacePerc
2,Target type,Regression
3,Original data shape,"(12500, 17)"
4,Transformed data shape,"(12500, 17)"
5,Transformed train set shape,"(10000, 17)"
6,Transformed test set shape,"(2500, 17)"
7,Numeric features,16
8,Preprocess,True
9,Imputation type,simple


In [211]:
best_models_fpp = pyr.compare_models(sort="MAE", fold=5, n_select=10)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,0.0521,0.0054,0.0733,0.9408,0.0477,0.174,1.21
lightgbm,Light Gradient Boosting Machine,0.0531,0.0055,0.0743,0.9391,0.0484,0.1757,0.122
et,Extra Trees Regressor,0.0551,0.006,0.0773,0.934,0.0505,0.1861,0.596
xgboost,Extreme Gradient Boosting,0.0561,0.0062,0.0788,0.9315,0.0514,0.1836,0.074
rf,Random Forest Regressor,0.0562,0.0062,0.0789,0.9314,0.0514,0.187,0.976
gbr,Gradient Boosting Regressor,0.0574,0.0066,0.0812,0.9274,0.0528,0.1973,0.344
dt,Decision Tree Regressor,0.0778,0.0123,0.1107,0.865,0.0718,0.2491,0.032
huber,Huber Regressor,0.0782,0.0117,0.1083,0.8707,0.0715,0.3211,0.032
ridge,Ridge Regression,0.0791,0.0116,0.1077,0.8721,0.0714,0.333,0.016
lr,Linear Regression,0.0791,0.0116,0.1077,0.8721,0.0713,0.3326,0.022


In [212]:
catModel = best_models[0]
catModel_fpp = best_models_fpp[0]


In [213]:
cat_scoreDF = find_model([catModel], X_train, X_test, y_train, y_test)
cat_scoreDF_fpp = find_model(
    [catModel_fpp], X_train_fpp, X_test_fpp, y_train_fpp, y_test_fpp
)


In [214]:
cat_scoreDF.sort_values(
    by=["test_mae", "mae_diff", "score_diff"], ascending=[True, True, True]
)


Unnamed: 0,scaler,model,train_score,test_score,score_diff,train_mae,test_mae,mae_diff
1,MinMaxScaler(),CatBoostRegressor,0.960917,0.929777,0.03114,0.044309,0.055673,0.011364
0,StandardScaler(),CatBoostRegressor,0.960917,0.929774,0.031143,0.044309,0.05568,0.011371
2,RobustScaler(),CatBoostRegressor,0.960917,0.929772,0.031146,0.044309,0.05568,0.011371


In [215]:
cat_scoreDF_fpp.sort_values(
    by=["test_mae", "mae_diff", "score_diff"], ascending=[True, True, True]
)


Unnamed: 0,scaler,model,train_score,test_score,score_diff,train_mae,test_mae,mae_diff
1,MinMaxScaler(),CatBoostRegressor,0.966946,0.937784,0.029162,0.040328,0.053507,0.013179
0,StandardScaler(),CatBoostRegressor,0.966946,0.937784,0.029162,0.040328,0.053507,0.013179
2,RobustScaler(),CatBoostRegressor,0.966946,0.937747,0.029199,0.040328,0.05352,0.013192


In [218]:
import joblib, os


In [219]:
model_dir = "./model/"
model_filename = model_dir + "pubg_duo.pkl"
model_fpp_filename = model_dir + "pubg_duo_fpp.pkl"

if not os.path.exists(model_dir):
    os.makedirs(model_dir)


In [220]:
joblib.dump(catModel, model_filename)


['./model/pubg_duo.pkl']

In [221]:
joblib.dump(catModel_fpp, model_fpp_filename)


['./model/pubg_duo_fpp.pkl']

In [222]:
pubg_duoDF_sample.columns


Index(['assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals',
       'killPlace', 'killStreaks', 'longestKill', 'matchDuration', 'numGroups',
       'revives', 'teamKills', 'walkDistance', 'weaponsAcquired',
       'winRankPoints', 'winPlacePerc'],
      dtype='object')