## 1- Imports

In [1]:
# import warnings
# warnings.filterwarnings("ignore")

In [32]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score

import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [2]:
history_df = pd.read_csv("../Data/processed/dfd_cle_RR_viz_pow_dr_enc_feng.csv")

In [3]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    
    # iterate through all the columns of a dataframe and modify the data type
    #   to reduce memory usage.        
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [4]:
history_df = reduce_mem_usage(history_df)

Memory usage of dataframe is 872.57 MB
Memory usage after optimization is: 213.95 MB
Decreased by 75.5%


## 2- Preprocessing

In [58]:
print(history_df.shape)
history_df_frac = history_df 
print(history_df_frac.shape)

(4398809, 26)
(4398809, 26)


In [15]:
X = history_df_frac.drop(['winPlacePerc'], axis=1)
y = history_df_frac['winPlacePerc']  

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((3519047, 25), (3519047,), (879762, 25), (879762,))

## 3- Model Building 

**NOTE** even though we reduced the skewness of the data by removing a significant number of outliers, the data is still fairly skewed and there still number of data points that can be considered outliers, the main reason for not removing all the potential outliers was to not cut too much from the data as to affect the predictions made by the model.

In [60]:
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.kernel_ridge import KernelRidge

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb

from sklearn.svm import SVR

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel

from sklearn.neighbors import KNeighborsRegressor

In [61]:
#Validation function
n_folds = 5

rmse_cv = {}
acc_cv = {}

def rmsle_cv(model, model_name):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train.values)
    rmse= np.sqrt(-cross_val_score(model, X_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    acc = cross_val_score(model, X_train.values, y_train, cv = kf)
    rmse_cv[model_name] = rmse.mean()
    acc_cv[model_name] = acc.mean()
    
    return (rmse)

### 3.1 linear regression

this model is sensitive to outliers, so we use the sklearn's Robustscaler() method on pipeline to account for outliers. 

In [87]:
linear = make_pipeline(RobustScaler(), LinearRegression(fit_intercept=True))

In [88]:
score = rmsle_cv(linear, 'Linear Regression')
print("\nLinear score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Linear score: 0.1483 (0.0010)



### 3.2 Lasso Regression

this model is sensitive to outliers, so we use the sklearn's Robustscaler() method on pipeline to account for outliers. 

In [89]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=42))

In [90]:
score = rmsle_cv(lasso, 'Lasso Regression')
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Lasso score: 0.1423 (0.0001)



### 3.3 Elastic Net 

this model is sensitive to outliers, so we use the sklearn's Robustscaler() method on pipeline to account for outliers.

In [91]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))

In [92]:
score = rmsle_cv(ENet, 'Elastic Net')
print("\nElastic Net score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Elastic Net score: 0.1423 (0.0001)



### 3.4 Kernel Ridge

Kernel ridge regression (KRR) combines ridge regression (linear least squares with l2-norm regularization) with the kernel trick. It thus learns a linear function in the space induced by the respective kernel and the data. 

In [93]:
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

In [94]:
# score = rmsle_cv(KRR)
# print("\nKernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

**NOTE** this model takes too much memory!

### 3.5 Gradient Boosting Regression

With huber loss that makes it robust to outliers

In [95]:
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =42)

In [96]:
# score = rmsle_cv(GBoost)
# print("\nGradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

**NOTE** model takes too long in training!

### 3.6 XGBoost

In [97]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)

In [98]:
# score = rmsle_cv(model_xgb)
# print("\nExtreme Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

**NOTE** model takes too long in training!

### 3.7 LightGBM

gradient boosting framework based on decision trees to increases the efficiency of the model and reduces memory usage

In [62]:
model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=50,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11, silent=True)

In [63]:
score = rmsle_cv(model_lgb, 'Light Gradient Boosting (manual-tuned)')
print("\nLight Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))








































Light Gradient Boosting score: 0.0860 (0.0001)



### 3.8 SVM regressor

In [101]:
SVMR = SVR(C=1.0, epsilon=0.2)

In [102]:
# score = rmsle_cv(SVMR)
# print("\nLight Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

**NOTE** model takes too long in training!

### 3.9 Decision Tree

In [103]:
DT = DecisionTreeRegressor(random_state=42)

In [104]:
score = rmsle_cv(DT, 'Decision Tree')
print("\nDecision Tree score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Decision Tree score: 0.1383 (0.0001)



### 3.10 random forest

In [105]:
randf = RandomForestRegressor(max_depth=2, random_state=42)

In [106]:
# score = rmsle_cv(randf, 'Random Forest')
# print("\nRandom Forest score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Random Forest score: 0.1534 (0.0001)



**NOTE** model takes long in training, but still in an acceptable time frame.

### 3.11 Bayesian Regression 

In [107]:
kernel = DotProduct() + WhiteKernel()
gpr = GaussianProcessRegressor(kernel=kernel, random_state=42)

In [108]:
# score = rmsle_cv(gpr)
# print("\nBayesian Regression score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

**NOTE** model takes too much memory

### 3.12 KNN 

In [109]:
neigh = KNeighborsRegressor(n_neighbors=5)

In [110]:
# score = rmsle_cv(neigh)
# print("\nK-Nearest Neighbors score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

**NOTE** takes too long to run

## 4- Model Comparisons

**metrics to use**
- root mean squared error
- mean squared error
- mean absolute error

In [111]:
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [12]:
eval_scores = {}

def show_rmse(predicted, expected):
    # calculate errors
    errors = list()
    for i in range(len(expected)):
        # calculate error
        err = (expected[i] - predicted[i])**2
        # store error
        errors.append(err)
    #     # report error
    #     print('>%.1f, %.1f = %.3f' % (expected[i], predicted[i], err))

    # plot errors
    plt.plot(errors)
    plt.xticks(ticks=[i for i in range(len(errors))], labels=predicted)
    plt.xlabel('Predicted Value')
    plt.ylabel('Mean Squared Error')
    plt.show()

def evaluating_predictions(model, model_name):
    model.fit(X_train, y_train)
    
    expected = y_test.reset_index(drop=True)
    predicted = model.predict(X_test)
    
    RMSE = mean_squared_error(expected, predicted, squared=False)
    MSE = mean_squared_error(expected, predicted)
    MAE = mean_absolute_error(expected, predicted)
    
    global eval_scores
    eval_scores[model_name] = {}
    eval_scores[model_name]['RMSE'] = RMSE
    eval_scores[model_name]['MSE'] = MSE
    eval_scores[model_name]['MAE'] = MAE
    
    print('model {} ---> RMSE: {:.5f} / MSE: {:.5f} / MAE: {:.5f}'
          .format(model_name, RMSE, MSE, MAE))
    
    
    

In [64]:
# evaluating_predictions(linear, 'Linear Regression')
# evaluating_predictions(lasso, 'Lasso Regression')
# evaluating_predictions(ENet, 'Elastic Net')

# evaluating_predictions(KRR, 'Kernel Ridge Regression')  # takes too much memory
# evaluating_predictions(GBoost, 'Gradient Boosting')  # takes too long
# evaluating_predictions(model_xgb, 'Extreme Gradient Boosting')  # takes too long

evaluating_predictions(model_lgb, 'Light Gradient Boosting (manual-tuned)')

# evaluating_predictions(SVMR, 'Support Vector Machine')  # takes too long

# evaluating_predictions(DT, 'Decision Tree')
# evaluating_predictions(randf, 'Random Forest')

# evaluating_predictions(gpr, 'Gaussian Process Regressor')  # takes too much memory
# evaluating_predictions(neigh, 'K-Nearest Neigbors')  # takes too long



model Light Gradient Boosting (manual-tuned) ---> RMSE: 0.08614 / MSE: 0.00742 / MAE: 0.06181


In [114]:
# making of dataframe for clearer view of overall results
models = []
RMSE = []
MSE = []
MAE = []
RMSE_cv = []
ACC_cv = []

for models_name, eval_score in eval_scores.items():
    models.append(models_name)
    RMSE.append(eval_score['RMSE'])
    MSE.append(eval_score['MSE'])
    MAE.append(eval_score['MAE'])

for models_name, eval_score_cv in rmse_cv.items():
    RMSE_cv.append(rmse_cv[models_name])
    
for models_name, eval_score_cv in acc_cv.items():
    ACC_cv.append(acc_cv[models_name])


In [116]:
eval_results_df = df = pd.DataFrame(list(zip(RMSE, MSE, MAE, RMSE_cv, ACC_cv)),
               columns =['RMSE', 'MSE', 'MAE', 'RMSE_cv', 'ACC_cv'], index=models)

eval_results_df

Unnamed: 0,RMSE,MSE,MAE,RMSE_cv,ACC_cv
Linear Regression,0.157704,0.024871,0.120478,0.148252,0.765542
Lasso Regression,0.142363,0.020267,0.105608,0.142348,0.783855
Elastic Net,0.142292,0.020247,0.105513,0.14228,0.784063
Light Gradient Boosting (manual-tuned),0.099299,0.00986,0.069945,0.099252,0.894919
Decision Tree,0.138136,0.019081,0.094493,0.13832,0.795914
Random Forest,0.153571,0.023584,0.118357,0.153371,0.749085


**INSIGHT** Light Gradient Boosting had the most accurate predictions, with lowest error scores across RMSE, MSE, MAE, and cross validation RMSE error. the model also had the highest accuracy by cross validation.

**CONCLUSION** Light Gradient Boosting is the best algorithm used for training the model. we're going to further tune it and use it on various levels of processed data (**dfd_cle_RR.csv**, **dfd_cle_RR_viz.csv**, **dfd_cle_RR_viz_pow.csv**, **dfd_cle_RR_viz_pow_dr.csv**, **dfd_cle_RR_viz_pow_dr_enc_feng.csv**) to determine the optimal current model and data for our task.

## 5- LGBM training experiments

### 5.1 training experiment 1

comparing oputna experiments

In [17]:
# params = {
#         "objective" : "regression", 
#         "metric" : "mae", 
#         "num_leaves" : 149, 
#         "learning_rate" : 0.03, 
#         "bagging_fraction" : 0.9,
#         "bagging_seed" : 0, 
#         "num_threads" : 4,
#         "colsample_bytree" : 0.5,
#         'min_data_in_leaf':1900, 
#         'min_split_gain':0.00011,
#         'lambda_l2':9
# }

# experiment 1
params_1 = {
    "reg_alpha": 0.014165072191620626,
    "reg_lambda": 2.2338415660220567, 
    'colsample_bytree': 0.5,
    'subsample': 0.6,
    'learning_rate': 0.01,
    'max_depth': 10,
    'num_leaves': 747,
    'min_child_samples': 65,
    'min_data_per_groups': 73
}

# experiment 2
params_2 = {
    "reg_alpha": 0.5498971158646316,
    "reg_lambda": 0.006130101000636508, 
    'colsample_bytree': 0.6,
    'subsample': 0.4,
    'learning_rate': 0.01,
    'max_depth': 10,
    'num_leaves': 468,
    'min_child_samples': 77,
    'min_data_per_groups': 93
}
            
# experiment 3
params_3 = {
    "reg_alpha": 0.0016643092201414056,
    "reg_lambda": 0.17444450082323898, 
    'colsample_bytree': 0.5,
    'subsample': 0.5,
    'learning_rate': 0.01,
    'max_depth': 20,
    'num_leaves': 793,
     'min_child_samples': 77,
    'min_data_per_groups': 93
}


lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

model_1 = lgb.train(params_1, 
                    train_set = lgb_train,
                    num_boost_round=9400,
                    early_stopping_rounds=200,
                    verbose_eval=100, 
                    valid_sets=lgb_eval
)




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3400
[LightGBM] [Info] Number of data points in the train set: 3519047, number of used features: 25
[LightGBM] [Info] Start training from score 0.469020
Training until validation scores don't improve for 200 rounds
[100]	valid_0's l2: 0.024263
[200]	valid_0's l2: 0.0128735
[300]	valid_0's l2: 0.0103517
[400]	valid_0's l2: 0.00966359
[500]	valid_0's l2: 0.00936623


[600]	valid_0's l2: 0.0092


[700]	valid_0's l2: 0.00909316
[800]	valid_0's l2: 0.00901637


[900]	valid_0's l2: 0.00896549


[1000]	valid_0's l2: 0.00892785


[1100]	valid_0's l2: 0.00890016


[1200]	valid_0's l2: 0.00887699
[1300]	valid_0's l2: 0.00885535


[1400]	valid_0's l2: 0.00884062


[1500]	valid_0's l2: 0.00882841


[1600]	valid_0's l2: 0.00881708


[1700]	valid_0's l2: 0.00880817


[1800]	valid_0's l2: 0.00880142


[1900]	valid_0's l2: 0.00879144


[2000]	valid_0's l2: 0.0087811


[2100]	valid_0's l2: 0.00877366


[2200]	valid_0's l2: 0.0087675


[2300]	valid_0's l2: 0.00876171
[2400]	valid_0's l2: 0.00875575


[2500]	valid_0's l2: 0.00874982


[2600]	valid_0's l2: 0.00874502


[2700]	valid_0's l2: 0.00874009


[2800]	valid_0's l2: 0.00873568


[2900]	valid_0's l2: 0.00873166


[3000]	valid_0's l2: 0.00872699


[3100]	valid_0's l2: 0.00872236


[3200]	valid_0's l2: 0.00871838


[3300]	valid_0's l2: 0.00871341


[3400]	valid_0's l2: 0.00870752


[3500]	valid_0's l2: 0.0087036
[3600]	valid_0's l2: 0.00869891


[3700]	valid_0's l2: 0.0086964


[3800]	valid_0's l2: 0.00869195


[3900]	valid_0's l2: 0.00868569


[4000]	valid_0's l2: 0.00868129


[4100]	valid_0's l2: 0.00867789


[4200]	valid_0's l2: 0.00867513


[4300]	valid_0's l2: 0.00867195


[4400]	valid_0's l2: 0.00866852


[4500]	valid_0's l2: 0.0086652


[4600]	valid_0's l2: 0.008662
[4700]	valid_0's l2: 0.00865997


[4800]	valid_0's l2: 0.00865798


[4900]	valid_0's l2: 0.00865599


[5000]	valid_0's l2: 0.00865133


[5100]	valid_0's l2: 0.00864826


[5200]	valid_0's l2: 0.00864578


[5300]	valid_0's l2: 0.0086435


[5400]	valid_0's l2: 0.00864175


[5500]	valid_0's l2: 0.00863735


[5600]	valid_0's l2: 0.0086332


[5700]	valid_0's l2: 0.00863052


[5800]	valid_0's l2: 0.00862821
[5900]	valid_0's l2: 0.00862593


[6000]	valid_0's l2: 0.00862375


[6100]	valid_0's l2: 0.008622


[6200]	valid_0's l2: 0.00861981


[6300]	valid_0's l2: 0.00861776


[6400]	valid_0's l2: 0.00861586


[6500]	valid_0's l2: 0.00861316


[6600]	valid_0's l2: 0.00861188


[6700]	valid_0's l2: 0.00860797


[6800]	valid_0's l2: 0.0086051


[6900]	valid_0's l2: 0.00860323


[7000]	valid_0's l2: 0.00860078
[7100]	valid_0's l2: 0.00859805


[7200]	valid_0's l2: 0.00859643


[7300]	valid_0's l2: 0.00859517


[7400]	valid_0's l2: 0.00859463


[7500]	valid_0's l2: 0.00859388


[7600]	valid_0's l2: 0.00859128


[7700]	valid_0's l2: 0.00858976


[7800]	valid_0's l2: 0.00858786


[7900]	valid_0's l2: 0.00858579


[8000]	valid_0's l2: 0.00858428


[8100]	valid_0's l2: 0.00858142
[8200]	valid_0's l2: 0.0085784


[8300]	valid_0's l2: 0.00857734


[8400]	valid_0's l2: 0.0085765


[8500]	valid_0's l2: 0.00857561


[8600]	valid_0's l2: 0.00857469


[8700]	valid_0's l2: 0.00857372


[8800]	valid_0's l2: 0.00857243


[8900]	valid_0's l2: 0.00857129


[9000]	valid_0's l2: 0.00856834


[9100]	valid_0's l2: 0.00856621


[9200]	valid_0's l2: 0.00856458
[9300]	valid_0's l2: 0.0085629


[9400]	valid_0's l2: 0.00856142
Did not meet early stopping. Best iteration is:
[9399]	valid_0's l2: 0.00856142


In [18]:
model_2 = lgb.train(params_2, 
                    train_set = lgb_train,
                    num_boost_round=9400,
                    early_stopping_rounds=200,
                    verbose_eval=100, 
                    valid_sets=lgb_eval
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3400
[LightGBM] [Info] Number of data points in the train set: 3519047, number of used features: 25
[LightGBM] [Info] Start training from score 0.469020
Training until validation scores don't improve for 200 rounds
[100]	valid_0's l2: 0.023604
[200]	valid_0's l2: 0.0123471
[300]	valid_0's l2: 0.0100891
[400]	valid_0's l2: 0.00949457
[500]	valid_0's l2: 0.00924516
[600]	valid_0's l2: 0.00911673
[700]	valid_0's l2: 0.00903183
[800]	valid_0's l2: 0.00897312
[900]	valid_0's l2: 0.00893371
[1000]	valid_0's l2: 0.00890325


[1100]	valid_0's l2: 0.00888096
[1200]	valid_0's l2: 0.00886061


[1300]	valid_0's l2: 0.00884706
[1400]	valid_0's l2: 0.0088339


[1500]	valid_0's l2: 0.00882167
[1600]	valid_0's l2: 0.00881247


[1700]	valid_0's l2: 0.00880195


[1800]	valid_0's l2: 0.00879491


[1900]	valid_0's l2: 0.00878755
[2000]	valid_0's l2: 0.00877821


[2100]	valid_0's l2: 0.00877081


[2200]	valid_0's l2: 0.00876389


[2300]	valid_0's l2: 0.00875761
[2400]	valid_0's l2: 0.00874905


[2500]	valid_0's l2: 0.00874394


[2600]	valid_0's l2: 0.00873826


[2700]	valid_0's l2: 0.00873348
[2800]	valid_0's l2: 0.00872715


[2900]	valid_0's l2: 0.00872098


[3000]	valid_0's l2: 0.00871546


[3100]	valid_0's l2: 0.00870954
[3200]	valid_0's l2: 0.00870339


[3300]	valid_0's l2: 0.00869864


[3400]	valid_0's l2: 0.00869409


[3500]	valid_0's l2: 0.00868942
[3600]	valid_0's l2: 0.00868321


[3700]	valid_0's l2: 0.00867891


[3800]	valid_0's l2: 0.00867512


[3900]	valid_0's l2: 0.00867177
[4000]	valid_0's l2: 0.00866819


[4100]	valid_0's l2: 0.00866514


[4200]	valid_0's l2: 0.00866233


[4300]	valid_0's l2: 0.00865971


[4400]	valid_0's l2: 0.0086556
[4500]	valid_0's l2: 0.00865295


[4600]	valid_0's l2: 0.00864845


[4700]	valid_0's l2: 0.00864575


[4800]	valid_0's l2: 0.00864173


[4900]	valid_0's l2: 0.00863871
[5000]	valid_0's l2: 0.00863563


[5100]	valid_0's l2: 0.0086319


[5200]	valid_0's l2: 0.00862842


[5300]	valid_0's l2: 0.00862549
[5400]	valid_0's l2: 0.00862128


[5500]	valid_0's l2: 0.00861785


[5600]	valid_0's l2: 0.00861589


[5700]	valid_0's l2: 0.00861348


[5800]	valid_0's l2: 0.00861135


[5900]	valid_0's l2: 0.00860952
[6000]	valid_0's l2: 0.00860694


[6100]	valid_0's l2: 0.00860479


[6200]	valid_0's l2: 0.00860147


[6300]	valid_0's l2: 0.00859867


[6400]	valid_0's l2: 0.00859672


[6500]	valid_0's l2: 0.00859452
[6600]	valid_0's l2: 0.00859201


[6700]	valid_0's l2: 0.00859059


[6800]	valid_0's l2: 0.00858868


[6900]	valid_0's l2: 0.00858737


[7000]	valid_0's l2: 0.0085858


[7100]	valid_0's l2: 0.00858412
[7200]	valid_0's l2: 0.00858018


[7300]	valid_0's l2: 0.00857752


[7400]	valid_0's l2: 0.00857641


[7500]	valid_0's l2: 0.00857501


[7600]	valid_0's l2: 0.00857325


[7700]	valid_0's l2: 0.00857136
[7800]	valid_0's l2: 0.00857008


[7900]	valid_0's l2: 0.00856811


[8000]	valid_0's l2: 0.00856656


[8100]	valid_0's l2: 0.00856425


[8200]	valid_0's l2: 0.00856263
[8300]	valid_0's l2: 0.00856076


[8400]	valid_0's l2: 0.00855876


[8500]	valid_0's l2: 0.00855776


[8600]	valid_0's l2: 0.00855664


[8700]	valid_0's l2: 0.00855495


[8800]	valid_0's l2: 0.00855229
[8900]	valid_0's l2: 0.00855027


[9000]	valid_0's l2: 0.00854918


[9100]	valid_0's l2: 0.00854755


[9200]	valid_0's l2: 0.00854576


[9300]	valid_0's l2: 0.00854473


[9400]	valid_0's l2: 0.00854334
Did not meet early stopping. Best iteration is:
[9398]	valid_0's l2: 0.00854333


In [30]:
model_3 = lgb.train(params_3, 
                    train_set = lgb_train,
                    num_boost_round=9400,
                    early_stopping_rounds=200,
                    verbose_eval=100, 
                    valid_sets=lgb_eval
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3400
[LightGBM] [Info] Number of data points in the train set: 3519047, number of used features: 25
[LightGBM] [Info] Start training from score 0.469020
Training until validation scores don't improve for 200 rounds
[100]	valid_0's l2: 0.0239857
[200]	valid_0's l2: 0.0126398
[300]	valid_0's l2: 0.0101331
[400]	valid_0's l2: 0.00946393
[500]	valid_0's l2: 0.00917586
[600]	valid_0's l2: 0.00901715
[700]	valid_0's l2: 0.00891582
[800]	valid_0's l2: 0.00884933
[900]	valid_0's l2: 0.00880062
[1000]	valid_0's l2: 0.00876344
[1100]	valid_0's l2: 0.00874308
[1200]	valid_0's l2: 0.0087167
[1300]	valid_0's l2: 0.00869602
[1400]	valid_0's l2: 0.00868178
[1500]	valid_0's l2: 0.00866952
[1600]	valid_0's l2: 0.00865671
[1700]	valid_0's l2: 0.0086447
[1800]	valid_0's l2: 0.00863727
[1900]	valid_0's l2: 0.00862948
[2000]	valid_0's l2: 0.0086198
[2100]	va

[4700]	valid_0's l2: 0.00850449
[4800]	valid_0's l2: 0.00850168
[4900]	valid_0's l2: 0.00849929
[5000]	valid_0's l2: 0.00849682
[5100]	valid_0's l2: 0.00849449
[5200]	valid_0's l2: 0.00849221
[5300]	valid_0's l2: 0.00849018
[5400]	valid_0's l2: 0.00848684
[5500]	valid_0's l2: 0.0084849
[5600]	valid_0's l2: 0.00848283
[5700]	valid_0's l2: 0.00848053
[5800]	valid_0's l2: 0.00847858
[5900]	valid_0's l2: 0.00847615
[6000]	valid_0's l2: 0.00847454
[6100]	valid_0's l2: 0.0084728
[6200]	valid_0's l2: 0.00847025
[6300]	valid_0's l2: 0.00846805
[6400]	valid_0's l2: 0.00846521
[6500]	valid_0's l2: 0.00846326
[6600]	valid_0's l2: 0.00846175
[6700]	valid_0's l2: 0.00845948
[6800]	valid_0's l2: 0.00845776
[6900]	valid_0's l2: 0.00845629
[7000]	valid_0's l2: 0.00845328
[7100]	valid_0's l2: 0.00845055
[7200]	valid_0's l2: 0.00844833
[7300]	valid_0's l2: 0.00844625
[7400]	valid_0's l2: 0.00844422
[7500]	valid_0's l2: 0.00844191
[7600]	valid_0's l2: 0.00844071
[7700]	valid_0's l2: 0.00843884
[7800]	val

[8000]	valid_0's l2: 0.00843414
[8100]	valid_0's l2: 0.00843272
[8200]	valid_0's l2: 0.00843169
[8300]	valid_0's l2: 0.00843021
[8400]	valid_0's l2: 0.00842921
[8500]	valid_0's l2: 0.00842772
[8600]	valid_0's l2: 0.00842684
[8700]	valid_0's l2: 0.00842601
[8800]	valid_0's l2: 0.0084239
[8900]	valid_0's l2: 0.00842248
[9000]	valid_0's l2: 0.00842122
[9100]	valid_0's l2: 0.0084206
[9200]	valid_0's l2: 0.0084201
[9300]	valid_0's l2: 0.00841978
[9400]	valid_0's l2: 0.00841954
Did not meet early stopping. Best iteration is:
[9399]	valid_0's l2: 0.00841952


In [33]:
# accuracy check
y_pred = model_1.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse**(0.5)
mae = mean_absolute_error(y_test, y_pred)
print('model (experiment #1)')
print("MSE: %.5f" % mse)
print("RMSE: %.5f" % rmse) 
print("MAE: %.5f" % mae) 
print('-' * 50)

y_pred = model_2.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse**(0.5)
mae = mean_absolute_error(y_test, y_pred)
print('model (experiment #2)')
print("MSE: %.5f" % mse)
print("RMSE: %.5f" % rmse) 
print("MAE: %.5f" % mae) 
print('-' * 50)

y_pred = model_3.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse**(0.5)
mae = mean_absolute_error(y_test, y_pred)
print('model (experiment #3)')
print("MSE: %.5f" % mse)
print("RMSE: %.5f" % rmse) 
print("MAE: %.5f" % mae) 
print('-' * 50)

model (experiment #1)
MSE: 0.00856
RMSE: 0.09253
MAE: 0.06400
--------------------------------------------------
model (experiment #2)
MSE: 0.00854
RMSE: 0.09243
MAE: 0.06392
--------------------------------------------------
model (experiment #3)
MSE: 0.00842
RMSE: 0.09176
MAE: 0.06326
--------------------------------------------------


In [34]:
# experiment 1
model_lgb_1 = lgb.LGBMRegressor(objective='regression', 
                              
                              reg_alpha=0.014165072191620626,
                              reg_lambda=2.2338415660220567, colsample_bytree=0.5,
                              subsample=0.6, learning_rate=0.01, max_depth=10,
                              num_leaves=747, min_child_samples=65, min_data_per_groups=73,
                              
                              n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11, silent=True)

# experiment 2
model_lgb_2 = lgb.LGBMRegressor(objective='regression', 
                              
                              reg_alpha=0.5498971158646316,
                              reg_lambda=0.006130101000636508, colsample_bytree=0.6,
                              subsample=0.4, learning_rate=0.01, max_depth=10,
                              num_leaves=468, min_child_samples=77, min_data_per_groups=93,
                              
                              n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11, silent=True)


# experiment 3
model_lgb_3 = lgb.LGBMRegressor(objective='regression', 
                              
                              reg_alpha=0.0016643092201414056,
                              reg_lambda=0.17444450082323898, colsample_bytree=0.5,
                              subsample=0.5, learning_rate=0.01, max_depth=20,
                              num_leaves=793, min_child_samples=5, min_data_per_groups=66,
                              
                              n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11, silent=True)



In [35]:
evaluating_predictions(model_lgb_1, 'Light Gradient Boosting (optuna-tuned #1)')
evaluating_predictions(model_lgb_2, 'Light Gradient Boosting (optuna-tuned #2)')
evaluating_predictions(model_lgb_3, 'Light Gradient Boosting (optuna-tuned #3)')



model Light Gradient Boosting (optuna-tuned #1) ---> RMSE: 0.11237 / MSE: 0.01263 / MAE: 0.08188




model Light Gradient Boosting (optuna-tuned #2) ---> RMSE: 0.11246 / MSE: 0.01265 / MAE: 0.08197




model Light Gradient Boosting (optuna-tuned #3) ---> RMSE: 0.11168 / MSE: 0.01247 / MAE: 0.08142


In [73]:
score = rmsle_cv(model_lgb_1, 'Light Gradient Boosting (optuna-tuned #1)')
print("\nLight Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
score = rmsle_cv(model_lgb_2, 'Light Gradient Boosting (optuna-tuned #2)')
print("\nLight Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
score = rmsle_cv(model_lgb_3, 'Light Gradient Boosting (optuna-tuned #3)')
print("\nLight Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))








































Light Gradient Boosting score: 0.0931 (0.0001)










































Light Gradient Boosting score: 0.0932 (0.0001)










































Light Gradient Boosting score: 0.0922 (0.0001)



In [82]:
print(eval_scores)
print(acc_cv)
print(rmse_cv)

{'Light Gradient Boosting (optuna-tuned #1)': {'RMSE': 0.11236841151175309, 'MSE': 0.012626659905674685, 'MAE': 0.08188497602169098}, 'Light Gradient Boosting (optuna-tuned #2)': {'RMSE': 0.11246085190562916, 'MSE': 0.012647443211339851, 'MAE': 0.08196539257591512}, 'Light Gradient Boosting (optuna-tuned #3)': {'RMSE': 0.11168408832127409, 'MSE': 0.01247333558415415, 'MAE': 0.0814153802348405}, 'Light Gradient Boosting (manual-tuned)': {'RMSE': 0.08614255990761875, 'MSE': 0.007420540627437686, 'MAE': 0.06180501097443548}}
{'Light Gradient Boosting (manual-tuned)': 0.9211163092388721, 'Light Gradient Boosting (optuna-tuned #1)': 0.9075404279022129, 'Light Gradient Boosting (optuna-tuned #2)': 0.9073213140565046, 'Light Gradient Boosting (optuna-tuned #3)': 0.9093976310090477}
{'Light Gradient Boosting (manual-tuned)': 0.08599482556481583, 'Light Gradient Boosting (optuna-tuned #1)': 0.09310104454399405, 'Light Gradient Boosting (optuna-tuned #2)': 0.09321129940723479, 'Light Gradient Bo

In [115]:
# making of dataframe for clearer view of overall results
models = []
RMSE = []
MSE = []
MAE = []
RMSE_cv = []
ACC_cv = []

for models_name, eval_score in eval_scores.items():
    models.append(models_name)
    RMSE.append(eval_score['RMSE'])
    MSE.append(eval_score['MSE'])
    MAE.append(eval_score['MAE'])

for models_name, eval_score_cv in rmse_cv.items():
    RMSE_cv.append(rmse_cv[models_name])
    
for models_name, eval_score_cv in acc_cv.items():
    ACC_cv.append(acc_cv[models_name])

In [None]:
eval_results_df.append()

In [116]:
eval_results_df = df = pd.DataFrame(list(zip(RMSE, MSE, MAE, RMSE_cv, ACC_cv)),
               columns =['RMSE', 'MSE', 'MAE', 'RMSE_cv', 'ACC_cv'], index=models)
# eval_results_df = df = pd.DataFrame(list(zip(RMSE, MSE, MAE)),
#                columns =['RMSE', 'MSE', 'MAE'], index=models)

eval_results_df

Unnamed: 0,RMSE,MSE,MAE,RMSE_cv,ACC_cv
Light Gradient Boosting (optuna-tuned #1),0.112368,0.012627,0.081885,0.093101,0.90754
Light Gradient Boosting (optuna-tuned #2),0.112461,0.012647,0.081965,0.093211,0.907321
Light Gradient Boosting (optuna-tuned #3),0.111684,0.012473,0.081415,0.092161,0.909398
Light Gradient Boosting (manual-tuned),0.086143,0.007421,0.061805,0.085995,0.921116


**NOTE** for now, manually tuned model has better results, tuning needs more experiments to deduce better model results

**CONCLUSION** we're going to use the manually-trained model

### training experiment 2

using processed features of different levels

In [124]:
import glob
from sklearn.preprocessing import OrdinalEncoder

In [52]:
def quick_clean(df, df_name):
    if 'Unnamed: 0' in df.columns:
        print('there is index feature in the dataframe, removing it..')
        df = df.drop(columns=['Unnamed: 0'], axis=1)
        
    if ('Id' in df.columns) or ('groupId' in df.columns) or ('matchId' in df.columns):
        df = df.drop(columns=['Id', 'groupId', 'matchId'], axis=1)
    
    print(df.isnull().sum())
    df = df.fillna(0)
    
    if 'enc' in df_name:
        return df
    
    # encoding
    oe = OrdinalEncoder()
    matchType = df[['matchType']]  
    oe.fit(matchType)
    matchType = pd.DataFrame(oe.transform(matchType), columns=['matchType'])  # ordinal encoder
    df['matchType'] = matchType['matchType'].astype('int16')
    
    return df

In [126]:
dtypes = {
        'assists'           : 'uint8',
        'boosts'            : 'uint8',
        'damageDealt'       : 'float32',
        'DBNOs'             : 'uint8',
        'headshotKills'     : 'uint8', 
        'heals'             : 'uint8',    
        'killPlace'         : 'uint8',    
        'killPoints'        : 'uint16',    
        'kills'             : 'uint8',    
        'killStreaks'       : 'uint8',    
        'longestKill'       : 'float32',
        'matchDuration'     : 'uint8',
        'maxPlace'          : 'uint8',    
        'numGroups'         : 'uint8',    
        'rankPoints'        : 'uint8',
        'revives'           : 'uint8',    
        'rideDistance'      : 'float32',    
        'roadKills'         : 'uint8',    
        'swimDistance'      : 'float32',    
        'teamKills'         : 'uint8',    
        'vehicleDestroys'   : 'uint8',    
        'walkDistance'      : 'float32',    
        'weaponsAcquired'   : 'uint8',    
        'winPoints'         : 'uint8', 
        'winPlacePerc'      : 'float32' 
}

In [129]:
MAE_diff_data_results = {}

for df_n in glob.glob('../Data/processed/*.csv'):
    print('df: {}'.format(df_n))
    name = df_n.split('\\')[-1]
    try:
        df = pd.read_csv(df_n, dtype=dtypes)
    except:
        df = pd.read_csv(df_n)
        df = reduce_mem_usage(df)
    
#     display(df.describe().T)
#     display(df.isnull().sum())
    
    df = quick_clean(df, df_n)
    display(df.describe().T)
    
    X = df.drop(['winPlacePerc'], axis=1)
    y = df['winPlacePerc']  
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    
    model_lgb_1.fit(X_train, y_train)
    expected = y_test.reset_index(drop=True)
    predicted = model_lgb_1.predict(X_test)

    MAE = mean_absolute_error(expected, predicted)
    print('printing MAE result..', MAE)
    MAE_diff_data_results[name] = MAE
    
    print('-' * 50)
    

df: ../Data/processed\dfd_cle_RR.csv
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
matchDuration      0
matchType          0
maxPlace           0
numGroups          0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
winPlacePerc       0
dtype: int64


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
assists,4446965.0,0.233815,0.588573,0.0,0.0,0.0,0.0,22.0
boosts,4446965.0,1.106908,1.715794,0.0,0.0,0.0,2.0,33.0
damageDealt,4446965.0,130.633148,169.886963,0.0,0.0,84.239998,186.0,6616.0
DBNOs,4446965.0,0.657876,1.145743,0.0,0.0,0.0,1.0,53.0
headshotKills,4446965.0,0.22682,0.602155,0.0,0.0,0.0,0.0,64.0
heals,4446965.0,1.370148,2.679982,0.0,0.0,0.0,2.0,80.0
killPlace,4446965.0,47.599361,27.462931,1.0,24.0,47.0,71.0,101.0
killPoints,4446965.0,505.006156,627.504921,0.0,0.0,0.0,1172.0,2170.0
kills,4446965.0,0.924784,1.558445,0.0,0.0,0.0,1.0,72.0
killStreaks,4446965.0,0.543955,0.710972,0.0,0.0,0.0,1.0,20.0


(4002268, 25) (4002268,) (444697, 25) (444697,)




printing MAE result.. 0.06728436807581147
--------------------------------------------------
df: ../Data/processed\dfd_cle_RR_viz.csv
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
matchDuration      0
matchType          0
maxPlace           0
numGroups          0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
winPlacePerc       0
dtype: int64


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
assists,4398809.0,0.227439,0.569193,0.0,0.0,0.0,0.0,8.0
boosts,4398809.0,1.081899,1.684643,0.0,0.0,0.0,2.0,13.0
damageDealt,4398809.0,126.990242,158.678497,0.0,0.0,83.199997,183.5,1996.0
DBNOs,4398809.0,0.64102,1.081852,0.0,0.0,0.0,1.0,14.0
headshotKills,4398809.0,0.216572,0.555536,0.0,0.0,0.0,0.0,10.0
heals,4398809.0,1.303875,2.458517,0.0,0.0,0.0,1.0,18.0
killPlace,4398809.0,47.894633,27.398818,1.0,24.0,48.0,71.0,101.0
killPoints,4398809.0,505.146998,627.312757,0.0,0.0,0.0,1172.0,2170.0
kills,4398809.0,0.890233,1.431636,0.0,0.0,0.0,1.0,10.0
killStreaks,4398809.0,0.536673,0.701887,0.0,0.0,0.0,1.0,5.0


(3958928, 25) (3958928,) (439881, 25) (439881,)




printing MAE result.. 0.067296075133743
--------------------------------------------------
df: ../Data/processed\dfd_cle_RR_viz_pow.csv
Memory usage of dataframe is 973.25 MB
Memory usage after optimization is: 343.99 MB
Decreased by 64.7%
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
matchDuration      0
matchType          0
maxPlace           0
numGroups          0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
winPlacePerc       0
dtype: int64


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
assists,4398809.0,4.6e-05,0.0,-0.456787,-0.456787,-0.456787,-0.456787,2.205078
boosts,4398809.0,1.8e-05,0.0,-0.841797,-0.841797,-0.841797,1.157227,1.8125
damageDealt,4398809.0,3.1e-05,0.0,-1.394531,-1.394531,0.267334,0.76416,2.904297
DBNOs,4398809.0,3.2e-05,0.0,-0.765625,-0.765625,-0.765625,1.092773,1.804688
headshotKills,4398809.0,-4.7e-05,0.0,-0.446045,-0.446045,-0.446045,-0.446045,2.255859
heals,4398809.0,-0.000104,0.0,-0.793457,-0.793457,-0.793457,0.758301,1.783203
killPlace,4398809.0,2e-06,0.0,-2.033203,-0.810059,0.095825,0.847168,1.731445
killPoints,4398809.0,-0.000114,0.0,-0.821777,-0.821777,-0.821777,1.208984,1.296875
kills,4398809.0,0.000149,0.0,-0.843262,-0.843262,-0.843262,0.805664,1.834961
killStreaks,4398809.0,-0.000202,0.0,-0.855957,-0.855957,-0.855957,1.025391,1.927734


(3958928, 25) (3958928,) (439881, 25) (439881,)




printing MAE result.. 0.06730961542130266
--------------------------------------------------
df: ../Data/processed\dfd_cle_RR_viz_pow_dr.csv
Memory usage of dataframe is 704.77 MB
Memory usage after optimization is: 201.36 MB
Decreased by 71.4%
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
matchDuration      0
matchType          0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPlacePerc       0
PC1                0
PC2                0
PC3                0
dtype: int64


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
assists,4398809.0,4.6e-05,0.0,-0.456787,-0.456787,-0.456787,-0.456787,2.205078
boosts,4398809.0,1.8e-05,0.0,-0.841797,-0.841797,-0.841797,1.157227,1.8125
damageDealt,4398809.0,3.1e-05,0.0,-1.394531,-1.394531,0.267334,0.76416,2.904297
DBNOs,4398809.0,3.2e-05,0.0,-0.765625,-0.765625,-0.765625,1.092773,1.804688
headshotKills,4398809.0,-4.7e-05,0.0,-0.446045,-0.446045,-0.446045,-0.446045,2.255859
heals,4398809.0,-0.000104,0.0,-0.793457,-0.793457,-0.793457,0.758301,1.783203
matchDuration,4398809.0,-7e-06,0.0,-2.472656,-0.589844,-0.035736,0.708984,1.935547
matchType,4398809.0,10.827492,5.329678,0.0,3.0,14.0,15.0,15.0
rankPoints,4398809.0,6.2e-05,0.0,-2.296875,-0.477783,0.195068,0.869141,0.869141
revives,4398809.0,0.000163,0.0,-0.387207,-0.387207,-0.387207,-0.387207,2.583984


(3958928, 20) (3958928,) (439881, 20) (439881,)




printing MAE result.. 0.07459244149790563
--------------------------------------------------
df: ../Data/processed\dfd_cle_RR_viz_pow_dr_enc.csv
Memory usage of dataframe is 704.77 MB
Memory usage after optimization is: 172.00 MB
Decreased by 75.6%
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
matchDuration      0
matchType          0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPlacePerc       0
PC1                0
PC2                0
PC3                0
dtype: int64


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
assists,4398809.0,4.6e-05,0.0,-0.456787,-0.456787,-0.456787,-0.456787,2.205078
boosts,4398809.0,1.8e-05,0.0,-0.841797,-0.841797,-0.841797,1.157227,1.8125
damageDealt,4398809.0,3.1e-05,0.0,-1.394531,-1.394531,0.267334,0.76416,2.904297
DBNOs,4398809.0,3.2e-05,0.0,-0.765625,-0.765625,-0.765625,1.092773,1.804688
headshotKills,4398809.0,-4.7e-05,0.0,-0.446045,-0.446045,-0.446045,-0.446045,2.255859
heals,4398809.0,-0.000104,0.0,-0.793457,-0.793457,-0.793457,0.758301,1.783203
matchDuration,4398809.0,-7e-06,0.0,-2.472656,-0.589844,-0.035736,0.708984,1.935547
matchType,4398809.0,10.827492,5.329678,0.0,3.0,14.0,15.0,15.0
rankPoints,4398809.0,6.2e-05,0.0,-2.296875,-0.477783,0.195068,0.869141,0.869141
revives,4398809.0,0.000163,0.0,-0.387207,-0.387207,-0.387207,-0.387207,2.583984


(3958928, 20) (3958928,) (439881, 20) (439881,)




printing MAE result.. 0.07459244149790563
--------------------------------------------------
df: ../Data/processed\dfd_cle_RR_viz_pow_dr_enc_feng.csv
Memory usage of dataframe is 872.57 MB
Memory usage after optimization is: 213.95 MB
Decreased by 75.5%
assists                            0
boosts                             0
damageDealt                        0
DBNOs                              0
headshotKills                      0
heals                              0
matchDuration                      0
matchType                          0
rankPoints                         0
revives                            0
rideDistance                       0
roadKills                          0
swimDistance                       0
teamKills                          0
vehicleDestroys                    0
walkDistance                       0
weaponsAcquired                    0
winPlacePerc                       0
PC1                                0
PC2                                0
PC3   

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
assists,4398809.0,4.637241e-05,0.0,-0.456787,-0.456787,-0.456787,-0.456787,2.205078
boosts,4398809.0,1.841784e-05,0.0,-0.841797,-0.841797,-0.841797,1.157227,1.8125
damageDealt,4398809.0,3.069639e-05,0.0,-1.394531,-1.394531,0.267334,0.76416,2.904297
DBNOs,4398809.0,3.194809e-05,0.0,-0.765625,-0.765625,-0.765625,1.092773,1.804688
headshotKills,4398809.0,-4.696846e-05,0.0,-0.446045,-0.446045,-0.446045,-0.446045,2.255859
heals,4398809.0,-0.0001042485,0.0,-0.793457,-0.793457,-0.793457,0.758301,1.783203
matchDuration,4398809.0,-7.390976e-06,0.0,-2.472656,-0.589844,-0.035736,0.708984,1.935547
matchType,4398809.0,10.82749,5.329678,0.0,3.0,14.0,15.0,15.0
rankPoints,4398809.0,6.175041e-05,0.0,-2.296875,-0.477783,0.195068,0.869141,0.869141
revives,4398809.0,0.0001626015,0.0,-0.387207,-0.387207,-0.387207,-0.387207,2.583984


(3958928, 25) (3958928,) (439881, 25) (439881,)




printing MAE result.. 0.08190181572500112
--------------------------------------------------


In [130]:
MAE_diff_data_results

{'dfd_cle_RR.csv': 0.06728436807581147,
 'dfd_cle_RR_viz.csv': 0.067296075133743,
 'dfd_cle_RR_viz_pow.csv': 0.06730961542130266,
 'dfd_cle_RR_viz_pow_dr.csv': 0.07459244149790563,
 'dfd_cle_RR_viz_pow_dr_enc.csv': 0.07459244149790563,
 'dfd_cle_RR_viz_pow_dr_enc_feng.csv': 0.08190181572500112}

**NOTE** since the dfd_cle_RR_viz_pow.csv had very good results, and we want to keep the data relatively standaradize in scale, we decided to use the data before applying dimensionality reduction and adding feature engineering.

### training experiment 3

**NOTE** trying a different encoder

In [42]:
dfd_final = pd.read_csv('../Data/processed/dfd_cle_RR_viz_pow.csv')

In [43]:
dfd_final = reduce_mem_usage(dfd_final)

Memory usage of dataframe is 973.25 MB
Memory usage after optimization is: 343.99 MB
Decreased by 64.7%


In [50]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import glob

In [55]:
MAE_diff_data_results = {}

In [195]:
from sklearn.preprocessing import OneHotEncoder

def quick_clean2(df, df_name):
    if 'Unnamed: 0' in df.columns:
        print('there is index feature in the dataframe, removing it..')
        df = df.drop(columns=['Unnamed: 0'], axis=1)
        
    if ('Id' in df.columns) or ('groupId' in df.columns) or ('matchId' in df.columns):
        df = df.drop(columns=['Id', 'groupId', 'matchId'], axis=1)
    
    print(df.isnull().sum())
    df = df.fillna(0)
    
    if 'enc' in df_name:
        return df
    
    # encoding
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(df[['matchType']])
    new_arrs = enc.transform(df[['matchType']]).toarray()
    new_arrs = pd.DataFrame(data=new_arrs, columns = [f'matchType_dum {i}' for i in range(1, new_arrs.shape[1] + 1)])
    df = df.drop(columns=['matchType'], axis=1)
    df = pd.concat([df, new_arrs], axis=1)
    
    return df

In [196]:
for df_n in glob.glob('../Data/processed/dfd_cle_RR_viz_pow.csv'):
    print('df: {}'.format(df_n))
    name = df_n.split('\\')[-1] + '(one-hot-encoding)'
    try:
        df = pd.read_csv(df_n, dtype=dtypes)
    except:
        df = pd.read_csv(df_n)
        df = reduce_mem_usage(df)
    
    
    df = quick_clean2(df, df_n)
    display(df.describe().T)
    
    X = df.drop(['winPlacePerc'], axis=1)
    y = df['winPlacePerc']  
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    
    model_lgb_1.fit(X_train, y_train)
    expected = y_test.reset_index(drop=True)
    predicted = model_lgb_1.predict(X_test)

    MAE = mean_absolute_error(expected, predicted)
    print('printing MAE result..', MAE)
    MAE_diff_data_results[name] = MAE
    
    print('-' * 50)

df: ../Data/processed/dfd_cle_RR_viz_pow.csv
Memory usage of dataframe is 973.25 MB
Memory usage after optimization is: 343.99 MB
Decreased by 64.7%
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
matchDuration      0
matchType          0
maxPlace           0
numGroups          0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
winPlacePerc       0
dtype: int64


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
assists,4398809.0,4.6e-05,0.0,-0.456787,-0.456787,-0.456787,-0.456787,2.205078
boosts,4398809.0,1.8e-05,0.0,-0.841797,-0.841797,-0.841797,1.157227,1.8125
damageDealt,4398809.0,3.1e-05,0.0,-1.394531,-1.394531,0.267334,0.76416,2.904297
DBNOs,4398809.0,3.2e-05,0.0,-0.765625,-0.765625,-0.765625,1.092773,1.804688
headshotKills,4398809.0,-4.7e-05,0.0,-0.446045,-0.446045,-0.446045,-0.446045,2.255859
heals,4398809.0,-0.000104,0.0,-0.793457,-0.793457,-0.793457,0.758301,1.783203
killPlace,4398809.0,2e-06,0.0,-2.033203,-0.810059,0.095825,0.847168,1.731445
killPoints,4398809.0,-0.000114,0.0,-0.821777,-0.821777,-0.821777,1.208984,1.296875
kills,4398809.0,0.000149,0.0,-0.843262,-0.843262,-0.843262,0.805664,1.834961
killStreaks,4398809.0,-0.000202,0.0,-0.855957,-0.855957,-0.855957,1.025391,1.927734


(3958928, 40) (3958928,) (439881, 40) (439881,)




printing MAE result.. 0.07698106641991691
--------------------------------------------------


In [56]:
for df_n in glob.glob('../Data/processed/dfd_cle_RR_viz_pow.csv'):
    print('df: {}'.format(df_n))
    name = df_n.split('\\')[-1]
    try:
        df = pd.read_csv(df_n, dtype=dtypes)
    except:
        df = pd.read_csv(df_n)
        df = reduce_mem_usage(df)
    
    
    df = quick_clean(df, df_n)
    display(df.describe().T)
    
    X = df.drop(['winPlacePerc'], axis=1)
    y = df['winPlacePerc']  
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    
    model_lgb_1.fit(X_train, y_train)
    expected = y_test.reset_index(drop=True)
    predicted = model_lgb_1.predict(X_test)

    MAE = mean_absolute_error(expected, predicted)
    print('printing MAE result..', MAE)
    MAE_diff_data_results[name] = MAE
    
    print('-' * 50)

df: ../Data/processed/dfd_cle_RR_viz_pow.csv
Memory usage of dataframe is 973.25 MB
Memory usage after optimization is: 343.99 MB
Decreased by 64.7%
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
matchDuration      0
matchType          0
maxPlace           0
numGroups          0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
winPlacePerc       0
dtype: int64


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
assists,4398809.0,4.6e-05,0.0,-0.456787,-0.456787,-0.456787,-0.456787,2.205078
boosts,4398809.0,1.8e-05,0.0,-0.841797,-0.841797,-0.841797,1.157227,1.8125
damageDealt,4398809.0,3.1e-05,0.0,-1.394531,-1.394531,0.267334,0.76416,2.904297
DBNOs,4398809.0,3.2e-05,0.0,-0.765625,-0.765625,-0.765625,1.092773,1.804688
headshotKills,4398809.0,-4.7e-05,0.0,-0.446045,-0.446045,-0.446045,-0.446045,2.255859
heals,4398809.0,-0.000104,0.0,-0.793457,-0.793457,-0.793457,0.758301,1.783203
killPlace,4398809.0,2e-06,0.0,-2.033203,-0.810059,0.095825,0.847168,1.731445
killPoints,4398809.0,-0.000114,0.0,-0.821777,-0.821777,-0.821777,1.208984,1.296875
kills,4398809.0,0.000149,0.0,-0.843262,-0.843262,-0.843262,0.805664,1.834961
killStreaks,4398809.0,-0.000202,0.0,-0.855957,-0.855957,-0.855957,1.025391,1.927734


(3958928, 25) (3958928,) (439881, 25) (439881,)




printing MAE result.. 0.06730961542130266
--------------------------------------------------


In [57]:
MAE_diff_data_results

{'../Data/processed/dfd_cle_RR_viz_pow.csv': 0.06730961542130266}

### training experiment 4

trying different train/test splits

In [77]:
model_lgb_data = lgb.LGBMRegressor(objective='regression', num_leaves=50,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11, silent=True)

#### %80/%20  (train/test)

In [100]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [101]:
model_lgb_data.fit(X_train, y_train)





In [102]:
expected = y_test.reset_index(drop=True)
predicted = model_lgb_data.predict(X_test)

MAE = mean_absolute_error(expected, predicted)
print('model results')
print("MAE: %.5f" % MAE) 

model results
MAE: 0.06181


#### %95/%5  (train/test)

In [97]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

In [98]:
model_lgb_data.fit(X_train, y_train)





In [99]:
expected = y_test.reset_index(drop=True)
predicted = model_lgb_data.predict(X_test)

MAE = mean_absolute_error(expected, predicted)
print('model results')
print("MAE: %.5f" % MAE) 

model results
MAE: 0.06187


#### %99/%1 (train/test)

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=42)

In [94]:
model_lgb_data.fit(X_train, y_train)





In [96]:
expected = y_test.reset_index(drop=True)
predicted = model_lgb_data.predict(X_test)

MAE = mean_absolute_error(expected, predicted)
print('model results')
print("MAE: %.5f" % MAE) 

model results
MAE: 0.06165


slight improvement in model performance when using %99/%1 train/test split. 

## 6- Final Results

- use light gradient boosting method (Lgbm)
- manually-tuned model (parameters from kaggle notebook)
- dfd_cle_RR_viz_pow.csv --> model data (note: cateogrical data needs to encoded)
- one-hot encoding is used
- 99% / 1% split is used

In [None]:
# visualizing in a plot
x_ax = range(len(y_test))
plt.figure(figsize=(12, 6))
plt.plot(x_ax, y_test, label="original")
plt.plot(x_ax, y_pred, label="predicted")
plt.title("Boston dataset test and predicted data")
plt.xlabel('X')
plt.ylabel('Price')
plt.legend(loc='best',fancybox=True, shadow=True)
plt.grid(True)
plt.show()  

## References

- https://www.kaggle.com/code/serigne/stacked-regressions-top-4-on-leaderboard
- https://towardsdatascience.com/what-are-rmse-and-mae-e405ce230383#:~:text=Technically%2C%20RMSE%20is%20the%20Root,actual%20values%20of%20a%20variable.
- https://machinelearningmastery.com/regression-metrics-for-machine-learning/