## 1- Imports

In [177]:
# import warnings
# warnings.filterwarnings("ignore")

In [178]:
import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score

In [179]:
# Load the processed CSV; the relative path is resolved against the
# notebook's current working directory.
processed_csv = "../Data/processed/dfd_cle_RR_viz_pow_dr_enc_feng.csv"
history_df = pd.read_csv(filepath_or_buffer=processed_csv)

In [180]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds their range.

    The frame is modified in place (columns are reassigned on ``df`` itself)
    and the same object is returned. Memory usage before and after the
    conversion is printed.

    Only NumPy signed-integer, unsigned-integer and floating-point columns are
    converted. Every other dtype (bool, object/string, datetime, category,
    pandas extension dtypes) is left exactly as it is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.
    allow_float16 : bool, default True
        When True, float columns whose range fits are stored as ``float16``.
        ``float16`` keeps only about three significant decimal digits, so
        pass False to stop at ``float32`` when precision matters.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target dtypes, ordered from narrowest to widest.
    int_candidates = {
        'i': (np.int8, np.int16, np.int32),
        'u': (np.uint8, np.uint16, np.uint32),
    }
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Skip anything that is not a plain NumPy int/uint/float column.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        if col_type.kind == 'f':
            candidates, limits_of = float_candidates, np.finfo
        else:
            candidates, limits_of = int_candidates[col_type.kind], np.iinfo

        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            # Never widen a column: stop once the candidate is not narrower.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the dtype's min/max still fits.
            # NaN min/max (empty or all-NaN column) fail both comparisons, so
            # such columns are left unchanged.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-byte starting size.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [181]:
# reduce_mem_usage converts the columns of the frame it is given and returns
# that same frame, so the name is simply rebound to it.
history_df = reduce_mem_usage(df=history_df)

Memory usage of dataframe is 872.57 MB
Memory usage after optimization is: 213.95 MB
Decreased by 75.5%


## 2- Preprocessing

In [182]:
# No subsampling happens here: history_df_frac is bound to the same DataFrame
# object as history_df, so the two printed shapes are identical.
history_df_frac = history_df
for frame in (history_df, history_df_frac):
    print(frame.shape)

(4398809, 26)
(4398809, 26)


In [183]:
# Separate the regression target from the feature columns.
target_column = 'winPlacePerc'
y = history_df_frac[target_column]
X = history_df_frac.drop(columns=[target_column])

In [184]:
# Hold out 20% of the rows for final evaluation; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((3519047, 25), (3519047,), (879762, 25), (879762,))

## 3- Model Building 

**NOTE** Even though we reduced the skewness of the data by removing a significant number of outliers, the data is still fairly skewed and there are still a number of data points that could be considered outliers. We deliberately did not remove every potential outlier, so as not to cut so much data that the model's predictions would be affected.

In [185]:
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.kernel_ridge import KernelRidge

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb

from sklearn.svm import SVR

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel

from sklearn.neighbors import KNeighborsRegressor

# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import RobustScaler
# from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
# from sklearn.model_selection import KFold, cross_val_score, train_test_split
# from sklearn.metrics import mean_squared_error

# from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
# from sklearn.kernel_ridge import KernelRidge

In [186]:
#Validation function
n_folds = 5

# Cross-validation results, keyed by model name. Re-running this cell resets them.
rmse_cv = {}   # mean RMSE over the folds
acc_cv = {}    # mean R^2 over the folds (the default regressor score)

def rmsle_cv(model, model_name=None):
    """Cross-validate ``model`` on the training split and record its mean scores.

    Despite the name, no logarithm is applied: the metric is the plain RMSE
    (square root of the negated ``neg_mean_squared_error`` score).

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor or pipeline.
    model_name : str, optional
        Key under which the mean scores are stored in ``rmse_cv`` and
        ``acc_cv``. Defaults to the estimator's class name.

    Returns
    -------
    numpy.ndarray
        RMSE of each of the ``n_folds`` folds.
    """
    # Local import so this cell does not depend on the notebook's import cell
    # being extended.
    from sklearn.model_selection import cross_validate

    if model_name is None:
        model_name = type(model).__name__

    # Pass the splitter itself. Calling .get_n_splits() on it returns the plain
    # integer n_folds, which would silently discard shuffle and random_state.
    kf = KFold(n_folds, shuffle=True, random_state=42)

    # One cross-validation pass yields both metrics, so each fold is fitted
    # once rather than once per metric.
    results = cross_validate(
        model,
        X_train.values,
        y_train,
        scoring={"neg_mse": "neg_mean_squared_error", "r2": "r2"},
        cv=kf,
    )
    rmse = np.sqrt(-results["test_neg_mse"])
    acc = results["test_r2"]

    rmse_cv[model_name] = rmse.mean()
    acc_cv[model_name] = acc.mean()

    return rmse

### 3.1 linear regression

This model is sensitive to outliers, so we use scikit-learn's `RobustScaler()` as the first step of the pipeline to account for them.

In [187]:
# Two-step pipeline: robust feature scaling followed by ordinary least squares.
linear = make_pipeline(
    RobustScaler(),
    LinearRegression(fit_intercept=True),
)

In [188]:
# Per-fold RMSE for the linear pipeline, reported as mean (std).
score = rmsle_cv(model=linear, model_name='Linear Regression')
fold_mean, fold_std = score.mean(), score.std()
print("\nLinear score: {:.4f} ({:.4f})\n".format(fold_mean, fold_std))


Linear score: 0.1441 (0.0012)



### 3.2 Lasso Regression

This model is sensitive to outliers, so we use scikit-learn's `RobustScaler()` as the first step of the pipeline to account for them.

In [189]:
# Two-step pipeline: robust feature scaling followed by an L1-penalised linear model.
lasso = make_pipeline(
    RobustScaler(),
    Lasso(alpha=0.0005, random_state=42),
)

In [190]:
# Per-fold RMSE for the Lasso pipeline, reported as mean (std).
score = rmsle_cv(model=lasso, model_name='Lasso Regression')
fold_mean, fold_std = score.mean(), score.std()
print("\nLasso score: {:.4f} ({:.4f})\n".format(fold_mean, fold_std))


Lasso score: 0.1381 (0.0001)



### 3.3 Elastic Net 

This model is sensitive to outliers, so we use scikit-learn's `RobustScaler()` as the first step of the pipeline to account for them.

In [191]:
# Two-step pipeline: robust feature scaling followed by an elastic-net model
# whose penalty is weighted 90% towards L1 (l1_ratio=.9).
ENet = make_pipeline(
    RobustScaler(),
    ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3),
)

In [192]:
# Per-fold RMSE for the elastic-net pipeline, reported as mean (std).
score = rmsle_cv(model=ENet, model_name='Elastic Net')
fold_mean, fold_std = score.mean(), score.std()
print("\nElastic Net score: {:.4f} ({:.4f})\n".format(fold_mean, fold_std))


Elastic Net score: 0.1380 (0.0001)



### 3.4 Kernel Ridge

Kernel ridge regression (KRR) combines ridge regression (linear least squares with l2-norm regularization) with the kernel trick. It thus learns a linear function in the space induced by the respective kernel and the data. 

In [193]:
# Kernel ridge regression with a degree-2 polynomial kernel.
# Defined only; the notebook does not fit it (see the note below).
KRR = KernelRidge(
    kernel='polynomial',
    degree=2,
    coef0=2.5,
    alpha=0.6,
)

In [194]:
# score = rmsle_cv(KRR)
# print("\nKernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

**NOTE** this model takes too much memory!

### 3.5 Gradient Boosting Regression

Uses the Huber loss, which makes the model more robust to outliers.

In [195]:
# Gradient boosting with the Huber loss: 3000 shallow trees and a small
# learning rate. Defined only; the notebook does not fit it (see the note below).
GBoost = GradientBoostingRegressor(
    loss='huber',
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    random_state=42,
)

In [196]:
# score = rmsle_cv(GBoost)
# print("\nGradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

**NOTE** model takes too long in training!

### 3.6 XGBoost

In [197]:
# XGBoost regressor. Defined only; the notebook does not fit it (see the note below).
# NOTE(review): `silent` and `nthread` look like legacy XGBoost parameter
# names (newer releases use `verbosity` and `n_jobs`) - confirm against the
# installed version before running.
model_xgb = xgb.XGBRegressor(
    n_estimators=2200,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1.7817,
    gamma=0.0468,
    subsample=0.5213,
    colsample_bytree=0.4603,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    silent=1,
    random_state=7,
    nthread=-1,
)

In [198]:
# score = rmsle_cv(model_xgb)
# print("\nExtreme Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

**NOTE** model takes too long in training!

### 3.7 LightGBM

LightGBM is a gradient boosting framework based on decision trees, designed to increase training efficiency and reduce memory usage.

In [199]:
# LightGBM regressor.
# `verbose=-1` replaces the former `silent=True`: the `silent` argument was
# removed from LightGBM's scikit-learn API, while `verbose=-1` suppresses the
# training log on both old and new releases.
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=50,
    learning_rate=0.05,
    n_estimators=720,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
    verbose=-1,
)

In [200]:
# params = {
#         "objective" : "regression", 
#         "metric" : "mae", 
#         "num_leaves" : 149, 
#         "learning_rate" : 0.03, 
#         "bagging_fraction" : 0.9,
#         "bagging_seed" : 0, 
#         "num_threads" : 4,
#         "colsample_bytree" : 0.5,
#         'min_data_in_leaf':1900, 
#         'min_split_gain':0.00011,
#         'lambda_l2':9
# }

# model = lgb.train(  params, 
#                     train_set = train_set,
#                     num_boost_round=9400,
#                     early_stopping_rounds=200,
#                     verbose_eval=100, 
#                     valid_sets=[train_set,valid_set]
#                   )

In [201]:
# Per-fold RMSE for the LightGBM model, reported as mean (std).
score = rmsle_cv(model=model_lgb, model_name='Light Gradient Boosting')
fold_mean, fold_std = score.mean(), score.std()
print("\nLight Gradient Boosting score: {:.4f} ({:.4f})\n".format(fold_mean, fold_std))


Light Gradient Boosting score: 0.0796 (0.0001)



### 3.8 SVM regressor

In [202]:
# Support vector regressor. Defined only; the notebook does not fit it
# (see the note below).
SVMR = SVR(
    epsilon=0.2,
    C=1.0,
)

In [203]:
# score = rmsle_cv(SVMR, 'Support Vector Machine')
# print("\nSVM Regressor score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

**NOTE** model takes too long in training!

### 3.9 Decision Tree

In [204]:
# Single decision tree with default settings; the seed fixes its random choices.
DT = DecisionTreeRegressor(
    random_state=42,
)

In [205]:
# Per-fold RMSE for the decision tree, reported as mean (std).
score = rmsle_cv(model=DT, model_name='Decision Tree')
fold_mean, fold_std = score.mean(), score.std()
print("\nDecision Tree score: {:.4f} ({:.4f})\n".format(fold_mean, fold_std))


Decision Tree score: 0.1016 (0.0001)



### 3.10 random forest

In [206]:
# Random forest of very shallow trees (max_depth=2).
# n_jobs=-1 builds the trees on all available cores, because single-threaded
# training was reported as too slow on this dataset. The seed is unchanged.
randf = RandomForestRegressor(
    max_depth=2,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Per-fold RMSE for the random forest, reported as mean (std).
score = rmsle_cv(model=randf, model_name='Random Forest')
fold_mean, fold_std = score.mean(), score.std()
print("\nRandom Forest score: {:.4f} ({:.4f})\n".format(fold_mean, fold_std))

**NOTE** model takes too long in training!

### 3.11 Bayesian Regression 

In [None]:
# Gaussian-process regressor whose kernel is the sum of a dot-product term and
# a white-noise term. Defined only; the notebook does not fit it (see the note below).
kernel = DotProduct() + WhiteKernel()
gpr = GaussianProcessRegressor(
    kernel=kernel,
    random_state=42,
)

In [None]:
# score = rmsle_cv(gpr)
# print("\nBayesian Regression score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

**NOTE** model takes too much memory

### 3.12 KNN 

In [None]:
# k-nearest-neighbours regressor using the 5 closest training points.
# Defined only; the notebook does not fit it (see the note below).
neigh = KNeighborsRegressor(
    n_neighbors=5,
)

In [None]:
# score = rmsle_cv(neigh)
# print("\nK-Nearest Neighbors score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

**NOTE** takes too long to run

## 4- Model Comparisons

**metrics to use**
- root mean squared error
- mean squared error
- mean absolute error

In [None]:
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
eval_scores = {}

def show_rmse(predicted, expected, max_ticks=20):
    """Plot the squared error of every prediction, in sample order.

    Despite the name, no root or mean is taken: each plotted point is
    ``(expected - predicted) ** 2`` for one sample.

    Parameters
    ----------
    predicted : array-like
        Model predictions.
    expected : array-like
        Ground-truth values, paired with ``predicted`` by position.
    max_ticks : int, default 20
        Upper bound on the number of x-axis ticks. Each tick is labelled with
        the predicted value at that position. Inputs with at most
        ``max_ticks`` samples get one tick per sample.

    Raises
    ------
    ValueError
        If ``predicted`` and ``expected`` differ in length.
    """
    # Work on plain arrays so pairing is positional. Label-based `expected[i]`
    # on a pandas Series fails or mis-pairs when its index is not 0..n-1.
    predicted_values = np.asarray(predicted).ravel()
    expected_values = np.asarray(expected).ravel()
    if predicted_values.size != expected_values.size:
        raise ValueError(
            'predicted and expected must have the same length, got {} and {}'
            .format(predicted_values.size, expected_values.size)
        )

    # Compute in float64 so narrow input dtypes do not limit the result.
    errors = (expected_values.astype(float) - predicted_values.astype(float)) ** 2
    positions = np.arange(errors.size)

    # One tick per sample is unreadable and very slow on large inputs, so thin
    # the ticks out to evenly spaced positions.
    if errors.size > max_ticks:
        tick_positions = np.linspace(0, errors.size - 1, max_ticks).astype(int)
    else:
        tick_positions = positions

    # plot errors
    plt.plot(positions, errors)
    plt.xticks(ticks=tick_positions, labels=predicted_values[tick_positions])
    plt.xlabel('Predicted Value')
    plt.ylabel('Squared Error')
    plt.show()

def evaluating_predictions(model, model_name):
    """Fit ``model`` on the training split and score it on the hold-out split.

    The RMSE, MSE and MAE on ``X_test`` / ``y_test`` are printed and stored in
    the module-level ``eval_scores`` dict under ``model_name``.

    Parameters
    ----------
    model : estimator
        scikit-learn compatible regressor or pipeline; it is (re)fitted here.
    model_name : str
        Key for ``eval_scores`` and label used in the printed summary.
    """
    model.fit(X_train, y_train)

    expected = y_test.reset_index(drop=True)
    predicted = model.predict(X_test)

    MSE = mean_squared_error(expected, predicted)
    # Take the root ourselves: the `squared=False` argument is deprecated and
    # has been removed from recent scikit-learn releases.
    RMSE = np.sqrt(MSE)
    MAE = mean_absolute_error(expected, predicted)

    # eval_scores is only mutated, never rebound, so no `global` is needed.
    eval_scores[model_name] = {'RMSE': RMSE, 'MSE': MSE, 'MAE': MAE}

    print('model {} ---> RMSE: {:.5f} / MSE: {:.5f} / MAE: {:.5f}'
          .format(model_name, RMSE, MSE, MAE))
    
    
    

In [None]:
# Fit and score every tractable model, in this order. The commented-out
# entries are the models skipped for memory or run-time reasons.
models_to_evaluate = [
    (linear, 'Linear Regression'),
    (lasso, 'Lasso Regression'),
    (ENet, 'Elastic Net'),
    # (KRR, 'Kernel Ridge Regression'),  # takes too much memory
    # (GBoost, 'Gradient Boosting'),  # takes too long
    # (model_xgb, 'Extreme Gradient Boosting'),  # takes too long
    (model_lgb, 'Light Gradient Boosting'),
    # (SVMR, 'Support Vector Machine'),  # takes too long
    (DT, 'Decision Tree'),
    (randf, 'Random Forest'),
    # (gpr, 'Gaussian Process Regressor'),  # takes too much memory
    # (neigh, 'K-Nearest Neigbors'),  # takes too long
]

for estimator, label in models_to_evaluate:
    evaluating_predictions(estimator, label)

In [None]:
# making of dataframe for clearer view of overall results
# One row per model evaluated on the hold-out split; columns are its metrics.
eval_results_df = pd.DataFrame.from_dict(
    eval_scores,
    orient='index',
    columns=['RMSE', 'MSE', 'MAE'],
)

# Attach the cross-validated RMSE by model name rather than by position, so a
# model that was evaluated but not cross-validated (or vice versa) gets NaN
# instead of shifting or truncating the other rows.
eval_results_df['RMSE_cv'] = (
    pd.Series(rmse_cv, dtype='float64').reindex(eval_results_df.index)
)

eval_results_df

## References

- https://www.kaggle.com/code/serigne/stacked-regressions-top-4-on-leaderboard
- https://towardsdatascience.com/what-are-rmse-and-mae-e405ce230383#:~:text=Technically%2C%20RMSE%20is%20the%20Root,actual%20values%20of%20a%20variable.
- https://machinelearningmastery.com/regression-metrics-for-machine-learning/