# Import - Preprocessing

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf

# Load the training table, order it by league so that rows of the same league are
# adjacent, and discard the columns that are not used further on.
dataset = (
    pd.read_csv('jobfair_train.csv')
    .sort_values(by='league_id')
    .drop(columns=['registration_country', 'registration_platform_specific', 'season'])
)

In [None]:
# Dynamic_Payment_Segment: encode the ordered payment labels as integers 0-4.
# Labels missing from the mapping become NaN. Re-running this cell on the already
# encoded column would therefore turn every value into NaN -- reload `dataset` first.
new_dynamic = {'0) NonPayer': 0, '1) ExPayer': 1, '2) Minnow': 2, '3) Dolphin': 3, '4) Whale': 4}
dataset['dynamic_payment_segment'] = dataset['dynamic_payment_segment'].map(new_dynamic)

# Global_Competition_Level: missing values are replaced with 0.
# The result is assigned back to the column: calling fillna(inplace=True) on a selected
# column is chained assignment, which pandas 2.x warns about and which no longer
# updates `dataset` once Copy-on-Write is enabled.
dataset['global_competition_level'] = dataset['global_competition_level'].fillna(0)

In [None]:
# Sanity check after preprocessing: table dimensions and whether any cell is still missing.
print(dataset.shape)
print(dataset.isna().any(axis=None))

(55314, 20)
False


# Splitting the dataset into new train, validation and test sets

In [None]:
def train_val_test_split_fast(data, group_size = 14, probs = (0.7, 0.15, 0.15), seed = None):
    """Randomly assign whole blocks of consecutive rows to train / validation / test.

    The rows are cut into consecutive blocks of ``group_size`` rows and every block is
    sent, as a unit, to one of the three splits. With the table sorted by ``league_id``
    this presumably keeps each league together -- that only holds if every league has
    exactly ``group_size`` rows (TODO confirm). Trailing rows that do not fill a complete
    block are dropped.

    Parameters
    ----------
    data : pandas.DataFrame or 2-D array-like
        Feature columns followed by the target in the LAST column.
    group_size : int, default 14
        Number of consecutive rows that must stay in the same split.
    probs : sequence of 3 floats, default (0.7, 0.15, 0.15)
        Probability of a block landing in train, validation and test respectively.
    seed : int or None, default None
        ``None`` draws from NumPy's global random state (so ``np.random.seed`` still
        controls it); an integer makes the split reproducible on its own.

    Returns
    -------
    X_train, y_train, X_val, y_val, X_test, y_test : numpy.ndarray
        Features (all columns but the last) and target (last column) of each split.
        A split that received no block is returned as an empty array instead of raising.

    Raises
    ------
    ValueError
        If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be a positive integer")

    X = data.to_numpy() if hasattr(data, "to_numpy") else np.asarray(data)
    n_groups = X.shape[0] // group_size

    if seed is None:
        list_of_choices = np.random.choice([0, 1, 2], size = n_groups, p = list(probs))
    else:
        list_of_choices = np.random.default_rng(seed).choice([0, 1, 2], size = n_groups, p = list(probs))

    # Expand the per-block label to one label per row; boolean masks keep the original
    # row order and, unlike np.concatenate on an empty list, cope with empty splits.
    row_labels = np.repeat(list_of_choices, group_size)
    X = X[:n_groups * group_size]
    train_set, val_set, test_set = (X[row_labels == label] for label in (0, 1, 2))

    return train_set[:, :-1], train_set[:, -1], val_set[:, :-1], val_set[:, -1], test_set[:,:-1], test_set[:, -1]

#X_train, y_train, X_val, y_val, X_test, y_test = train_val_test_split_fast(dataset)

# Post-processing

Functions for **post-processing predictions** and for comparing predictions by their **MAE** (mean absolute error).

In [None]:
import numpy as np
ex = [2, 8.4, 7, 12, 6.1] # result = [1, 4, 3, 5, 2]
ex2 = [6, 5, 3, 8, 9] # result = [3 ,2, 1, 4, 5]

def one_post_process(y_p):
    """Replace the scores of one group by their 1-based ranks (smallest score -> 1).

    Parameters
    ----------
    y_p : list or 1-D numpy.ndarray
        Raw scores of a single group.

    Returns
    -------
    list or numpy.ndarray
        A NEW container of the same length: a list of ints for non-array input, an
        array with the input's dtype for array input. The input is never modified
        (slicing an ndarray yields a view, so writing into ``y_p[:]`` would overwrite
        the caller's data).

    Equal scores are ranked in order of appearance (stable sort).
    """
    order = np.argsort(np.asarray(y_p), kind = "stable")

    if isinstance(y_p, np.ndarray):
        ranks = np.empty(len(y_p), dtype = y_p.dtype)
        ranks[order] = np.arange(1, len(y_p) + 1)
        return ranks

    ranks = [0] * len(y_p)
    for rank, position in enumerate(order, start = 1):
        ranks[position] = rank
    return ranks

#print(one_post_process(ex))
#print(one_post_process(ex2))
#print(ex)
#print(ex2)

In [None]:
import copy
example = ex + ex2
#print(example)

def post_sorting(y_pred, length):
    """Convert predictions to within-group ranks, group by group.

    The predictions are cut into consecutive groups of ``length`` entries and, inside
    each group, every score is replaced by its 1-based rank (smallest score -> 1, ties
    ranked in order of appearance). Entries after the last complete group are returned
    unchanged.

    Parameters
    ----------
    y_pred : list or numpy.ndarray
        Predictions, either 1-D or a single column of shape ``(n, 1)``.
    length : int
        Group size; must be at least 1.

    Returns
    -------
    list or numpy.ndarray
        A new list for list input, otherwise a new array with the shape and dtype of
        the input. ``y_pred`` itself is never modified.

    Raises
    ------
    ValueError
        If ``length`` is smaller than 1, or if ``y_pred`` has more than one column.
    """
    if length < 1:
        raise ValueError("length must be a positive integer")

    scores = np.asarray(y_pred)
    n_groups = len(scores) // length
    n_ranked = n_groups * length

    # One row per complete group. The reshape also flattens (n, 1) column vectors and
    # raises for genuinely multi-column input instead of ranking it incorrectly.
    groups = scores[:n_ranked].reshape(n_groups, length)
    order = np.argsort(groups, axis = 1, kind = "stable")
    # The argsort of a permutation is its inverse: position -> 0-based rank.
    ranks = np.argsort(order, axis = 1) + 1

    if isinstance(y_pred, list):
        return ranks.ravel().tolist() + list(y_pred[n_ranked:])

    ranked = scores.copy()
    ranked[:n_ranked] = ranks.reshape(ranked[:n_ranked].shape)
    return ranked

#print(post_sorting(example, 5))
#print(example)

In [None]:
y1 = [3,2,4,1]
y2 = [1,2,3,4]
y3 = [4,2,3,1]

def MAE(y, y_pred):
    """Mean absolute error between two equally shaped numeric sequences.

    Parameters
    ----------
    y, y_pred : array-like of numbers
        True and predicted values.

    Returns
    -------
    float
        ``mean(|y - y_pred|)``.

    Raises
    ------
    ValueError
        If the inputs differ in shape (a longer ``y_pred`` used to be truncated
        silently) or are empty.
    """
    y_true = np.asarray(y, dtype = float)
    y_hat = np.asarray(y_pred, dtype = float)
    if y_true.shape != y_hat.shape:
        raise ValueError(f"shape mismatch: {y_true.shape} vs {y_hat.shape}")
    if y_true.size == 0:
        raise ValueError("MAE is undefined for empty input")
    return float(np.mean(np.abs(y_true - y_hat)))

#print(MAE(y1,y2)); print(MAE(y1,y3)); print(MAE(y3,y2));

In [None]:
def MAX_MAE():
    """Print the MAE between the sequence 0..13 and the same sequence reversed.

    The fully reversed order is used here as the reference for the largest error.
    """
    ascending = list(range(14))
    descending = list(reversed(ascending))
    print(MAE(ascending, descending))

MAX_MAE()

7.0


In [None]:
def average_MAE_error():
    """Print the MAE averaged over 1000 pairs of independent random permutations of 1..14.

    Serves as the reference error of guessing a ranking completely at random.
    """
    n_trials = 1000
    total = 0
    for _ in range(n_trials):
        first = np.random.permutation(np.arange(1, 14 + 1))
        second = np.random.permutation(np.arange(1, 14 + 1))
        total += MAE(first, second)
    print("Average MAE error: ", total / n_trials)

average_MAE_error()

Average MAE error:  4.6568571428571435


In [None]:
def safe_MAE_error():
    """Print the MAE of always guessing a middle rank, averaged over 1000 trials.

    Each trial compares a random permutation of 1..14 with 14 guesses drawn uniformly
    from {7, 8}.
    """
    n_trials = 1000
    total = 0
    for _ in range(n_trials):
        true_ranks = np.random.permutation(np.arange(1, 14 + 1))
        middle_guess = np.random.choice(a = [7, 8], size = 14,  p = [0.5, 0.5])
        total += MAE(true_ranks, middle_guess)
    print("Average MAE error: ", total / n_trials)

safe_MAE_error()

Average MAE error:  3.502714285714283


# Evaluation - Regression

In [None]:
from sklearn.metrics import mean_absolute_error
def evaluate_regressor(regressor, X_val_test, y_true, verbose = 1, group_size = 14):
    """Score a fitted regressor by MAE, before and after rank post-processing.

    Parameters
    ----------
    regressor : fitted estimator exposing ``predict``
    X_val_test : array-like
        Features to predict on; rows must be ordered in consecutive groups of
        ``group_size`` because the ranking is applied block by block.
    y_true : array-like
        True targets, aligned with ``X_val_test``.
    verbose : int or bool, default 1
        Truthy: print both errors and return ``None``.
        Falsy: print nothing and return the two errors.
    group_size : int, default 14
        Block length handed to ``post_sorting``.

    Returns
    -------
    None or (float, float)
        ``(mae_raw, mae_ranked)`` when ``verbose`` is falsy.
    """
    y_pred = regressor.predict(X_val_test)
    y_pred_sorted = post_sorting(y_pred, group_size)

    mae_val_test = mean_absolute_error(y_true, y_pred)
    mae_val_test_sorted = mean_absolute_error(y_true, y_pred_sorted)

    if verbose:
        print(f"Mean Absolute Error - predictions: {mae_val_test}")
        print(f"Mean Absolute Error - sorted prediction: {mae_val_test_sorted}")
        return None
    return mae_val_test, mae_val_test_sorted

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
def regressor_average_evaluation(regressor, data, poly = 0, scaling = 1, verbose = 1, n_repeats = 5):
    """Average a regressor's MAE over several fresh random train/val/test splits.

    For every repetition a new split is drawn with ``train_val_test_split_fast``, the
    optional preprocessing is fitted on the training part only, ``regressor`` is refitted
    and scored with ``evaluate_regressor`` on all three parts. Validation and test errors
    are pooled into one "val_test" figure.

    Parameters
    ----------
    regressor : estimator exposing ``fit`` / ``predict``
        Refitted in place on every repetition.
    data : pandas.DataFrame
        Table accepted by ``train_val_test_split_fast`` (target in the last column).
    poly : int or bool, default 0
        Truthy: expand the features with degree-2 polynomial terms.
    scaling : int or bool, default 1
        Truthy: standardize the features.
    verbose : int or bool, default 1
        Truthy: print the four averages and return ``None``.
        Falsy: return them.
    n_repeats : int, default 5
        Number of random splits to average over.

    Returns
    -------
    None or tuple of 4 floats
        ``(train, train_ranked, val_test, val_test_ranked)`` mean MAEs when ``verbose``
        is falsy.

    Raises
    ------
    ValueError
        If ``n_repeats`` is smaller than 1.
    """
    if n_repeats < 1:
        raise ValueError("n_repeats must be a positive integer")

    sorted_average_val_test, average_val_test, average_train, sorted_average_train = 0,0,0,0

    n = n_repeats
    for _ in range(n):
        X_train, y_train, X_val, y_val, X_test, y_test = train_val_test_split_fast(data)
        if scaling:
            sc = StandardScaler()
            X_train = sc.fit_transform(X_train)
            X_val = sc.transform(X_val)
            X_test = sc.transform(X_test)

        if poly:
            polynomial = PolynomialFeatures(degree = 2)
            X_train = polynomial.fit_transform(X_train)
            X_val = polynomial.transform(X_val)
            X_test = polynomial.transform(X_test)

        regressor.fit(X_train, y_train)

        temp_train, temp_train_sorted = evaluate_regressor(regressor, X_train, y_train, 0)
        temp_val, temp_val_sorted = evaluate_regressor(regressor, X_val, y_val, 0)
        temp_test, temp_test_sorted = evaluate_regressor(regressor, X_test, y_test, 0)

        average_train += temp_train
        sorted_average_train += temp_train_sorted
        average_val_test = average_val_test + temp_val + temp_test
        sorted_average_val_test = sorted_average_val_test + temp_val_sorted + temp_test_sorted

    # Train sums hold n terms, val_test sums hold 2*n terms (validation + test).
    # The same averages feed both branches; the returned values used to be divided by
    # hard-coded 10 / 20, which halved them for n = 5.
    mean_train = average_train/n
    mean_train_sorted = sorted_average_train/n
    mean_val_test = average_val_test/n/2
    mean_val_test_sorted = sorted_average_val_test/2/n

    if verbose:
        print("Average MAE - train prediction: ", mean_train)
        print("Average MAE - sorted_train prediction: ", mean_train_sorted)
        print("Average MAE - val_test prediction: ", mean_val_test)
        print("Average MAE - sorted_val_test prediction: ", mean_val_test_sorted)
    else:
        return (mean_train, mean_train_sorted, mean_val_test, mean_val_test_sorted)


# Regression - template

Finding what works best.

In [None]:
# Draw one random train/validation/test split (whole 14-row blocks) shared by the
# single-fit experiments in this section. The split is not seeded, so numbers change per run.
X_train, y_train, X_val, y_val, X_test, y_test = train_val_test_split_fast(dataset)

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn mean/variance on the training features only, then apply the same
# standardization to all three splits.
sc = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    sc.transform(split_features) for split_features in (X_train, X_val, X_test)
)

## Multiple linear

In [None]:
from sklearn.linear_model import LinearRegression
# Baseline: ordinary least squares on the standardized training features.
l_regressor = LinearRegression()
l_regressor.fit(X_train_scaled, y_train)

In [None]:
# Report raw and rank-corrected MAE of the linear model on every split.
for split_label, X_split, y_split in (
    ("training set: ", X_train_scaled, y_train),
    ("validation set: ", X_val_scaled, y_val),
    ("test set: ", X_test_scaled, y_test),
):
    print(split_label)
    evaluate_regressor(l_regressor, X_split, y_split)

training set: 
Mean Absolute Error - predictions: 2.5323825765471253
Mean Absolute Error - sorted prediction: 2.4599065084501976
validation set: 
Mean Absolute Error - predictions: 2.5372734768550367
Mean Absolute Error - sorted prediction: 2.487857142857143
test set: 
Mean Absolute Error - predictions: 2.5381076907619264
Mean Absolute Error - sorted prediction: 2.4784461152882207


In [None]:
# Same linear model, averaged over 5 fresh random splits (standardized features, no polynomial terms).
regressor_average_evaluation(LinearRegression(), dataset)

Average MAE - train prediction:  2.5342722506366666
Average MAE - sorted_train prediction:  2.4704732724582583
Average MAE - val_test prediction:  2.5317387532837934
Average MAE - sorted_val_test prediction:  2.4653235194944783


## Polynomial

In [None]:
from sklearn.preprocessing import PolynomialFeatures
# Degree-2 polynomial expansion of the standardized training features,
# followed by an ordinary linear fit on the expanded features.
poly = PolynomialFeatures(degree = 2)
X_train_polynomial = poly.fit_transform(X_train_scaled)
p_regressor = LinearRegression()
p_regressor.fit(X_train_polynomial, y_train)

In [None]:
# Report raw and rank-corrected MAE of the polynomial model on every split; each
# split is expanded with the already fitted polynomial transformer right before scoring.
for split_label, X_split, y_split in (
    ("training set: ", X_train_scaled, y_train),
    ("validation set: ", X_val_scaled, y_val),
    ("test set: ", X_test_scaled, y_test),
):
    print(split_label)
    evaluate_regressor(p_regressor, poly.transform(X_split), y_split)

training set: 
Mean Absolute Error - predictions: 2.4178014273879835
Mean Absolute Error - sorted prediction: 2.276378240233662
validation set: 
Mean Absolute Error - predictions: 2.4227267200107336
Mean Absolute Error - sorted prediction: 2.3089983022071308
test set: 
Mean Absolute Error - predictions: 2.4728675938249394
Mean Absolute Error - sorted prediction: 2.334097684017427


In [None]:
# Polynomial (degree 2) linear model, averaged over 5 fresh random splits.
regressor_average_evaluation(LinearRegression(), dataset, poly = 1)

Average MAE - train prediction:  2.428622429754024
Average MAE - sorted_train prediction:  2.2988013734448476
Average MAE - val_test prediction:  2.7408785882248585
Average MAE - sorted_val_test prediction:  2.2974844185835495


## SVR

In [None]:
from sklearn.svm import SVR
# Support vector regression with an RBF kernel on the standardized training features.
regressor_svr = SVR(kernel = 'rbf')
regressor_svr.fit(X_train_scaled, y_train)

In [None]:
# Report raw and rank-corrected MAE of the SVR model on every split.
for split_label, X_split, y_split in (
    ("training set: ", X_train_scaled, y_train),
    ("validation set: ", X_val_scaled, y_val),
    ("test set: ", X_test_scaled, y_test),
):
    print(split_label)
    evaluate_regressor(regressor_svr, X_split, y_split)

training set: 
Mean Absolute Error - predictions: 2.2942395923130636
Mean Absolute Error - sorted prediction: 2.204362280878501
validation set: 
Mean Absolute Error - predictions: 2.3773764247262386
Mean Absolute Error - sorted prediction: 2.349040139616056
test set: 
Mean Absolute Error - predictions: 2.3641664945922822
Mean Absolute Error - sorted prediction: 2.2957301001581443


## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Single decision tree with the library's default settings. The 0.0 training MAE
# reported by the next cell shows that it memorizes the training split.
tree_regressor = DecisionTreeRegressor()
tree_regressor.fit(X_train_scaled, y_train)

In [None]:
# Report raw and rank-corrected MAE of the decision tree on every split.
for split_label, X_split, y_split in (
    ("training set: ", X_train_scaled, y_train),
    ("validation set: ", X_val_scaled, y_val),
    ("test set: ", X_test_scaled, y_test),
):
    print(split_label)
    evaluate_regressor(tree_regressor, X_split, y_split)

training set: 
Mean Absolute Error - predictions: 0.0
Mean Absolute Error - sorted prediction: 0.0
validation set: 
Mean Absolute Error - predictions: 3.0396555905893767
Mean Absolute Error - sorted prediction: 2.980354111084162
test set: 
Mean Absolute Error - predictions: 3.045975693648246
Mean Absolute Error - sorted prediction: 3.008254987388214


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with the library's default settings on the standardized training features.
forest_regressor = RandomForestRegressor()
forest_regressor.fit(X_train_scaled, y_train)

In [None]:
# Report raw and rank-corrected MAE of the random forest on every split.
for split_label, X_split, y_split in (
    ("training set: ", X_train_scaled, y_train),
    ("validation set: ", X_val_scaled, y_val),
    ("test set: ", X_test_scaled, y_test),
):
    print(split_label)
    evaluate_regressor(forest_regressor, X_split, y_split)

training set: 
Mean Absolute Error - predictions: 0.8373074101247249
Mean Absolute Error - sorted prediction: 0.38528456136673306
validation set: 
Mean Absolute Error - predictions: 2.277079734219269
Mean Absolute Error - sorted prediction: 2.239424141749723
test set: 
Mean Absolute Error - predictions: 2.238043103448276
Mean Absolute Error - sorted prediction: 2.1982758620689653


In [None]:
# Default random forest, averaged over 5 fresh random splits.
regressor_average_evaluation(RandomForestRegressor(), dataset)

Average MAE - train prediction:  0.8378484133664056
Average MAE - sorted_train prediction:  0.38142265156925415
Average MAE - val_test prediction:  2.250839354662491
Average MAE - sorted_val_test prediction:  2.2192816052006425


## XGBoost

In [None]:
from xgboost import XGBRegressor
# Gradient-boosted trees (XGBoost) with the library's default settings.
xg_regressor = XGBRegressor()
xg_regressor.fit(X_train_scaled, y_train)

In [None]:
# Report raw and rank-corrected MAE of the default XGBoost model on every split.
for split_label, X_split, y_split in (
    ("training set: ", X_train_scaled, y_train),
    ("validation set: ", X_val_scaled, y_val),
    ("test set: ", X_test_scaled, y_test),
):
    print(split_label)
    evaluate_regressor(xg_regressor, X_split, y_split)

training set: 
Mean Absolute Error - predictions: 1.7854937626693752
Mean Absolute Error - sorted prediction: 1.5781364636830522
validation set: 
Mean Absolute Error - predictions: 2.2565738776199527
Mean Absolute Error - sorted prediction: 2.2387596899224804
test set: 
Mean Absolute Error - predictions: 2.2095824867051115
Mean Absolute Error - sorted prediction: 2.1889162561576354


In [None]:
# Default XGBoost model, averaged over 5 fresh random splits.
regressor_average_evaluation(XGBRegressor(), dataset)

Average MAE - train prediction:  1.789427088421571
Average MAE - sorted_train prediction:  1.5896532645713317
Average MAE - val_test prediction:  2.2299150324133796
Average MAE - sorted_val_test prediction:  2.201777188448527


In [None]:
# Pair every XGBoost importance score with its column name and list them from most to
# least important. The model was fitted on all columns except the last one, so removing
# 'league_rank' assumes it is the target stored in the last column -- TODO confirm.
feature_importances = xg_regressor.feature_importances_
column_names = dataset.columns.tolist()
column_names.remove('league_rank')
feature_importance_df = pd.DataFrame(
    {'Feature': column_names, 'Importance': feature_importances}
).sort_values('Importance', ascending=False)

print(feature_importance_df)

                                    Feature  Importance
11                    playtime_last_28_days    0.338674
10               session_count_last_28_days    0.162261
12      league_match_won_count_last_28_days    0.074617
3                             cohort_season    0.066750
8                  days_active_last_28_days    0.057950
1                                 league_id    0.052451
5                  avg_stars_top_11_players    0.036034
6                  avg_stars_top_14_players    0.028760
13              training_count_last_28_days    0.025046
15                tokens_spent_last_28_days    0.021019
0                                   club_id    0.020250
4                    avg_age_top_11_players    0.017738
14                 global_competition_level    0.017075
7        avg_training_factor_top_11_players    0.015635
18                    morale_boosters_stash    0.014859
2                   dynamic_payment_segment    0.014054
16                             tokens_stash    0

# Hyperparameter tuning - XGBoost

In [None]:
# Draw a new random split for the tuning experiments; it replaces the split used in the
# previous section, and it is not seeded, so tuning results vary between runs.
X_train, y_train, X_val, y_val, X_test, y_test = train_val_test_split_fast(dataset)

In [None]:
from sklearn.preprocessing import StandardScaler
# Re-fit the scaler on the new training split and standardize all three splits with it.
sc = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    sc.transform(split_features) for split_features in (X_train, X_val, X_test)
)

In [None]:
from xgboost import XGBRegressor
# Untuned XGBoost baseline on the new split, for comparison with the tuned models below.
xg_regressor = XGBRegressor()
xg_regressor.fit(X_train_scaled, y_train)

In [None]:
# Report raw and rank-corrected MAE of the untuned baseline on every split.
for split_label, X_split, y_split in (
    ("training set: ", X_train_scaled, y_train),
    ("validation set: ", X_val_scaled, y_val),
    ("test set: ", X_test_scaled, y_test),
):
    print(split_label)
    evaluate_regressor(xg_regressor, X_split, y_split)

training set: 
Mean Absolute Error - predictions: 1.772059161440969
Mean Absolute Error - sorted prediction: 1.568808301016665
validation set: 
Mean Absolute Error - predictions: 2.249059246978413
Mean Absolute Error - sorted prediction: 2.1890354868061874
test set: 
Mean Absolute Error - predictions: 2.2437429011454633
Mean Absolute Error - sorted prediction: 2.184254606365159


## Tuning structural hyperparameters

In [None]:
from itertools import product

# Tree hyperparams
n_estimator_values = [30, 40, 55, 75, 85, 100]
max_depth_values = [4,5,6,7,8,9,10]
learning_rate_values = [0.05, 0.1, 0.12, 0.15]
min_child_weight = [1, 5, 10, 20, 50, 100]
subsample_values = [0.6, 0.7, 0.8, 0.9, 1.0]

# Exhaustive grid search: 4 * 6 * 5 * 7 * 6 = 5040 model fits on the current split.
# Each combination is scored by the mean of its validation and test MAE, once on the
# raw predictions (`params`) and once on the rank-corrected ones (`params_sorted`).
# NOTE(review): the test split takes part in model selection here, so test errors
# reported for the selected settings are optimistic -- consider selecting on
# validation only.
params = {}
params_sorted = {}
# The loop variable is called `mcw`, not `min`: at cell level `min` would replace the
# builtin for the rest of the kernel session.
for lr, n, sub, depth, mcw in product(learning_rate_values, n_estimator_values, subsample_values,
                                      max_depth_values, min_child_weight):
    # `learning_rate` is the scikit-learn wrapper's documented name for xgboost's `eta`.
    model = XGBRegressor(n_estimators = n, learning_rate = lr, max_depth = depth,
                         subsample = sub, min_child_weight = mcw)
    model.fit(X_train_scaled, y_train)
    t_val, t_val_sorted = evaluate_regressor(model, X_val_scaled, y_val, 0)
    t_test, t_test_sorted = evaluate_regressor(model, X_test_scaled, y_test, 0)
    params[(lr, n, sub, depth, mcw)] = (t_val + t_test)/2
    params_sorted[(lr, n, sub, depth, mcw)] = (t_val_sorted + t_test_sorted)/2

# Re-order both result dicts from lowest (best) to highest error.
params = dict(sorted(params.items(), key=lambda item: item[1], reverse=False))
params_sorted = dict(sorted(params_sorted.items(), key=lambda item: item[1], reverse=False))
print(params)
print(params_sorted)

## Tuning regularization hyperparameters

In [None]:
from itertools import product

# Regularization hyperparams
# (0.15 was listed twice in reg_lambda_values, which refitted 462 identical models.)
reg_lambda_values = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]
reg_alpha_values = [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3]
gamma_values = [0, 0.01, 0.05, 0.1, 0.2, 0.5, 0.8, 1.5, 2, 3, 4]
learning_rate_values = [0.05, 0.08, 0.1, 0.12, 0.15, 0.2]

# Grid search over the regularization terms with the tree structure held fixed
# (100 trees, depth 7, subsample 0.9, min_child_weight 50).
# Each combination is scored by the mean of its validation and test MAE, on raw
# (`params2`) and rank-corrected (`params_sorted2`) predictions.
# NOTE(review): as in the structural search, the test split influences the selection.
params2 = {}
params_sorted2 = {}
for lr, lam, alpha, gamma in product(learning_rate_values, reg_lambda_values,
                                     reg_alpha_values, gamma_values):
    # `learning_rate` is the scikit-learn wrapper's documented name for xgboost's `eta`.
    model = XGBRegressor(n_estimators = 100, max_depth = 7,
                         subsample = 0.9, min_child_weight = 50, gamma = gamma,
                         learning_rate = lr, reg_lambda = lam, reg_alpha = alpha)
    model.fit(X_train_scaled, y_train)
    t_val, t_val_sorted = evaluate_regressor(model, X_val_scaled, y_val, 0)
    t_test, t_test_sorted = evaluate_regressor(model, X_test_scaled, y_test, 0)
    params2[(lr, lam, alpha, gamma)] = (t_val + t_test)/2
    params_sorted2[(lr, lam, alpha, gamma)] = (t_val_sorted + t_test_sorted)/2

# Re-order both result dicts from lowest (best) to highest error.
params2 = dict(sorted(params2.items(), key=lambda item: item[1], reverse=False))
params_sorted2 = dict(sorted(params_sorted2.items(), key=lambda item: item[1], reverse=False))
print(params2)
print(params_sorted2)


In [None]:
# Refit one candidate configuration on the current split and report raw and
# rank-corrected MAE on every split.
best_model = XGBRegressor(reg_lambda = 0.1, learning_rate = 0.12, gamma = 0.1)
best_model.fit(X_train_scaled, y_train)
for split_label, X_split, y_split in (
    ("training set: ", X_train_scaled, y_train),
    ("validation set: ", X_val_scaled, y_val),
    ("test set: ", X_test_scaled, y_test),
):
    print(split_label)
    evaluate_regressor(best_model, X_split, y_split)

training set: 
Mean Absolute Error - predictions: 2.024507701440614
Mean Absolute Error - sorted prediction: 1.8533167237620678
validation set: 
Mean Absolute Error - predictions: 2.1838717986559892
Mean Absolute Error - sorted prediction: 2.1301158301158303
test set: 
Average MAE - train prediction:  2.0062826482987695
Average MAE - sorted_train prediction:  1.8266563963268099
Average MAE - val_test prediction:  2.229400074263152
Average MAE - sorted_val_test prediction:  2.1670806222063055


In [None]:
# Same candidate configuration, averaged over 5 fresh random splits.
regressor_average_evaluation(XGBRegressor(reg_lambda = 0.1, learning_rate = 0.12, gamma = 0.1), dataset)

Average MAE - train prediction:  2.0090269532250673
Average MAE - sorted_train prediction:  1.8296932705414601
Average MAE - val_test prediction:  2.2272284334588455
Average MAE - sorted_val_test prediction:  2.162692309422116


In [None]:
# Candidate: depth 6, min_child_weight 20, 85 trees, L2 = 0.4, L1 = 0.3 (5-split average).
regressor_average_evaluation(
    XGBRegressor(
        reg_lambda = 0.4,
        learning_rate = 0.1,
        gamma = 0,
        reg_alpha = 0.3,
        subsample = 0.9,
        max_depth = 6,
        min_child_weight = 20,
        n_estimators = 85,
    ),
    dataset,
)

Average MAE - train prediction:  2.081430971169642
Average MAE - sorted_train prediction:  1.9244511238474709
Average MAE - val_test prediction:  2.233603041230944
Average MAE - sorted_val_test prediction:  2.156749363763219


In [None]:
# Candidate: depth 6, min_child_weight 20, 85 trees, L2 = 0.2, L1 = 0.1 (5-split average).
regressor_average_evaluation(
    XGBRegressor(
        reg_lambda = 0.2,
        learning_rate = 0.1,
        gamma = 0,
        reg_alpha = 0.1,
        subsample = 0.9,
        max_depth = 6,
        min_child_weight = 20,
        n_estimators = 85,
    ),
    dataset,
)

Average MAE - train prediction:  2.085049953595433
Average MAE - sorted_train prediction:  1.9318278724788993
Average MAE - val_test prediction:  2.232768177968383
Average MAE - sorted_val_test prediction:  2.1659460811961884


In [None]:
# Candidate: depth 7, min_child_weight 50, 85 trees, L2 = 0.2, no L1 (5-split average).
regressor_average_evaluation(
    XGBRegressor(
        reg_lambda = 0.2,
        learning_rate = 0.1,
        gamma = 0,
        reg_alpha = 0,
        subsample = 0.9,
        max_depth = 7,
        min_child_weight = 50,
        n_estimators = 85,
    ),
    dataset,
)

Average MAE - train prediction:  2.044991320698957
Average MAE - sorted_train prediction:  1.8800928534864432
Average MAE - val_test prediction:  2.2045799147679217
Average MAE - sorted_val_test prediction:  2.1328549303493203


In [None]:
# Depth-7 candidate again, with the scaling flag passed explicitly.
# NOTE(review): scaling = 1 is already the default of regressor_average_evaluation, so
# this repeats the previous experiment on new random splits. If the aim was to measure
# the effect of standardization, scaling = 0 was presumably intended -- confirm.
regressor_average_evaluation(
    XGBRegressor(
        reg_lambda = 0.2,
        learning_rate = 0.1,
        gamma = 0,
        reg_alpha = 0,
        subsample = 0.9,
        max_depth = 7,
        min_child_weight = 50,
        n_estimators = 85,
    ),
    dataset,
    scaling = 1,
)

Average MAE - train prediction:  2.039314265091163
Average MAE - sorted_train prediction:  1.8759819010284147
Average MAE - val_test prediction:  2.21082814391637
Average MAE - sorted_val_test prediction:  2.137104371572619


## Early stopping

In [None]:
# Draw a new random split for the early-stopping experiment (replaces the previous split).
X_train, y_train, X_val, y_val, X_test, y_test = train_val_test_split_fast(dataset)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize with statistics learned on the training split. The scaled arrays replace
# X_train / X_val / X_test, so run the split cell again before re-running this one.
sc = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    sc.transform(split_features) for split_features in (X_train, X_val, X_test)
)

In [None]:
import xgboost as xgb
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

# Squared-error objective, monitored with MAE on the evaluation sets.
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    'eta': 0.1,
    'max_depth': 7,
    'min_child_weight': 50,
    'subsample': 0.9,
    'nthread': 4,
}

early_stopping_rounds = 40

# Up to 1000 boosting rounds; training stops once the metric on the LAST entry of
# `evals` (the validation set) has not improved for `early_stopping_rounds` rounds.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dtrain, 'train'), (dval, 'val')],
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=True
)

# Predict with the trees up to and including the best validation round found by early
# stopping. A hard-coded (0, 100) ignored that result and could even point past the
# last trained round.
best_range = (0, model.best_iteration + 1)

y_pred_train = model.predict(dtrain, iteration_range=best_range)
y_pred_train_sorted = post_sorting(y_pred_train, 14)
y_pred_val = model.predict(dval, iteration_range=best_range)
y_pred_val_sorted = post_sorting(y_pred_val, 14)
y_pred_test = model.predict(dtest, iteration_range=best_range)
y_pred_test_sorted = post_sorting(y_pred_test, 14)

# NOTE: the `mse_*` names below hold mean ABSOLUTE errors; the names are kept so that
# other cells referring to them keep working.
from sklearn.metrics import mean_absolute_error
print("train set")
mse_train = mean_absolute_error(y_train, y_pred_train)
mse_train_sorted = mean_absolute_error(y_train, y_pred_train_sorted)
print("pred: ", mse_train)
print("sorted pred: ", mse_train_sorted)

print("val set")
mse_val = mean_absolute_error(y_val, y_pred_val)
mse_val_sorted = mean_absolute_error(y_val, y_pred_val_sorted)
print("pred: ", mse_val)
print("sorted pred: ", mse_val_sorted)

print("test set")
mse_test = mean_absolute_error(y_test, y_pred_test)
mse_test_sorted = mean_absolute_error(y_test, y_pred_test_sorted)
print("pred: ", mse_test)
print("sorted pred: ", mse_test_sorted)