# Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf

In [2]:
%%capture
# This GitHub gist URL holds all the functions and imports I reuse across many notebooks.
!wget -O package.py https://gist.githubusercontent.com/Jankoetf/c36cc24ddd83d4194148a86f87efd397/raw/adf1e7c72dfe8db685ad936f8882d42ac85ae5b7/package1.py
import package

In [3]:
# Read the raw training table and order it by league before any preprocessing.
dataset = pd.read_csv('jobfair_train.csv').sort_values(by='league_id')
# Run the shared gist helpers in sequence; each receives the current table and
# its return value replaces it.
for step in (package.basic_preprocessing, package.Feature_Selection, package.Averaging_by_leagues):
    dataset = step(dataset)
print(dataset.shape)

(55314, 18)


# Evaluation - Regression

The idea here is to build helper functions for manually validating model performance.

In [4]:
from sklearn.metrics import mean_absolute_error
def evaluate_regressor(regressor, X_val_test, y_true, verbose = 1):
    """Measure the MAE of a fitted regressor on one split, raw and post-processed.

    Parameters
    ----------
    regressor : object exposing ``predict``.
    X_val_test : features passed to ``regressor.predict``.
    y_true : targets the predictions are compared against.
    verbose : truthy to print both errors (and return ``None``); falsy to
        return them without printing.

    Returns
    -------
    ``None`` when ``verbose`` is truthy, otherwise the tuple
    ``(mae_of_predictions, mae_of_post_processed_predictions)``.
    """
    predictions = regressor.predict(X_val_test)
    # Post-process before scoring anything, mirroring the original call order.
    processed = package.post_processing_1(predictions)

    raw_mae = mean_absolute_error(y_true, predictions)
    processed_mae = mean_absolute_error(y_true, processed)

    if not verbose:
        return raw_mae, processed_mae

    print(f"Mean Absolute Error - predictions: {raw_mae}")
    print(f"Mean Absolute Error - sorted prediction: {processed_mae}")

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
def regressor_average_evaluation(regressor, data, poly = 0, scaling = 1, verbose = 1, n_splits = 5):
    """Average the train and held-out MAE of ``regressor`` over repeated splits.

    In each of ``n_splits`` rounds the data is re-split with
    ``package.train_val_test_split_adapted_shuffled`` (presumably a fresh random
    split per call - TODO confirm), optionally standardised and expanded with
    degree-2 polynomial features, and ``regressor`` is refit on the training part.

    Parameters
    ----------
    regressor : estimator with ``fit`` and ``predict``; it is refit every round.
    data : dataset handed unchanged to the split helper.
    poly : truthy to add degree-2 polynomial features (applied after scaling).
    scaling : truthy to standardise features with a scaler fitted on the
        training part only.
    verbose : truthy to print the four averages and return ``None``; falsy to
        return them without printing.
    n_splits : number of split/fit/evaluate rounds (default 5).

    Returns
    -------
    ``None`` when ``verbose`` is truthy, otherwise the tuple
    ``(train, sorted_train, val_test, sorted_val_test)`` of mean MAEs. The
    ``sorted_*`` entries are the post-processed scores reported by
    ``evaluate_regressor``; the ``val_test`` entries pool validation and test.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError("n_splits must be at least 1")

    sorted_average_val_test, average_val_test, average_train, sorted_average_train = 0,0,0,0

    for _ in range(n_splits):
        X_train, y_train, X_val, y_val, X_test, y_test = package.train_val_test_split_adapted_shuffled(data)
        if scaling:
            # Fit the scaler on the training part only, then reuse it for val/test.
            sc = StandardScaler()
            X_train = sc.fit_transform(X_train)
            X_val = sc.transform(X_val)
            X_test = sc.transform(X_test)

        if poly:
            polynomial = PolynomialFeatures(degree = 2)
            X_train = polynomial.fit_transform(X_train)
            X_val = polynomial.transform(X_val)
            X_test = polynomial.transform(X_test)

        regressor.fit(X_train, y_train)

        temp_train, temp_train_sorted = evaluate_regressor(regressor, X_train, y_train, 0)
        temp_val, temp_val_sorted = evaluate_regressor(regressor, X_val, y_val, 0)
        temp_test, temp_test_sorted = evaluate_regressor(regressor, X_test, y_test, 0)

        average_train += temp_train
        sorted_average_train += temp_train_sorted
        average_val_test = average_val_test + temp_val + temp_test
        sorted_average_val_test = sorted_average_val_test + temp_val_sorted + temp_test_sorted

    # Each round contributes one train score but two held-out scores
    # (validation + test), hence the extra division by 2 for the latter.
    mean_train = average_train/n_splits
    mean_train_sorted = sorted_average_train/n_splits
    mean_val_test = average_val_test/n_splits/2
    mean_val_test_sorted = sorted_average_val_test/2/n_splits

    if verbose:
        print("Average MAE - train prediction: ", mean_train)
        print("Average MAE - sorted_train prediction: ", mean_train_sorted)
        print("Average MAE - val_test prediction: ", mean_val_test)
        print("Average MAE - sorted_val_test prediction: ", mean_val_test_sorted)
    else:
        # The returned means use the same divisors as the printed ones; the
        # earlier hard-coded 10 and 20 halved every value for 5 rounds.
        return (mean_train, mean_train_sorted, mean_val_test, mean_val_test_sorted)


# Regression - model selection

Finding which model works best for the task, using a custom shuffled train/validation/test split.

In [None]:
X_train, y_train, X_val, y_val, X_test, y_test = package.train_val_test_split_adapted_shuffled(dataset)

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_val_scaled = sc.transform(X_val)
X_test_scaled = sc.transform(X_test)

## Multiple linear

In [None]:
from sklearn.linear_model import LinearRegression
l_regressor = LinearRegression()
l_regressor.fit(X_train_scaled, y_train)

In [None]:
print("training set: ")
evaluate_regressor(l_regressor, X_train_scaled, y_train)
print("validation set: ")
evaluate_regressor(l_regressor, X_val_scaled, y_val)
print("test set: ")
evaluate_regressor(l_regressor, X_test_scaled, y_test)

training set: 
Mean Absolute Error - predictions: 2.254419323542679
Mean Absolute Error - sorted prediction: 2.1691512877191177
validation set: 
Mean Absolute Error - predictions: 2.285212251000148
Mean Absolute Error - sorted prediction: 2.1963019250253293
test set: 
Mean Absolute Error - predictions: 2.232712743813559
Mean Absolute Error - sorted prediction: 2.150140977443609


In [None]:
regressor_average_evaluation(LinearRegression(), dataset)

Average MAE - train prediction:  2.257398917199077
Average MAE - sorted_train prediction:  2.1729532661633737
Average MAE - val_test prediction:  2.2572779246148857
Average MAE - sorted_val_test prediction:  2.1715762099532667


## Polynomial

In [None]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree = 2)
X_train_polynomial = poly.fit_transform(X_train_scaled)
p_regressor = LinearRegression()
p_regressor.fit(X_train_polynomial, y_train)

In [None]:
print("training set: ")
evaluate_regressor(p_regressor, poly.transform(X_train_scaled), y_train)
print("validation set: ")
evaluate_regressor(p_regressor, poly.transform(X_val_scaled), y_val)
print("test set: ")
evaluate_regressor(p_regressor, poly.transform(X_test_scaled), y_test)

training set: 
Mean Absolute Error - predictions: 2.0151664615418516
Mean Absolute Error - sorted prediction: 1.977329974811083
validation set: 
Mean Absolute Error - predictions: 2.058279489042955
Mean Absolute Error - sorted prediction: 2.019756838905775
test set: 
Mean Absolute Error - predictions: 1.9979731927489413
Mean Absolute Error - sorted prediction: 1.9544172932330828


In [None]:
regressor_average_evaluation(LinearRegression(), dataset, poly = 1)

Average MAE - train prediction:  2.0159452474862265
Average MAE - sorted_train prediction:  1.9777892122162357
Average MAE - val_test prediction:  2.0303246022449573
Average MAE - sorted_val_test prediction:  1.9908139799721059


## SVR

In [None]:
from sklearn.svm import SVR
regressor_svr = SVR(kernel = 'rbf')
regressor_svr.fit(X_train_scaled, y_train)

In [None]:
print("training set: ")
evaluate_regressor(regressor_svr, X_train_scaled, y_train)
print("validation set: ")
evaluate_regressor(regressor_svr, X_val_scaled, y_val)
print("test set: ")
evaluate_regressor(regressor_svr, X_test_scaled, y_test)

training set: 
Mean Absolute Error - predictions: 1.873715656116285
Mean Absolute Error - sorted prediction: 1.855523569629363
validation set: 
Mean Absolute Error - predictions: 1.9746886022115688
Mean Absolute Error - sorted prediction: 1.9550405268490374
test set: 
Mean Absolute Error - predictions: 1.9032316446210489
Mean Absolute Error - sorted prediction: 1.8910949248120301


In [None]:
regressor_average_evaluation(SVR(kernel = 'rbf'), dataset)

Average MAE - train prediction:  1.877584328645657
Average MAE - sorted_train prediction:  1.8584393106813875
Average MAE - val_test prediction:  1.9332973221882994
Average MAE - sorted_val_test prediction:  1.91669004130456


## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
tree_regressor = DecisionTreeRegressor()
tree_regressor.fit(X_train_scaled, y_train)

In [None]:
print("training set: ")
evaluate_regressor(tree_regressor, X_train_scaled, y_train)
print("validation set: ")
evaluate_regressor(tree_regressor, X_val_scaled, y_val)
print("test set: ")
evaluate_regressor(tree_regressor, X_test_scaled, y_test)

training set: 
Mean Absolute Error - predictions: 0.0
Mean Absolute Error - sorted prediction: 0.0
validation set: 
Mean Absolute Error - predictions: 2.685030395136778
Mean Absolute Error - sorted prediction: 2.685030395136778
test set: 
Mean Absolute Error - predictions: 2.686795112781955
Mean Absolute Error - sorted prediction: 2.686795112781955


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
forest_regressor = RandomForestRegressor()
forest_regressor.fit(X_train_scaled, y_train)

In [None]:
print("training set: ")
evaluate_regressor(forest_regressor, X_train_scaled, y_train)
print("validation set: ")
evaluate_regressor(forest_regressor, X_val_scaled, y_val)
print("test set: ")
evaluate_regressor(forest_regressor, X_test_scaled, y_test)

training set: 
Mean Absolute Error - predictions: 0.7216041227574153
Mean Absolute Error - sorted prediction: 0.6792011514933429
validation set: 
Mean Absolute Error - predictions: 1.987359422492401
Mean Absolute Error - sorted prediction: 1.9769503546099292
test set: 
Mean Absolute Error - predictions: 1.9281379229323306
Mean Absolute Error - sorted prediction: 1.9185855263157894


In [None]:
regressor_average_evaluation(RandomForestRegressor(), dataset)

Average MAE - train prediction:  0.7247359513359279
Average MAE - sorted_train prediction:  0.6838166524611686
Average MAE - val_test prediction:  1.945489843355265
Average MAE - sorted_val_test prediction:  1.9318089785875892


## XG-BOOST

In [None]:
from xgboost import XGBRegressor
xg_regressor = XGBRegressor()
xg_regressor.fit(X_train_scaled, y_train)

In [None]:
print("training set: ")
evaluate_regressor(xg_regressor, X_train_scaled, y_train)
print("validation set: ")
evaluate_regressor(xg_regressor, X_val_scaled, y_val)
print("test set: ")
evaluate_regressor(xg_regressor, X_test_scaled, y_test)

training set: 
Mean Absolute Error - predictions: 1.5337094390250745
Mean Absolute Error - sorted prediction: 1.5139567161877345
validation set: 
Mean Absolute Error - predictions: 1.9980690440690483
Mean Absolute Error - sorted prediction: 1.9829027355623101
test set: 
Mean Absolute Error - predictions: 1.9334708683409805
Mean Absolute Error - sorted prediction: 1.916000939849624


In [None]:
regressor_average_evaluation(XGBRegressor(), dataset)

Average MAE - train prediction:  1.5316672278955892
Average MAE - sorted_train prediction:  1.5095649449541642
Average MAE - val_test prediction:  1.9708875848172311
Average MAE - sorted_val_test prediction:  1.9531737101420652


In [None]:
# Pair every importance score of the fitted XGBoost model with a column name.
# NOTE(review): assumes the model's feature order equals the dataset column
# order with 'league_rank' removed - confirm against the split helper.
column_names = dataset.columns.tolist()
column_names.remove('league_rank')
feature_importances = xg_regressor.feature_importances_

# Most important features first.
feature_importance_df = (
    pd.DataFrame({'Feature': column_names, 'Importance': feature_importances})
    .sort_values('Importance', ascending=False)
)

print(feature_importance_df)

                                             Feature  Importance
4                  averaged_avg_stars_top_11_players    0.481538
8                averaged_session_count_last_28_days    0.160129
9                     averaged_playtime_last_28_days    0.051086
10      averaged_league_match_won_count_last_28_days    0.043594
6                  averaged_days_active_last_28_days    0.034618
11              averaged_training_count_last_28_days    0.030511
13                averaged_tokens_spent_last_28_days    0.027585
0                                          league_id    0.025943
12                 averaged_global_competition_level    0.023918
3                    averaged_avg_age_top_11_players    0.018592
2                   averaged_dynamic_payment_segment    0.017873
5        averaged_avg_training_factor_top_11_players    0.016951
1                                   averaged_club_id    0.016572
16                    averaged_morale_boosters_stash    0.013321
14                       

So we see that the **SVR** model works best here.
Next is hyperparameter tuning for it.

# Hyperparameter tuning - SVR Regressor

In [6]:
# Reload the training table from disk and order it by league.
dataset = pd.read_csv('jobfair_train.csv').sort_values(by='league_id')
# Run the shared gist helpers in sequence; each receives the current table and
# its return value replaces it.
for step in (package.basic_preprocessing, package.Feature_Selection, package.Averaging_by_leagues):
    dataset = step(dataset)
print(dataset.shape)

(55314, 18)


In [7]:
X_train, y_train, X_val, y_val, X_test, y_test = package.train_val_test_split_adapted_shuffled(dataset)

In [8]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_val_scaled = sc.transform(X_val)
X_test_scaled = sc.transform(X_test)

## Tuning C, epsilon

In [21]:
from sklearn.svm import SVR

# Grid of SVR C and epsilon values to compare; gamma is held at 0.05.
C_values = [0.5, 1, 2]
epsilon_values = [0.005, 0.01, 0.05]

# MAE per (C, epsilon) pair: raw predictions / post-processed predictions.
params2, params_processed2 = {}, {}

# NOTE(review): every score averages the validation AND test MAE, so the test
# split takes part in ranking the hyperparameters; a validation-only score
# would keep the test set untouched for the final estimate.
for c, e in [(c_val, e_val) for c_val in C_values for e_val in epsilon_values]:
    model = SVR(kernel = 'rbf', epsilon = e, C = c, gamma = 0.05)
    model.fit(X_train_scaled, y_train)
    t_val, t_val_sorted = evaluate_regressor(model, X_val_scaled, y_val, 0)
    t_test, t_test_sorted = evaluate_regressor(model, X_test_scaled, y_test, 0)
    params2[(c, e)] = (t_val + t_test)/2
    params_processed2[(c, e)] = (t_val_sorted + t_test_sorted)/2

# Re-order both tables so the lowest (best) MAE comes first.
params2 = dict(sorted(params2.items(), key=lambda pair: pair[1]))
params_processed2 = dict(sorted(params_processed2.items(), key=lambda pair: pair[1]))

In [22]:
print(list(params2.keys())[:3])
print(list(params2.values())[:3])
print(list(params_processed2.keys())[:3])
print(list(params_processed2.values())[:3])

[(2, 0.05), (2, 0.01), (2, 0.005)]
[1.9066949010115202, 1.9068747502939614, 1.9068949778943196]
[(2, 0.05), (2, 0.01), (2, 0.005)]
[1.8885044856481632, 1.8887478880953625, 1.889285962866937]


## Tuning gamma

In [18]:
# Candidate values for the RBF kernel coefficient gamma; epsilon is fixed at
# 0.01 and C is not passed, so the SVR default is used.
gamma_values = [0.001, 0.01, 0.05, 0.1, 0.5, 1, 2, 4, 8]

# MAE per gamma: raw predictions / post-processed predictions.
params, params_processed = {}, {}

# NOTE(review): every score averages the validation AND test MAE, so the test
# split takes part in ranking the candidates.
for g in gamma_values:
    model = SVR(kernel = 'rbf', gamma = g, epsilon = 0.01)
    model.fit(X_train_scaled, y_train)
    t_val, t_val_sorted = evaluate_regressor(model, X_val_scaled, y_val, 0)
    t_test, t_test_sorted = evaluate_regressor(model, X_test_scaled, y_test, 0)
    params[g] = (t_val + t_test)/2
    params_processed[g] = (t_val_sorted + t_test_sorted)/2

# Re-order both tables so the lowest (best) MAE comes first.
params = dict(sorted(params.items(), key=lambda pair: pair[1]))
params_processed = dict(sorted(params_processed.items(), key=lambda pair: pair[1]))

In [20]:
print(list(params.keys())[:3])
print(list(params.values())[:3])
print(list(params_processed.keys())[:3])
print(list(params_processed.values())[:3])

[0.05, 0.1, 0.01]
[1.9127166225389458, 1.9267398010642358, 1.9447367817044636]
[0.05, 0.1, 0.01]
[1.8978793121591848, 1.9107389366855339, 1.9229617789798061]


In [23]:
regressor_average_evaluation(SVR(kernel = 'rbf', gamma = 0.05, epsilon = 0.05, C = 2), dataset)

Average MAE - train prediction:  1.8488576828024281
Average MAE - sorted_train prediction:  1.830485603353841
Average MAE - val_test prediction:  1.9378559862133664
Average MAE - sorted_val_test prediction:  1.9213776125839939


In [24]:
regressor_average_evaluation(SVR(kernel = 'rbf', gamma = 0.05, epsilon = 0.1, C = 0.5), dataset)

Average MAE - train prediction:  1.9162851725586356
Average MAE - sorted_train prediction:  1.899497357167784
Average MAE - val_test prediction:  1.9480075254326839
Average MAE - sorted_val_test prediction:  1.9323394014269084


In [26]:
regressor_average_evaluation(SVR(kernel = 'rbf', gamma = 0.05, epsilon = 0.15, C = 0.3), dataset)

Average MAE - train prediction:  1.9415367039639904
Average MAE - sorted_train prediction:  1.924877506704838
Average MAE - val_test prediction:  1.9528483088623705
Average MAE - sorted_val_test prediction:  1.936295166949586


This way the MAE is around 1.93.