In [3]:
from smart_open import open
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize

%load_ext autoreload
%autoreload 2

# Load data for all positions

In [159]:
data = pd.read_csv("../data/training_pruned_file_20162021.csv")
players = data

In [160]:
data["league_name"].unique()

array(['Premier League', 'Ligue 1', 'Bundesliga', 'Serie A', 'La Liga'],
      dtype=object)

## Select variables

In [161]:
players_selected_vars = players[[
    "name",
    "player_id_api",
    "date_of_birth",
    "nationality",
    "team_name",
    'league_name',
    'league.season',
    "games.appearences",
    "games.lineups",
    "games.minutes",
    # "goals.conceded",
    # "goals.saves",
    # "penalty.saved",
    # "highest_market_value_in_eur",
    "market_value_in_eur_x",
    # "own_goals.total",
    "position",
    "position_name_y",
    "height_x",
    "accurate_crosses.total",
    "accurate_passes.total",
    "aerials_won.total",
    "big_chances_created.total",
    "big_chances_missed.total",
    "blocked_shots.total",
    "clearances.total",
    "dribbled_past.total",
    "duels_won.total",
    "error_lead_to_goal.total",
    "fouls.total",
    "goals.goals",
    "interceptions.total",
    "offsides.total",
    "redcards.away",
    "redcards.home",
    "redcards.total",
    "saves_inside_box.total",
    "shots_off_target.total",
    "tackles.total_x",
    "through_balls.total",
    "total_crosses.total",
    "total_duels.total",
    "yellowcards.away",
    "yellowcards.home",
    "yellowcards.total",
    "yellowred_cards.away",
    "yellowred_cards.home",
    "yellowred_cards.total",
    "shots.total",
    "shots.on",
    "goals.total_y",
    "goals.conceded",
    "goals.assists",
    "goals.saves",
    "passes.total_y",
    "passes.key",
    "passes.accuracy",
    "tackles.total_y",
    "tackles.blocks",
    "tackles.interceptions",
    "duels.total",
    "duels.won",
    "dribbles.attempts",
    "dribbles.success",
    "dribbles.past",
    "fouls.committed",
    "cards.yellow",
    "cards.yellowred",
    "cards.red",
    "penalty.commited",
    "penalty.saved",
]].copy().rename({
    "team_name": "team",
    "league_name": "league",
    "league.season": "season",
    "games.appearences": "matches_played",
    "games.lineups": "matches_started",
    "games.minutes": "minutes",
    # "goals.conceded": "goals_against",
    # "goals.saves": "saves",
    # "saves_inside_box.total": "saves_inside_box",
    # "penalty.saved": "penalty_kicks_saves",
    # "own_goals.total": "own_goals_against",
    # "error_lead_to_goal.total": "errors_to_goal",
    "market_value_in_eur_x": "market_value",
}, axis=1)
players_selected_vars["90s_played"] = players_selected_vars["minutes"] / 90
players_selected_vars["age"] = pd.to_datetime(players_selected_vars["date_of_birth"]).map(lambda x: int((pd.Timestamp.today() - x).days / 365))
for per_90 in [
    # "shots_on_target_against",
    # "saves_inside_box",
    # "saves",
    # "penalty_kicks_saves",
    # "own_goals_against",
    # "goals_against",
    # "accurate_crosses.total",
    # "big_chances_created.total",
    # "big_chances_missed.total",
    # "blocked_shots.total",
    # "dribbled_past.total",
    # "goals.goals",
    # "offsides.total",
    # "shots_off_target.total",
    # "through_balls.total",
    # "total_crosses.total",
    # "shots.total",
    # "shots.on",
    # "goals.total_y",
    # "goals.assists",
    # "passes.key",
    # "dribbles.attempts",
    # "dribbles.success",
    # "dribbles.past",
]:
    players_selected_vars[per_90] = players_selected_vars[per_90] / players_selected_vars["90s_played"]


players_selected_vars.replace([np.inf, -np.inf], np.nan, inplace=True)
    
players_selected_vars = players_selected_vars.drop([
    "date_of_birth",
    "minutes",
], axis=1)

order = [
    "name",
    "player_id_api",
    "age",
    "nationality",
    "league",
    "team",
    "season",
    "matches_played",
    "matches_started",
    # "minutes",
    "90s_played",
    # "goals_against",
    # "own_goals_against",
    # "shots_on_target_against",
    # "saves",
    # "save%",
    # "saves_inside_box",
    # "penalty_kicks_saves",
    "market_value",
    "position",
    "position_name_y",
    "height_x",
    "accurate_crosses.total",
    "accurate_passes.total",
    "aerials_won.total",
    "big_chances_created.total",
    "big_chances_missed.total",
    "blocked_shots.total",
    "clearances.total",
    "dribbled_past.total",
    "duels_won.total",
    "error_lead_to_goal.total",
    "fouls.total",
    "goals.goals",
    "interceptions.total",
    "offsides.total",
    "redcards.away",
    "redcards.home",
    "redcards.total",
    "saves_inside_box.total",
    "shots_off_target.total",
    "tackles.total_x",
    "through_balls.total",
    "total_crosses.total",
    "total_duels.total",
    "yellowcards.away",
    "yellowcards.home",
    "yellowcards.total",
    "yellowred_cards.away",
    "yellowred_cards.home",
    "yellowred_cards.total",
    "shots.total",
    "shots.on",
    "goals.total_y",
    "goals.conceded",
    "goals.assists",
    "goals.saves",
    "passes.total_y",
    "passes.key",
    "passes.accuracy",
    "tackles.total_y",
    "tackles.blocks",
    "tackles.interceptions",
    "duels.total",
    "duels.won",
    "dribbles.attempts",
    "dribbles.success",
    "dribbles.past",
    "fouls.committed",
    "cards.yellow",
    "cards.yellowred",
    "cards.red",
    "penalty.commited",
    "penalty.saved",
]
players_selected_vars = players_selected_vars[order + list(set(players_selected_vars.columns).difference(order))]
players_selected_vars

Unnamed: 0,name,player_id_api,age,nationality,league,team,season,matches_played,matches_started,90s_played,...,duels.won,dribbles.attempts,dribbles.success,dribbles.past,fouls.committed,cards.yellow,cards.yellowred,cards.red,penalty.commited,penalty.saved
0,George Thomas,20479,26,Wales,Premier League,Leicester City,2017,0.0,0.0,0.000000,...,,,,,,0.0,0.0,0.0,,
1,Harvey Lewis Barnes,18778,25,England,Premier League,Leicester City,2017,3.0,0.0,0.077778,...,1.0,0.0,0.0,,1.0,0.0,0.0,0.0,,
2,Andy King,19217,34,Wales,Premier League,Leicester City,2017,11.0,9.0,7.977778,...,24.0,4.0,1.0,,5.0,1.0,0.0,0.0,,
3,Marc Albrighton,18777,33,England,Premier League,Leicester City,2017,34.0,30.0,28.266667,...,140.0,95.0,52.0,,20.0,5.0,0.0,1.0,1.0,
4,Kelechi Promise Iheanacho,2778,26,Nigeria,Premier League,Leicester City,2017,21.0,7.0,9.144444,...,37.0,18.0,13.0,,17.0,2.0,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14768,Matěj Vydra,18930,31,Czech Republic,Premier League,Burnley,2021,22.0,5.0,7.400000,...,44.0,14.0,6.0,,9.0,1.0,0.0,0.0,,
14769,Josh Brownhill,19268,27,England,Premier League,Burnley,2021,35.0,32.0,32.811111,...,155.0,30.0,15.0,,39.0,10.0,0.0,0.0,,
14770,Ashley Barnes,18927,33,England,Premier League,Burnley,2021,23.0,8.0,8.888889,...,77.0,8.0,5.0,,21.0,4.0,0.0,0.0,,
14771,Wout Weghorst,25416,30,Netherlands,Premier League,Burnley,2021,18.0,17.0,17.244444,...,96.0,8.0,7.0,,17.0,2.0,0.0,0.0,,


In [162]:

from helpers import TEAM_TIERS

In [163]:
players_selected_vars = pd.merge(players_selected_vars, TEAM_TIERS, left_on="team", right_index=True)

In [164]:
players_selected_vars.loc[players_selected_vars["tier"] == "-1", "tier"] = None

In [165]:
for c in players_selected_vars.columns:
    players_selected_vars[f"has_{c}"] = (~players_selected_vars[c].isnull()).astype(int)

In [166]:
ddf = players_selected_vars[players_selected_vars.player_id_api == 8][['name', 'player_id_api', 'team', 'league', 'season', 'market_value']]
ddf = players_selected_vars

In [167]:
ddf[ddf.player_id_api == 8]

Unnamed: 0,name,player_id_api,age,nationality,league,team,season,matches_played,matches_started,90s_played,...,has_dribbles.attempts,has_dribbles.success,has_dribbles.past,has_fouls.committed,has_cards.yellow,has_cards.yellowred,has_cards.red,has_penalty.commited,has_penalty.saved,has_tier
290,Raphaël Adelino José Guerreiro,8,29,Portugal,Bundesliga,Borussia Dortmund,2017,9.0,6.0,6.077778,...,1,1,0,1,1,1,1,0,0,1
2907,Raphaël Adelino José Guerreiro,8,29,Portugal,Bundesliga,Borussia Dortmund,2018,23.0,17.0,17.011111,...,1,1,0,1,1,1,1,0,0,1
8038,Raphaël Adelino José Guerreiro,8,29,Portugal,Bundesliga,Borussia Dortmund,2019,29.0,26.0,26.0,...,1,1,0,1,1,1,1,0,0,1
10542,Raphaël Adelino José Guerreiro,8,29,Portugal,Bundesliga,Borussia Dortmund,2020,27.0,25.0,24.688889,...,1,1,0,1,1,1,1,0,0,1
13936,Raphaël Adelino José Guerreiro,8,29,Portugal,Bundesliga,Borussia Dortmund,2021,23.0,21.0,20.033333,...,1,1,0,1,1,1,1,0,0,1


In [168]:
mvs = players_selected_vars["market_value"].values
mvs = np.expand_dims(mvs, axis=1)
mvs = normalize(mvs, norm="max", axis=0)
mvs = np.squeeze(mvs)
players_selected_vars["normalised_market_value"] = mvs

In [169]:
players_selected_vars[players_selected_vars.player_id_api == 8]["normalised_market_value"]

290      0.105
2907     0.125
8038     0.175
10542    0.200
13936    0.125
Name: normalised_market_value, dtype: float64

In [193]:

numerical_features = [
    "age",
    "accurate_crosses.total",
    "accurate_passes.total",
    "aerials_won.total",
    "big_chances_created.total",
    "big_chances_missed.total",
    "blocked_shots.total",
    "clearances.total",
    "dribbled_past.total",
    "duels_won.total",
    "error_lead_to_goal.total",
    "fouls.total",
    "goals.goals",
    "interceptions.total",
    "offsides.total",
    "redcards.away",
    "redcards.home",
    "redcards.total",
    "saves_inside_box.total",
    "shots_off_target.total",
    "tackles.total_x",
    "through_balls.total",
    "total_crosses.total",
    "total_duels.total",
    "yellowcards.away",
    "yellowcards.home",
    "yellowcards.total",
    "yellowred_cards.away",
    "yellowred_cards.home",
    "yellowred_cards.total",
    "shots.total",
    "shots.on",
    "goals.total_y",
    "goals.conceded",
    "goals.assists",
    "goals.saves",
    "passes.total_y",
    "passes.key",
    "passes.accuracy",
    "tackles.total_y",
    "tackles.blocks",
    "tackles.interceptions",
    "duels.total",
    "duels.won",
    "dribbles.attempts",
    "dribbles.success",
    "dribbles.past",
    "fouls.committed",
    "cards.yellow",
    "cards.yellowred",
    "cards.red",
    "penalty.commited",
    "penalty.saved",
    "market_value",
    "normalised_market_value",
]

categorical_features = [
    "team",
    "tier",
    "position",
    "position_name_y",
    "league"
]

In [208]:
def delta(x):
    if len(x) == 1:
        return 0.0
    return x[-1] / np.mean(x[:-1]) - 1

def diff(x):
    if len(x) == 1:
        return 0.0
    return float(x[-1] != x[-2])


def current(x):
    return x[-1]

current([3,2,1])

1

In [229]:
def delta(x):
    x = x.tolist()
    return (
        x[-1] / np.mean(x[:-1]) - 1 
        if len(x) > 1 else 0.0
    )
    
def diff(x):
    x = x.tolist()
    return (
        x[-1] != x[-2] 
        if len(x) > 1 else 0.0
    )

def current(x):
    return x.tolist()[-1]

def previous(x):
    x = x.tolist()
    return (
        x[-2] if len(x) > 1 else 0.0
    )

def std(x):
    return np.std(x, ddof=0)

delta.__name__ = 'delta'
current.__name__ = 'current'
diff.__name__ = 'changed'
previous.__name__ = 'previous'

In [None]:
changed_cat_feature_dict = {}
for cat_name in categorical_features:
    ddf[f'{cat_name}'] = ddf[cat_name].astype('category').cat.codes
    changed_cat_feature_dict[f'{cat_name}'] = [curr_value_func, diff_func]

changed_num_feature_dict = {
    num_name: [delta_func, curr_value_func, np.mean, np.std]
    for num_name in numerical_features
}   

changed_num_feature_dict["market_value"] = [delta_func, curr_value_func, previous_value_func, np.mean, std]
changed_num_feature_dict["normalised_market_value"] = [delta_func, curr_value_func, previous_value_func, np.mean, np.std]

gdf = ddf.sort_values("season").groupby("player_id_api").rolling(2, min_periods=1).agg({
    'height_x': curr_value_func,
    **changed_cat_feature_dict,
    **changed_num_feature_dict,
}).reset_index()

gdf.columns = [
    '_'.join([col[0], col[1]] if col[1] != '' else [col[0]]) 
    for col in gdf.columns
]

In [None]:
gdf[gdf.player_id_api == 275]

In [None]:
ddf[ddf.player_id_api == 275]

In [191]:
gdf.isna().sum()

player_id_api                          0
level_1                                0
height_x_current                    5763
team_current                        5717
team_changed                        5717
                                    ... 
normalised_market_value_delta       5717
normalised_market_value_current     5717
normalised_market_value_previous    5717
normalised_market_value_mean        5717
normalised_market_value_std         5717
Length: 235, dtype: int64

In [179]:
players_selected_vars = gdf

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

## Transformation pipeline

In [None]:
# numeric_features = [
#     "age",
#     "matches_played",
#     "matches_started",
#     "90s_played",
#     # "goals_against",
#     # "own_goals_against",
#     # "shots_on_target_against",
#     # "saves",
#     # "save%",
#     # "saves_inside_box",
#     # "penalty_kicks_saves",
#     "height_x",
#     "accurate_crosses.total",
#     "accurate_passes.total",
#     "aerials_won.total",
#     "big_chances_created.total",
#     "big_chances_missed.total",
#     "blocked_shots.total",
#     "clearances.total",
#     "dribbled_past.total",
#     "duels_won.total",
#     "error_lead_to_goal.total",
#     "fouls.total",
#     "goals.goals",
#     "interceptions.total",
#     "offsides.total",
#     "redcards.away",
#     "redcards.home",
#     "redcards.total",
#     "saves_inside_box.total",
#     "shots_off_target.total",
#     "tackles.total_x",
#     "through_balls.total",
#     "total_crosses.total",
#     "total_duels.total",
#     "yellowcards.away",
#     "yellowcards.home",
#     "yellowcards.total",
#     "yellowred_cards.away",
#     "yellowred_cards.home",
#     "yellowred_cards.total",
#     "shots.total",
#     "shots.on",
#     "goals.total_y",
#     "goals.conceded",
#     "goals.assists",
#     "goals.saves",
#     "passes.total_y",
#     "passes.key",
#     "passes.accuracy",
#     "tackles.total_y",
#     "tackles.blocks",
#     "tackles.interceptions",
#     "duels.total",
#     "duels.won",
#     "dribbles.attempts",
#     "dribbles.success",
#     "dribbles.past",
#     "fouls.committed",
#     "cards.yellow",
#     "cards.yellowred",
#     "cards.red",
#     "penalty.commited",
#     "penalty.saved",
# ]
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]
)

# TODO CONTINUE HERE

binary_features = [f"has_{c}" for c in numerical_features if f"has_{c}" in players_selected_vars.columns]
binary_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value=0))
    ]
)

categorical_features = [
    "nationality",
    "league",
    "team",
    "tier",
    "position",
    "position_name_y",
]
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        ("bin", binary_transformer, binary_features),
    ]
)

# Predict price bucket

In [None]:
plt.hist(players_selected_vars["market_value"], bins=100)
# plt.xlim((0, 0.5e8))
plt.yscale("log")

In [None]:
bins = [
    0.0025,
    0.005,
    0.01,
    0.025,
    0.05, # 
    0.1, # 8 - 20
    0.25, # 20 - 40
    0.5,  # 40 -80
    1.0  # 80M
]
y = np.digitize(players_selected_vars["normalised_market_value"], bins)
# bins = np.concatenate([
#     np.arange(200_000, 4_000_000, 200_000),
#     np.arange(4_000_000, 20_000_000, 2_000_000),
#     np.arange(20_000_000, 100_000_000, 5_000_000),
#     np.arange(100_000_000, 200_000_001, 10_000_000)
# ])
# y = np.digitize(players_selected_vars["market_value"], bins)
X = players_selected_vars

In [None]:
goalkeepers = (X["position"] == "Goalkeeper").values
attackers = (X["position"] == "Attack").values
defenders = (X["position"] == "Defender").values
midfielders = (X["position"] == "Midfield").values

(
    X_train, X_test, 
    y_train, y_test, 
    goalkeepers_train, goalkeepers_test,
    attackers_train, attackers_test,
    defenders_train, defenders_test, 
    midfielders_train, midfielders_test
) = train_test_split(
    X, y, 
    goalkeepers, attackers, defenders, midfielders,
    test_size=0.2, random_state=42
)

In [None]:
np.unique(y_train, return_counts=True)

In [None]:
random = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", DummyClassifier(strategy="stratified")),
    ]
)
random.fit(X_train, y_train)

y_random = random.predict(X_test)

balanced_accuracy_score(y_test, y_random)

In [None]:
for name, selector in {
    "goalkeepers": goalkeepers_test, 
    "attackers": attackers_test, 
    "defenders": defenders_test,
    "midfielders": midfielders_test
}.items():
    print(name, balanced_accuracy_score(y_test[selector], random.predict(X_test[selector])))

In [None]:
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor), 
        ("classifier", LogisticRegression()),
    ]
)
clf.fit(X_train, y_train)

y_lr = clf.predict(X_test)

balanced_accuracy_score(y_test, y_lr)

In [None]:
for name, selector in {
    "goalkeepers": goalkeepers_test, 
    "attackers": attackers_test, 
    "defenders": defenders_test,
    "midfielders": midfielders_test
}.items():
    print(name, balanced_accuracy_score(y_test[selector], clf.predict(X_test[selector])))

Only model specific to goalkeepers performs better on that position than the "all positions" model.
Performance on other positions is similar or marginally better with the generic model.

In [None]:
from sklearn.neural_network import MLPClassifier

mlp = Pipeline(
    steps=[
        ("preprocessor", preprocessor), 
        ("classifier", MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(16, 8))),
    ]
)
mlp.fit(X_train, y_train)

y_mlp = mlp.predict(X_test)

balanced_accuracy_score(y_test, y_mlp)

In [None]:
for name, selector in {
    "goalkeepers": goalkeepers_test, 
    "attackers": attackers_test, 
    "defenders": defenders_test,
    "midfielders": midfielders_test
}.items():
    print(name, balanced_accuracy_score(y_test[selector], mlp.predict(X_test[selector])))

Models specific for goalkeeprs and midfielders are slighlty better than one generic model.

In [None]:
from xgboost import XGBClassifier

xgboost = Pipeline(
    steps=[
        ("preprocessor", preprocessor), 
        ("classifier", XGBClassifier(learning_rate=1)),
    ]
)
xgboost.fit(X_train, y_train)

y_xgboost = xgboost.predict(X_test)

balanced_accuracy_score(y_test, y_xgboost)

In [None]:
for name, selector in {
    "goalkeepers": goalkeepers_test, 
    "attackers": attackers_test, 
    "defenders": defenders_test,
    "midfielders": midfielders_test
}.items():
    print(name, balanced_accuracy_score(y_test[selector], xgboost.predict(X_test[selector])))

In [None]:
from collections import defaultdict
arr = np.zeros((10, 10))
for a, b in zip(y_test, y_xgboost):
    arr[a, b] = arr[a, b] + 1

In [None]:
arr

In [None]:
plt.imshow(arr, cmap='hot')
plt.xlabel("Observed")
plt.ylabel("Predicted")
_ = plt.show()

In [None]:
x = np.unique(y_test - y_xgboost, return_counts=True)
a, b =x

In [None]:
cnt = 0
for q,w in zip(a, b):
    print(q, w)
    cnt += w
cnt

In [None]:
(583 + 1397 + 569) / 2951

In [None]:
np.unique(y_lr, return_counts=True)

## Features importance

In [None]:
result = permutation_importance(
    xgboost, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
)

forest_importances = pd.Series(result.importances_mean, index=X_test.columns)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()

# Predict market value

In [None]:
from sklearn.metrics import r2_score

In [None]:
def r2_adj(model, x, y):
    r2 = model.score(x,y)
    n = x.shape[0]
    p = x.shape[1]
    return 1-(1-r2)*(n-1)/(n-p-1)

def r2_adj2(x, y_test, y_pred):
    r2 = r2_score(y_test, y_pred)
    n = x.shape[0]
    p = x.shape[1]
    return 1-(1-r2)*(n-1)/(n-p-1)

In [None]:
X = players_selected_vars
# y = players_selected_vars["normalised_market_value"]
y = np.log(players_selected_vars["market_value"])
# y = players_selected_vars["market_value"]

In [None]:
X_train = X.loc[X["season"] < 2021]
X_test = X.loc[X["season"] == 2021]
y_train = y.loc[X["season"] < 2021]
y_test = y.loc[X["season"] == 2021]
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
selectors_train = dict()
selectors_test = dict()
for p in ["Goalkeeper", "Attack", "Defender", "Midfield"]:
    selectors_train[p] = (X_train["position"] == p).values
    selectors_test[p] = (X_test["position"] == p).values

In [None]:
# goalkeepers = (X["position"] == "Goalkeeper").values
# attackers = (X["position"] == "Attack").values
# defenders = (X["position"] == "Defender").values
# midfielders = (X["position"] == "Midfield").values

# (
#     X_train, X_test, 
#     y_train, y_test, 
#     goalkeepers_train, goalkeepers_test,
#     attackers_train, attackers_test,
#     defenders_train, defenders_test, 
#     midfielders_train, midfielders_test
# ) = train_test_split(
#     X, y, 
#     goalkeepers, attackers, defenders, midfielders,
#     test_size=0.2, random_state=42
# )

In [None]:
random_regression = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regression", DummyRegressor()),
    ]
)
random_regression.fit(X_train, y_train)

random_regression.score(X_test, y_test), r2_adj(random_regression, X_test, y_test)

In [None]:
for name, selector in selectors_test.items():
    print(name, random_regression.score(X_test[selector], y_test[selector]), r2_adj(random_regression, X_test[selector], y_test[selector]))

In [None]:
y_2020 = X_train.loc[X_train["season"] == 2020].groupby("player_id_api")["market_value"].mean()
y_prev = pd.merge(
    y_2020,
    X_test,
    left_index=True,
    right_on="player_id_api",
    how="right"
)["market_value_x"].fillna(y_2020.mean())

In [None]:
r2_score(np.exp(y_test), y_prev), r2_adj2(X_test, np.exp(y_test), y_prev)

In [None]:
for name, selector in selectors_test.items():
    print(name, r2_score(np.exp(y_test)[selector], y_prev[selector]), r2_adj2(X_test[selector], np.exp(y_test)[selector], y_prev[selector]))

In [None]:
regression = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regression", LinearRegression()),
    ]
)
regression.fit(X_train, y_train)

y_reg = regression.predict(X_test)

# regression.score(X_test, y_test), r2_adj(regression, X_test, y_test)
r2_score(np.exp(y_test), np.exp(y_reg)), r2_adj2(X_test, np.exp(y_test), np.exp(y_reg))

In [None]:
for name, selector in selectors_test.items():
    y_reg = regression.predict(X_test[selector])
    print(name, r2_score(np.exp(y_test[selector]), np.exp(y_reg)), r2_adj2(X_test[selector], np.exp(y_test[selector]), np.exp(y_reg)))
    # print(name, regression.score(X_test[selector], y_test[selector]), r2_adj(regression, X_test[selector], y_test[selector]))

Only model specific for midfielder is slightly better than the generic one.

In [None]:
from sklearn.neural_network import MLPRegressor

mlp_reg = Pipeline(
    steps=[
        ("preprocessor", preprocessor), 
        ("regression", MLPRegressor(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(16, 8))),
    ]
)
mlp_reg.fit(X_train, y_train)

y_mlp = mlp_reg.predict(X_test)

# mlp_reg.score(X_test, y_test), r2_adj(mlp_reg, X_test, y_test)
r2_score(np.exp(y_test), np.exp(y_mlp)), r2_adj2(X_test, np.exp(y_test), np.exp(y_mlp))

In [None]:
for name, selector in selectors_test.items():
    y_mlp = mlp_reg.predict(X_test[selector])
    print(name, r2_score(np.exp(y_test[selector]), np.exp(y_mlp)), r2_adj2(X_test[selector], np.exp(y_test[selector]), np.exp(y_mlp)))
    # print(name, mlp_reg.score(X_test[selector], y_test[selector]), r2_adj(mlp_reg, X_test[selector], y_test[selector]))

"All positions" model performs better than position specific models.

In [None]:
from xgboost import XGBRegressor

xgboost_reg = Pipeline(
    steps=[
        ("preprocessor", preprocessor), 
        ("regression", XGBRegressor()),
    ]
)
xgboost_reg.fit(X_train, y_train)

y_xgboost = xgboost_reg.predict(X_test)


# xgboost_reg.score(X_test, y_test), r2_adj(xgboost_reg, X_test, y_test)
r2_score(np.exp(y_test), np.exp(y_xgboost)), r2_adj2(X_test, np.exp(y_test), np.exp(y_xgboost))

In [None]:
for name, selector in selectors_test.items():
    print(name, xgboost_reg.score(X_test[selector], y_test[selector]), r2_adj(xgboost_reg, X_test[selector], y_test[selector]))

In [None]:
plt.scatter(y_train, regression.predict(X_train), alpha=0.5, label="lr")
plt.scatter(y_train, random_regression.predict(X_train), alpha=0.5, label="random")
plt.scatter(y_train, mlp_reg.predict(X_train), alpha=0.5, label="MLP")
plt.scatter(y_train, xgboost_reg.predict(X_train), alpha=0.5, label="XGBoost")
plt.axline((0, 0), (1, 1), c="k")
plt.xlim(np.floor(y_train.min())-0.1, np.ceil(y_test.max())+0.1)
plt.ylim(np.floor(y_train.min())-0.1, np.ceil(y_test.max())+0.1)
plt.xlabel("Observed")
plt.ylabel("Predicted")
plt.legend()
_ = plt.show()

In [None]:
plt.scatter(y_test, regression.predict(X_test), alpha=0.25, label="lr")
# plt.scatter(y_test, random_regression.predict(X_test), alpha=0.5, label="random")
plt.scatter(y_test, mlp_reg.predict(X_test), alpha=0.25, label="MLP")
plt.scatter(y_test, xgboost_reg.predict(X_test), alpha=0.25, label="XGBoost")
plt.scatter(y_test, np.log(y_prev), alpha=0.25, label="prev_season")
plt.axline((0, 0), (1, 1), c="k")
plt.xlim(np.floor(y_test.min())-0.1, np.ceil(y_test.max())+0.1)
plt.ylim(np.floor(y_test.min())-0.1, np.ceil(y_test.max())+0.1)
plt.xlabel("Observed")
plt.ylabel("Predicted")
plt.legend()
_ = plt.show()

In [None]:
reg_results = pd.DataFrame({
    "y": np.exp(y_test),
    "y_pred": np.exp(xgboost_reg.predict(X_test))
    # "y": y_test,
    # "y_pred": xgboost_reg.predict(X_test)
})
reg_results["diff"] = reg_results["y_pred"] - reg_results["y"]
reg_results["rel_error"] = reg_results["diff"] / reg_results["y"] * 100
reg_results["abs_rel_error"] = np.abs(reg_results["rel_error"])
reg_results

In [None]:
reg_results["y"].rank()

In [None]:
np.mean(reg_results["abs_rel_error"])

In [None]:
_data = reg_results.loc[reg_results["y"] >= 1_000_000]
plt.scatter(_data["y"].rank(), _data["rel_error"])
# plt.xscale("log")

In [None]:
_ = plt.hist(_data["abs_rel_error"], bins=100)

In [None]:
_data.loc[_data["abs_rel_error"] < 20].sort_values("abs_rel_error")

In [None]:
_data.sort_values("abs_rel_error")

## Features importance

In [None]:
result = permutation_importance(
    xgboost_reg, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
)

In [None]:
non_zero_feats = np.abs(result.importances_mean) > 1e-03
forest_importances = pd.Series(result.importances_mean[non_zero_feats], index=X_test.columns.values[non_zero_feats])

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std[non_zero_feats], ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()

# Predict change in market value between seasons

In [None]:
before_after_seasons = pd.merge(
    players_selected_vars, 
    players_selected_vars, 
    on="player_id_api", 
    suffixes=["_before", "_after"]
).query("season_after - season_before == 1")

In [None]:
numeric_features_ba = [
    f"{c}_{suffix}" for c in [
        "age",
        "matches_played",
        "matches_started",
        "90s_played",
        # "goals_against",
        # "own_goals_against",
        # "shots_on_target_against",
        # "saves",
        # "save%",
        # "saves_inside_box",
        # "penalty_kicks_saves",
    "height_x",
    "accurate_crosses.total",
    "accurate_passes.total",
    "aerials_won.total",
    "big_chances_created.total",
    "big_chances_missed.total",
    "blocked_shots.total",
    "clearances.total",
    "dribbled_past.total",
    "duels_won.total",
    "error_lead_to_goal.total",
    "fouls.total",
    "goals.goals",
    "interceptions.total",
    "offsides.total",
    "redcards.away",
    "redcards.home",
    "redcards.total",
    "saves_inside_box.total",
    "shots_off_target.total",
    "tackles.total_x",
    "through_balls.total",
    "total_crosses.total",
    "total_duels.total",
    "yellowcards.away",
    "yellowcards.home",
    "yellowcards.total",
    "yellowred_cards.away",
    "yellowred_cards.home",
    "yellowred_cards.total",
    "shots.total",
    "shots.on",
    "goals.total_y",
    "goals.conceded",
    "goals.assists",
    "goals.saves",
    "passes.total_y",
    "passes.key",
    "passes.accuracy",
    "tackles.total_y",
    "tackles.blocks",
    "tackles.interceptions",
    "duels.total",
    "duels.won",
    "dribbles.attempts",
    "dribbles.success",
    "dribbles.past",
    "fouls.committed",
    "cards.yellow",
    "cards.yellowred",
    "cards.red",
    "penalty.commited",
    "penalty.saved",
    ]
    for suffix in ["before","after"]
]
numeric_transformer_ba = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]
)

categorical_features_ba = [
    f"{c}_{suffix}" for c in [
        "nationality",
        "league",
        "team",
        "tier",
        "position",
    ]
    for suffix in ["before","after"]
]
categorical_transformer_ba = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]
)

preprocessor_ba = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer_ba, numeric_features_ba),
        ("cat", categorical_transformer_ba, categorical_features_ba),
    ]
)

In [None]:
X = before_after_seasons
y_before = np.digitize(before_after_seasons["normalised_market_value_before"], bins)
y_after = np.digitize(before_after_seasons["normalised_market_value_after"], bins)
y = np.minimum(np.maximum(y_after - y_before, -1), 1)

In [None]:
goalkeepers = (X["position_before"] == "Goalkeeper").values
attackers = (X["position_before"] == "Attack").values
defenders = (X["position_before"] == "Defender").values
midfielders = (X["position_before"] == "Midfield").values

(
    X_train, X_test, 
    y_train, y_test, 
    goalkeepers_train, goalkeepers_test,
    attackers_train, attackers_test,
    defenders_train, defenders_test, 
    midfielders_train, midfielders_test
) = train_test_split(
    X, y, 
    goalkeepers, attackers, defenders, midfielders,
    test_size=0.2, random_state=42
)

In [None]:
random = Pipeline(
    steps=[
        ("preprocessor", preprocessor_ba),
        ("classifier", DummyClassifier(strategy="stratified")),
    ]
)
random.fit(X_train, y_train)

balanced_accuracy_score(y_test, random.predict(X_test))

In [None]:
for name, selector in {
    "goalkeepers": goalkeepers_test, 
    "attackers": attackers_test, 
    "defenders": defenders_test,
    "midfielders": midfielders_test
}.items():
    print(name, balanced_accuracy_score(y_test[selector], random.predict(X_test[selector])))

In [None]:
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor_ba), 
        ("classifier", LogisticRegression()),
    ]
)
clf.fit(X_train, y_train)

print(confusion_matrix(y_test, clf.predict(X_test), labels=[1, 0, -1]))

balanced_accuracy_score(y_test, clf.predict(X_test))

In [None]:
for name, selector in {
    "goalkeepers": goalkeepers_test, 
    "attackers": attackers_test, 
    "defenders": defenders_test,
    "midfielders": midfielders_test
}.items():
    print(name, balanced_accuracy_score(y_test[selector], clf.predict(X_test[selector])))

"All positions" model performs better on all positions.

In [None]:
from sklearn.neural_network import MLPClassifier

mlp = Pipeline(
    steps=[
        ("preprocessor", preprocessor_ba), 
        ("classifier", MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(32, 16))),
    ]
)
mlp.fit(X_train, y_train)

print(confusion_matrix(y_test, mlp.predict(X_test), labels=[1, 0, -1]))

balanced_accuracy_score(y_test, mlp.predict(X_test))

In [None]:
for name, selector in {
    "goalkeepers": goalkeepers_test, 
    "attackers": attackers_test, 
    "defenders": defenders_test,
    "midfielders": midfielders_test
}.items():
    print(name, balanced_accuracy_score(y_test[selector], mlp.predict(X_test[selector])))

Only model specific for midfielders perform marginally better.

In [None]:
from xgboost import XGBClassifier

xgboost = Pipeline(
    steps=[
        ("preprocessor", preprocessor_ba), 
        ("classifier", XGBClassifier()),
    ]
)
xgboost.fit(X_train, y_train+1)

y_xgboost = xgboost.predict(X_test)

balanced_accuracy_score(y_test+1, y_xgboost)

In [None]:
for name, selector in {
    "goalkeepers": goalkeepers_test, 
    "attackers": attackers_test, 
    "defenders": defenders_test,
    "midfielders": midfielders_test
}.items():
    print(name, balanced_accuracy_score(y_test[selector]+1, xgboost.predict(X_test[selector])))

XGBoost performs here better than MLP, and comparable as position-specific models.

## Features importance

In [None]:
result = permutation_importance(
    xgboost, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
)

forest_importances = pd.Series(result.importances_mean, index=X_test.columns)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()