In [None]:
from smart_open import open
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize

# Load data for all positions

In [None]:
# Load the raw training table (all positions) from a path relative to the notebook.
data = pd.read_csv("../data/training_pruned_file_20162021.csv")

In [None]:
# NOTE: this is an alias, not a copy — `players` and `data` refer to the same DataFrame object.
players = data

In [None]:
# Alphabetical list of every available column, to help pick the variables selected further down.
sorted(players.columns.values)

In [None]:
# Distinct league names present in the table.
data["league_name"].unique()

## Select variables

In [None]:
# Columns carried over from the raw table, in the order they are selected:
# identifiers, team/league/season, playing time, market value, position info,
# height and the per-season performance statistics.
selected_columns = [
    "name", "player_id_api", "date_of_birth", "nationality",
    "team_name", "league_name", "league.season",
    "games.appearences", "games.lineups", "games.minutes",
    "market_value_in_eur_x",
    "position", "position_name_y", "height_x",
    "accurate_crosses.total", "accurate_passes.total", "aerials_won.total",
    "big_chances_created.total", "big_chances_missed.total", "blocked_shots.total",
    "clearances.total", "dribbled_past.total", "duels_won.total",
    "error_lead_to_goal.total", "fouls.total", "goals.goals",
    "interceptions.total", "offsides.total",
    "redcards.away", "redcards.home", "redcards.total",
    "saves_inside_box.total", "shots_off_target.total", "tackles.total_x",
    "through_balls.total", "total_crosses.total", "total_duels.total",
    "yellowcards.away", "yellowcards.home", "yellowcards.total",
    "yellowred_cards.away", "yellowred_cards.home", "yellowred_cards.total",
    "shots.total", "shots.on",
    "goals.total_y", "goals.conceded", "goals.assists", "goals.saves",
    "passes.total_y", "passes.key", "passes.accuracy",
    "tackles.total_y", "tackles.blocks", "tackles.interceptions",
    "duels.total", "duels.won",
    "dribbles.attempts", "dribbles.success", "dribbles.past",
    "fouls.committed",
    "cards.yellow", "cards.yellowred", "cards.red",
    "penalty.commited", "penalty.saved",
]

# Shorter names for the bookkeeping columns; the statistics keep their raw names.
column_renames = {
    "team_name": "team",
    "league_name": "league",
    "league.season": "season",
    "games.appearences": "matches_played",
    "games.lineups": "matches_started",
    "games.minutes": "minutes",
    "market_value_in_eur_x": "market_value",
}

# Work on an independent copy so the raw table is not modified by later cells.
players_selected_vars = players.loc[:, selected_columns].copy().rename(columns=column_renames)
# Playing time expressed in units of full 90-minute matches.
players_selected_vars["90s_played"] = players_selected_vars["minutes"] / 90

# Birth year is the first four characters of `date_of_birth`.
# NOTE(review): "age" is stored as 30 minus (season year - birth year), i.e. not the
# plain age — presumably a distance from an assumed peak age of 30; confirm intent.
birth_year = players["date_of_birth"].str.slice(0, 4).astype(int)
players_selected_vars["age"] = 30 - (players["league.season"] - birth_year)
players_selected_vars["age2"] = players_selected_vars["age"] ** 2

# Statistics to convert into per-90-minute rates. The list is currently empty,
# so the loop below is a no-op and every statistic stays a season total.
per_90_columns = []
for per_90 in per_90_columns:
    players_selected_vars[per_90] = players_selected_vars[per_90] / players_selected_vars["90s_played"]

# Divisions by zero playing time would produce +/-inf; treat those as missing.
players_selected_vars = players_selected_vars.replace([np.inf, -np.inf], np.nan)

# These two columns were only needed to derive "age" and "90s_played".
players_selected_vars = players_selected_vars.drop(columns=["date_of_birth", "minutes"])

# Preferred column order: identifiers and bookkeeping first, then the statistics.
order = [
    "name", "player_id_api", "age", "age2", "nationality",
    "league", "team", "season",
    "matches_played", "matches_started", "90s_played",
    "market_value",
    "position", "position_name_y", "height_x",
    "accurate_crosses.total", "accurate_passes.total", "aerials_won.total",
    "big_chances_created.total", "big_chances_missed.total", "blocked_shots.total",
    "clearances.total", "dribbled_past.total", "duels_won.total",
    "error_lead_to_goal.total", "fouls.total", "goals.goals",
    "interceptions.total", "offsides.total",
    "redcards.away", "redcards.home", "redcards.total",
    "saves_inside_box.total", "shots_off_target.total", "tackles.total_x",
    "through_balls.total", "total_crosses.total", "total_duels.total",
    "yellowcards.away", "yellowcards.home", "yellowcards.total",
    "yellowred_cards.away", "yellowred_cards.home", "yellowred_cards.total",
    "shots.total", "shots.on",
    "goals.total_y", "goals.conceded", "goals.assists", "goals.saves",
    "passes.total_y", "passes.key", "passes.accuracy",
    "tackles.total_y", "tackles.blocks", "tackles.interceptions",
    "duels.total", "duels.won",
    "dribbles.attempts", "dribbles.success", "dribbles.past",
    "fouls.committed",
    "cards.yellow", "cards.yellowred", "cards.red",
    "penalty.commited", "penalty.saved",
]

# Any column not named in `order` is appended afterwards. A list comprehension keeps
# those leftovers in their existing column order; going through a `set` (as before)
# made their order depend on string hashing and therefore vary between kernel runs.
remaining_columns = [c for c in players_selected_vars.columns if c not in order]
players_selected_vars = players_selected_vars[order + remaining_columns]
players_selected_vars

In [None]:
from helpers import TEAM_TIERS

In [None]:
# Attach the TEAM_TIERS columns by matching the "team" column against the TEAM_TIERS index.
# NOTE(review): pd.merge defaults to an inner join, so rows whose team is missing from
# TEAM_TIERS are silently dropped here — confirm that this is intended.
players_selected_vars = pd.merge(players_selected_vars, TEAM_TIERS, left_on="team", right_index=True)

In [None]:
# Replace the "-1" tier marker with a missing value.
# NOTE(review): the comparison is against the string "-1", so this only matches if
# tiers are stored as strings — TODO confirm against helpers.TEAM_TIERS.
players_selected_vars.loc[players_selected_vars["tier"] == "-1", "tier"] = None

In [None]:
# for c in players_selected_vars.columns:
#     players_selected_vars[f"has_{c}"] = (~players_selected_vars[c].isnull()).astype(int)

In [None]:
# Rescale market values with sklearn's "max" norm computed over the whole column
# (axis=0), so the price buckets used later can be expressed as fractions of the maximum.
# `normalize` needs a 2-D array, hence the reshape to a single column and the squeeze back.
mvs = normalize(
    players_selected_vars["market_value"].values.reshape(-1, 1),
    norm="max",
    axis=0,
).squeeze()
players_selected_vars["normalised_market_value"] = mvs

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
# Season-over-season market value change per player, in percent.
# The self-join pairs every row of a player with every other row of the same player;
# only pairs where the right-hand season directly follows the left-hand one are kept.
mv_changes = (
    pd.merge(players_selected_vars, players_selected_vars, on="player_id_api", suffixes=["_l", "_r"])
    .query("season_r - season_l == 1")
    .loc[:, ["player_id_api", "season_r", "market_value_r", "market_value_l"]]
    .assign(
        change=lambda pairs: (pairs["market_value_r"] - pairs["market_value_l"])
        / pairs["market_value_l"]
        * 100
    )
    .drop(columns=["market_value_r", "market_value_l"])
    .rename(columns={"season_r": "season"})
)
mv_changes

## Transformation pipeline

In [None]:
# Numeric inputs: age terms, playing time, height and the season statistics.
numeric_features = [
    "age", "age2",
    "matches_played", "matches_started", "90s_played",
    "height_x",
    "accurate_crosses.total", "accurate_passes.total", "aerials_won.total",
    "big_chances_created.total", "big_chances_missed.total", "blocked_shots.total",
    "clearances.total", "dribbled_past.total", "duels_won.total",
    "error_lead_to_goal.total", "fouls.total", "goals.goals",
    "interceptions.total", "offsides.total",
    "redcards.away", "redcards.home", "redcards.total",
    "saves_inside_box.total", "shots_off_target.total", "tackles.total_x",
    "through_balls.total", "total_crosses.total", "total_duels.total",
    "yellowcards.away", "yellowcards.home", "yellowcards.total",
    "yellowred_cards.away", "yellowred_cards.home", "yellowred_cards.total",
    "shots.total", "shots.on",
    "goals.total_y", "goals.conceded", "goals.assists", "goals.saves",
    "passes.total_y", "passes.key", "passes.accuracy",
    "tackles.total_y", "tackles.blocks", "tackles.interceptions",
    "duels.total", "duels.won",
    "dribbles.attempts", "dribbles.success", "dribbles.past",
    "fouls.committed",
    "cards.yellow", "cards.yellowred", "cards.red",
    "penalty.commited", "penalty.saved",
]
# Missing numeric values are filled with the column median, then standardised.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical inputs are one-hot encoded; categories unseen during fit are ignored
# (encoded as all zeros) instead of raising at predict time.
categorical_features = ["nationality", "league", "team", "tier", "position", "position_name_y"]
categorical_transformer = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Shared preprocessing step; columns not listed above are dropped by ColumnTransformer's default.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# Predict price bucket

In [None]:
# Distribution of raw market values; counts are on a log scale because the
# histogram is otherwise dominated by the low-value bins.
plt.hist(players_selected_vars["market_value"], bins=100)
# plt.xlim((0, 0.5e8))
plt.yscale("log")

In [None]:
# Bucket edges expressed as fractions of the maximum market value
# (they are applied to `normalised_market_value` below).
# NOTE(review): the inline ranges look like approximate values in millions of EUR — confirm.
bins = [
    0.0025,
    0.005,
    0.01,
    0.025,
    0.05, # 
    0.1, # 8 - 20
    0.25, # 20 - 40
    0.5,  # 40 -80
    1.0  # 80M
]
# np.digitize (default right=False) returns labels 0..len(bins): 0 for values below the
# first edge and len(bins) (= 9) for values >= 1.0, i.e. only the maximum-valued rows.
y = np.digitize(players_selected_vars["normalised_market_value"], bins)
# Alternative bucketing on absolute market values (disabled):
# bins = np.concatenate([
#     np.arange(200_000, 4_000_000, 200_000),
#     np.arange(4_000_000, 20_000_000, 2_000_000),
#     np.arange(20_000_000, 100_000_000, 5_000_000),
#     np.arange(100_000_000, 200_000_001, 10_000_000)
# ])
# y = np.digitize(players_selected_vars["market_value"], bins)
# The full frame is passed as X; the preprocessor picks the feature columns it needs.
X = players_selected_vars

In [None]:
# One boolean mask per position. They are split together with X and y so that the
# resulting *_train / *_test masks stay row-aligned with the corresponding split.
goalkeepers, attackers, defenders, midfielders = (
    (X["position"] == position).values
    for position in ("Goalkeeper", "Attack", "Defender", "Midfield")
)

split = train_test_split(
    X, y, goalkeepers, attackers, defenders, midfielders,
    test_size=0.2, random_state=42,
)
# train_test_split returns a (train, test) pair for every array passed in, in order.
(
    X_train, X_test,
    y_train, y_test,
    goalkeepers_train, goalkeepers_test,
    attackers_train, attackers_test,
    defenders_train, defenders_test,
    midfielders_train, midfielders_test,
) = split

In [None]:
# Class balance check: bucket labels in the training split and how often each occurs.
np.unique(y_train, return_counts=True)

In [None]:
# Chance-level baseline: predicts buckets at random according to the training class
# distribution. The seed makes the baseline reproducible — without it every run, and
# every later `random.predict(...)` call, produced different predictions.
random = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", DummyClassifier(strategy="stratified", random_state=42)),
    ]
)
random.fit(X_train, y_train)

y_random = random.predict(X_test)

balanced_accuracy_score(y_test, y_random)

In [None]:
# Balanced accuracy of the random baseline on each position's share of the test split.
for name, selector in zip(
    ("goalkeepers", "attackers", "defenders", "midfielders"),
    (goalkeepers_test, attackers_test, defenders_test, midfielders_test),
):
    print(name, balanced_accuracy_score(y_test[selector], random.predict(X_test[selector])))

In [None]:
# Logistic regression on top of the shared preprocessing, evaluated on the held-out split.
clf = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression()),
]).fit(X_train, y_train)

y_lr = clf.predict(X_test)
balanced_accuracy_score(y_test, y_lr)

In [None]:
# Balanced accuracy of the logistic regression on each position's share of the test split.
for name, selector in zip(
    ("goalkeepers", "attackers", "defenders", "midfielders"),
    (goalkeepers_test, attackers_test, defenders_test, midfielders_test),
):
    print(name, balanced_accuracy_score(y_test[selector], clf.predict(X_test[selector])))

Only the goalkeeper-specific model performs better on its position than the "all positions" model.
On the other positions, the generic model performs similarly or marginally better.

In [None]:
from sklearn.neural_network import MLPClassifier

# Small two-layer MLP (16 and 8 hidden units) trained with L-BFGS.
# The weights are initialised randomly, so the seed is fixed to make the reported
# score reproducible across runs (consistent with the seeded train/test split).
mlp = Pipeline(
    steps=[
        ("preprocessor", preprocessor), 
        ("classifier", MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(16, 8), random_state=42)),
    ]
)
mlp.fit(X_train, y_train)

y_mlp = mlp.predict(X_test)

balanced_accuracy_score(y_test, y_mlp)

In [None]:
# Balanced accuracy of the MLP on each position's share of the test split.
for name, selector in zip(
    ("goalkeepers", "attackers", "defenders", "midfielders"),
    (goalkeepers_test, attackers_test, defenders_test, midfielders_test),
):
    print(name, balanced_accuracy_score(y_test[selector], mlp.predict(X_test[selector])))

The models specific to goalkeepers and midfielders are slightly better than the single generic model.

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees on the shared preprocessing (learning rate fixed at 1).
xgboost = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", XGBClassifier(learning_rate=1)),
]).fit(X_train, y_train)

y_xgboost = xgboost.predict(X_test)
balanced_accuracy_score(y_test, y_xgboost)

In [None]:
# Balanced accuracy of XGBoost on each position's share of the test split.
for name, selector in zip(
    ("goalkeepers", "attackers", "defenders", "midfielders"),
    (goalkeepers_test, attackers_test, defenders_test, midfielders_test),
):
    print(name, balanced_accuracy_score(y_test[selector], xgboost.predict(X_test[selector])))

In [None]:
from collections import defaultdict

# Confusion counts for the XGBoost bucket classifier:
# row index = observed bucket (y_test), column index = predicted bucket (y_xgboost).
# 10 x 10 because the nine bucket edges produce labels 0..9.
arr = np.zeros((10, 10))
# np.add.at accumulates correctly when the same (row, column) pair occurs repeatedly.
np.add.at(arr, (y_test, y_xgboost), 1)

In [None]:
# Raw confusion counts (rows: observed bucket, columns: predicted bucket).
arr

In [None]:
# Heat map of the confusion counts. `arr` is indexed as arr[observed, predicted] and
# imshow draws rows along the y-axis and columns along the x-axis, so the x-axis is
# the predicted bucket and the y-axis the observed one (the labels were swapped before).
plt.imshow(arr, cmap='hot')
plt.xlabel("Predicted")
plt.ylabel("Observed")
_ = plt.show()

In [None]:
# Distribution of the signed bucket error (observed - predicted):
# `a` holds the distinct error values, `b` how many test rows have each.
a, b = x = np.unique(y_test - y_xgboost, return_counts=True)

In [None]:
# Print each bucket error with its count, then show the total number of test rows.
for q, w in zip(a, b):
    print(q, w)
cnt = sum(b)
cnt

In [None]:
# Share of test rows whose predicted bucket is at most one bucket away from the
# observed one. Computed from the current predictions instead of the previously
# hard-coded counts, (583 + 1397 + 569) / 2951, which were copied from an earlier
# run's output and go stale whenever the data or model changes.
# NOTE(review): assumes those three counts were the errors -1, 0 and +1 — confirm.
np.mean(np.abs(y_test - y_xgboost) <= 1)

In [None]:
# Which buckets the logistic regression actually predicts, and how often.
np.unique(y_lr, return_counts=True)

## Feature importance

In [None]:
# Permutation importance of every column of X_test for the XGBoost bucket classifier.
# The whole pipeline is passed in, so raw input columns are shuffled (not the
# one-hot/standardised features), giving one importance value per X_test column.
result = permutation_importance(xgboost, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2)
forest_importances = pd.Series(result.importances_mean, index=X_test.columns)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set(
    title="Feature importances using permutation on full model",
    ylabel="Mean accuracy decrease",
)
fig.tight_layout()
plt.show()

# Predict market value

In [None]:
from sklearn.metrics import r2_score

In [None]:
def r2_adj(model, x, y):
    """Return the adjusted R² of ``model`` evaluated on ``(x, y)``.

    Parameters
    ----------
    model : fitted estimator exposing ``score(x, y)`` (R² for regressors).
    x : 2-D array-like with a ``shape``; ``shape[0]`` is used as the number of
        observations ``n`` and ``shape[1]`` as the number of predictors ``p``.
    y : target values passed through to ``model.score``.

    Returns
    -------
    float
        ``1 - (1 - R²) * (n - 1) / (n - p - 1)``, or ``nan`` when ``n - p - 1 <= 0``
        (adjusted R² is undefined without residual degrees of freedom; previously
        this raised ZeroDivisionError or returned a meaningless value).

    Notes
    -----
    ``p`` counts the columns of ``x`` as passed in, not the number of features the
    model sees after any preprocessing inside a pipeline.
    """
    r2 = model.score(x, y)
    n, p = x.shape[0], x.shape[1]
    residual_dof = n - p - 1
    if residual_dof <= 0:
        return float("nan")
    return 1 - (1 - r2) * (n - 1) / residual_dof

def r2_adj2(x, y_test, y_pred):
    """Return the adjusted R² of predictions ``y_pred`` against ``y_test``.

    Parameters
    ----------
    x : 2-D array-like with a ``shape``; ``shape[0]`` is used as the number of
        observations ``n`` and ``shape[1]`` as the number of predictors ``p``.
    y_test : observed target values.
    y_pred : predicted target values, in the same order as ``y_test``.

    Returns
    -------
    float
        ``1 - (1 - R²) * (n - 1) / (n - p - 1)``, or ``nan`` when ``n - p - 1 <= 0``
        (adjusted R² is undefined without residual degrees of freedom; previously
        this raised ZeroDivisionError or returned a meaningless value).

    Notes
    -----
    ``p`` counts the columns of ``x`` as passed in, not the number of features a
    model sees after any preprocessing inside a pipeline.
    """
    r2 = r2_score(y_test, y_pred)
    n, p = x.shape[0], x.shape[1]
    residual_dof = n - p - 1
    if residual_dof <= 0:
        return float("nan")
    return 1 - (1 - r2) * (n - 1) / residual_dof

In [None]:
# Players who have at least one pair of consecutive seasons with an identical market
# value. The self-join pairs all rows of the same player (default suffixes _x / _y).
players_with_same_values = set(
    pd.merge(players_selected_vars, players_selected_vars, on="player_id_api")
    .query("season_x - season_y == 1")
    .query("market_value_x == market_value_y")
    .loc[:, "player_id_api"]
    .unique()
)

In [None]:
# Regression data set: only player-seasons with more than 15 appearances.
# The target is the natural log of the market value.
# (Disabled alternative: exclude `players_with_same_values` instead of filtering on appearances.)
X = players_selected_vars.loc[players_selected_vars["matches_played"] > 15]
y = np.log(X["market_value"])

In [None]:
# Time-based split: fit on seasons up to and including 2020, evaluate on season 2021.
in_train = X["season"] <= 2020
in_test = X["season"] == 2021
X_train, y_train = X.loc[in_train], y.loc[in_train]
X_test, y_test = X.loc[in_test], y.loc[in_test]
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Boolean row masks per position, separately for the train and test frames.
position_names = ["Goalkeeper", "Attack", "Defender", "Midfield"]
selectors_train = {p: (X_train["position"] == p).values for p in position_names}
selectors_test = {p: (X_test["position"] == p).values for p in position_names}

In [None]:
# goalkeepers = (X["position"] == "Goalkeeper").values
# attackers = (X["position"] == "Attack").values
# defenders = (X["position"] == "Defender").values
# midfielders = (X["position"] == "Midfield").values

# (
#     X_train, X_test, 
#     y_train, y_test, 
#     goalkeepers_train, goalkeepers_test,
#     attackers_train, attackers_test,
#     defenders_train, defenders_test, 
#     midfielders_train, midfielders_test
# ) = train_test_split(
#     X, y, 
#     goalkeepers, attackers, defenders, midfielders,
#     test_size=0.2, random_state=42
# )

In [None]:
# Constant baseline: DummyRegressor with default settings always predicts the
# mean of the training target.
random_regression = Pipeline([
    ("preprocessor", preprocessor),
    ("regression", DummyRegressor()),
]).fit(X_train, y_train)

# R² and adjusted R² (on the log target), first for the train split, then for the test split.
for _X, _y in [(X_train, y_train), (X_test, y_test)]:
    print(random_regression.score(_X, _y), r2_adj(random_regression, _X, _y))

In [None]:
# Baseline R² / adjusted R² per position on the test split.
for name, selector in selectors_test.items():
    X_pos, y_pos = X_test[selector], y_test[selector]
    print(name, random_regression.score(X_pos, y_pos), r2_adj(random_regression, X_pos, y_pos))

In [None]:
# "Previous season" baseline: predict each 2021 value with the player's 2020 value.
# y_2020 is the mean raw (not log) market value per player over their 2020 rows.
y_2020 = X_train.loc[X_train["season"] == 2020].groupby("player_id_api")["market_value"].mean()
# Right join onto X_test keeps every test row. Both sides have a "market_value" column,
# so the merge suffixes them: "market_value_x" is the 2020 value from y_2020 and
# "market_value_y" would be X_test's own value. Test players without a 2020 row get
# NaN from the join and are filled with the overall mean of the 2020 values.
y_prev = pd.merge(
    y_2020,
    X_test,
    left_index=True,
    right_on="player_id_api",
    how="right"
)["market_value_x"].fillna(y_2020.mean())

In [None]:
# X_test2 = X_test[~pd.isna(y_prev)]
# y_test2 = y_test[~pd.isna(y_prev)]
# y_prev2 = y_prev[~pd.isna(y_prev)]

In [None]:
# Score the previous-season baseline on the raw scale: y_test holds log market values,
# so it is exponentiated back before comparing with y_prev (which is not logged).
y_test_raw = np.exp(y_test)
r2_score(y_test_raw, y_prev), r2_adj2(X_test, y_test_raw, y_prev)

In [None]:
# Previous-season baseline per position, scored on the raw (non-log) scale.
for name, selector in selectors_test.items():
    observed = np.exp(y_test)[selector]
    baseline = y_prev[selector]
    print(name, r2_score(observed, baseline), r2_adj2(X_test[selector], observed, baseline))

In [None]:
# Ordinary least squares on the log market value, using the shared preprocessing.
regression = Pipeline([
    ("preprocessor", preprocessor),
    ("regression", LinearRegression()),
]).fit(X_train, y_train)

y_reg = regression.predict(X_test)

# Scores are reported on the raw scale, so both target and prediction are exponentiated.
r2_score(np.exp(y_test), np.exp(y_reg)), r2_adj2(X_test, np.exp(y_test), np.exp(y_reg))

In [None]:
# Linear regression per position, scored on the raw (non-log) scale.
# The per-position predictions get their own name: reusing `y_reg` here overwrote the
# notebook-level full-test predictions with those of the last position in the loop.
for name, selector in selectors_test.items():
    y_reg_position = regression.predict(X_test[selector])
    print(name, r2_score(np.exp(y_test[selector]), np.exp(y_reg_position)), r2_adj2(X_test[selector], np.exp(y_test[selector]), np.exp(y_reg_position)))
    # print(name, regression.score(X_test[selector], y_test[selector]), r2_adj(regression, X_test[selector], y_test[selector]))

Only the midfielder-specific model is slightly better than the generic one.

In [None]:
from sklearn.neural_network import MLPRegressor

# Small two-layer MLP (16 and 8 hidden units) trained with L-BFGS on the log market value.
# The weights are initialised randomly, so the seed is fixed to make the reported
# scores reproducible across runs.
mlp_reg = Pipeline(
    steps=[
        ("preprocessor", preprocessor), 
        ("regression", MLPRegressor(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(16, 8), random_state=42)),
    ]
)
mlp_reg.fit(X_train, y_train)

y_mlp = mlp_reg.predict(X_test)

# mlp_reg.score(X_test, y_test), r2_adj(mlp_reg, X_test, y_test)
# Scores are reported on the raw scale, so both target and prediction are exponentiated.
r2_score(np.exp(y_test), np.exp(y_mlp)), r2_adj2(X_test, np.exp(y_test), np.exp(y_mlp))

In [None]:
# MLP regression per position, scored on the raw (non-log) scale.
# The per-position predictions get their own name: reusing `y_mlp` here overwrote the
# notebook-level full-test predictions with those of the last position in the loop.
for name, selector in selectors_test.items():
    y_mlp_position = mlp_reg.predict(X_test[selector])
    print(name, r2_score(np.exp(y_test[selector]), np.exp(y_mlp_position)), r2_adj2(X_test[selector], np.exp(y_test[selector]), np.exp(y_mlp_position)))
    # print(name, mlp_reg.score(X_test[selector], y_test[selector]), r2_adj(mlp_reg, X_test[selector], y_test[selector]))

The "all positions" model performs better than the position-specific models.

In [None]:
from xgboost import XGBRegressor

# XGBoost regressor with default hyper-parameters, fitted on the log market value.
xgboost_reg = Pipeline([
    ("preprocessor", preprocessor),
    ("regression", XGBRegressor()),
]).fit(X_train, y_train)

y_xgboost = xgboost_reg.predict(X_test)

# Printed in order: train R², train adjusted R², test R², test adjusted R² (log scale).
print(
    xgboost_reg.score(X_train, y_train),
    r2_adj(xgboost_reg, X_train, y_train),
    xgboost_reg.score(X_test, y_test),
    r2_adj(xgboost_reg, X_test, y_test),
)

In [None]:
# Regularised variant that replaces the default model above:
# many shallow trees (depth 2) with a strong L2 penalty and no L1 penalty.
xgboost_reg = Pipeline([
    ("preprocessor", preprocessor),
    ("regression", XGBRegressor(n_estimators=1000, max_depth=2, reg_lambda=40, reg_alpha=0)),
]).fit(X_train, y_train)

# Printed in order: train R², train adjusted R², test R², test adjusted R² (log scale).
print(
    xgboost_reg.score(X_train, y_train),
    r2_adj(xgboost_reg, X_train, y_train),
    xgboost_reg.score(X_test, y_test),
    r2_adj(xgboost_reg, X_test, y_test),
)

In [None]:
# XGBoost regression R² / adjusted R² per position on the test split (log scale).
for name, selector in selectors_test.items():
    X_pos, y_pos = X_test[selector], y_test[selector]
    print(name, xgboost_reg.score(X_pos, y_pos), r2_adj(xgboost_reg, X_pos, y_pos))

In [None]:
# Observed vs. predicted log market value on the TRAINING split, one colour per model.
plt.scatter(y_train, regression.predict(X_train), alpha=0.5, label="lr")
plt.scatter(y_train, random_regression.predict(X_train), alpha=0.5, label="random")
# plt.scatter(y_train, mlp_reg.predict(X_train), alpha=0.5, label="MLP")
plt.scatter(y_train, xgboost_reg.predict(X_train), alpha=0.5, label="XGBoost")
# Diagonal = perfect prediction.
plt.axline((0, 0), (1, 1), c="k")
# Both axis limits come from the training target. The upper limit previously used
# y_test.max(), which clips training points whenever the training maximum is larger.
plt.xlim(np.floor(y_train.min())-0.1, np.ceil(y_train.max())+0.1)
plt.ylim(np.floor(y_train.min())-0.1, np.ceil(y_train.max())+0.1)
plt.xlabel("Observed")
plt.ylabel("Predicted")
plt.legend()
_ = plt.show()

In [None]:
# Observed vs. predicted log market value on the TEST split. The previous-season
# baseline is on the raw scale, so it is logged to be comparable with the others.
for predictions, label in (
    (regression.predict(X_test), "lr"),
    (xgboost_reg.predict(X_test), "XGBoost"),
    (np.log(y_prev), "prev_season"),
):
    plt.scatter(y_test, predictions, alpha=0.25, label=label)

# Diagonal = perfect prediction.
plt.axline((0, 0), (1, 1), c="k")
for set_limits in (plt.xlim, plt.ylim):
    set_limits(np.floor(y_test.min())-0.1, np.ceil(y_test.max())+0.1)
plt.xlabel("Observed")
plt.ylabel("Predicted")
plt.legend()
_ = plt.show()

In [None]:
# Per-row errors of the XGBoost regressor on the raw (non-log) scale,
# sorted from the smallest to the largest absolute relative error (in percent).
reg_results = (
    pd.DataFrame({
        "y": np.exp(y_test),
        "y_pred": np.exp(xgboost_reg.predict(X_test)),
    })
    .assign(diff=lambda r: r["y_pred"] - r["y"])
    .assign(rel_error=lambda r: r["diff"] / r["y"] * 100)
    .assign(abs_rel_error=lambda r: np.abs(r["rel_error"]))
    .sort_values("abs_rel_error")
)
reg_results

In [None]:
# Running mean of the absolute relative error over the best-predicted x% of players.
# reg_results is already sorted by error, so rank() gives each row's position and
# cumsum() / rank() is the mean error over all rows up to that position.
abs_rel_error = reg_results["abs_rel_error"]
error_rank = abs_rel_error.rank()
plt.plot(
    error_rank / len(abs_rel_error) * 100,
    abs_rel_error.cumsum() / error_rank
)
plt.xticks(np.linspace(0, 1, 11)*100)
plt.xlabel("% of players (any position with at least 15 appearance)")
plt.ylabel("Error [%]")
plt.grid()
plt.show()

In [None]:
# Mean absolute relative error (in percent) over all test rows.
reg_results["abs_rel_error"].mean()

## Feature importance

In [None]:
# Permutation importance of every X_test column for the XGBoost regressor.
# No `scoring` is given, so the pipeline's own score method is used.
result = permutation_importance(
    xgboost_reg, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
)

In [None]:
# Plot only the columns whose mean importance is not negligible (|importance| > 1e-3).
non_zero_feats = np.abs(result.importances_mean) > 1e-03
forest_importances = pd.Series(result.importances_mean[non_zero_feats], index=X_test.columns.values[non_zero_feats])

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std[non_zero_feats], ax=ax)
ax.set_title("Feature importances using permutation on full model")
# The importances come from a regressor scored with its default metric (R²),
# so the axis shows a drop in R², not in accuracy as previously labelled.
ax.set_ylabel("Mean R² decrease")
fig.tight_layout()
plt.show()

# Predict change in market value between seasons

In [None]:
# One row per pair of consecutive seasons of the same player: every column appears
# twice, suffixed "_before" (earlier season) and "_after" (the following season).
season_pairs = pd.merge(
    players_selected_vars,
    players_selected_vars,
    on="player_id_api",
    suffixes=["_before", "_after"],
)
before_after_seasons = season_pairs.query("season_after - season_before == 1")

In [None]:
# Numeric inputs for the before/after model: each base column is used twice,
# once with the "_before" and once with the "_after" suffix (in that order).
numeric_features_ba = [
    f"{c}_{suffix}"
    for c in [
        "age",
        "matches_played", "matches_started", "90s_played",
        "height_x",
        "accurate_crosses.total", "accurate_passes.total", "aerials_won.total",
        "big_chances_created.total", "big_chances_missed.total", "blocked_shots.total",
        "clearances.total", "dribbled_past.total", "duels_won.total",
        "error_lead_to_goal.total", "fouls.total", "goals.goals",
        "interceptions.total", "offsides.total",
        "redcards.away", "redcards.home", "redcards.total",
        "saves_inside_box.total", "shots_off_target.total", "tackles.total_x",
        "through_balls.total", "total_crosses.total", "total_duels.total",
        "yellowcards.away", "yellowcards.home", "yellowcards.total",
        "yellowred_cards.away", "yellowred_cards.home", "yellowred_cards.total",
        "shots.total", "shots.on",
        "goals.total_y", "goals.conceded", "goals.assists", "goals.saves",
        "passes.total_y", "passes.key", "passes.accuracy",
        "tackles.total_y", "tackles.blocks", "tackles.interceptions",
        "duels.total", "duels.won",
        "dribbles.attempts", "dribbles.success", "dribbles.past",
        "fouls.committed",
        "cards.yellow", "cards.yellowred", "cards.red",
        "penalty.commited", "penalty.saved",
    ]
    for suffix in ["before", "after"]
]
# Missing numeric values are filled with the column median, then standardised.
numeric_transformer_ba = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical inputs, again once per season of the pair.
categorical_features_ba = [
    f"{c}_{suffix}"
    for c in ["nationality", "league", "team", "tier", "position"]
    for suffix in ["before", "after"]
]
# One-hot encoding; categories unseen during fit are ignored instead of raising.
categorical_transformer_ba = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor_ba = ColumnTransformer([
    ("num", numeric_transformer_ba, numeric_features_ba),
    ("cat", categorical_transformer_ba, categorical_features_ba),
])

In [None]:
# Target: direction of the price-bucket change between the two seasons,
# clipped to -1 (moved down), 0 (same bucket) or +1 (moved up).
X = before_after_seasons
y_before = np.digitize(X["normalised_market_value_before"], bins)
y_after = np.digitize(X["normalised_market_value_after"], bins)
y = np.clip(y_after - y_before, -1, 1)

In [None]:
# One boolean mask per position (taken from the earlier season of each pair). They are
# split together with X and y so the *_train / *_test masks stay row-aligned.
goalkeepers, attackers, defenders, midfielders = (
    (X["position_before"] == position).values
    for position in ("Goalkeeper", "Attack", "Defender", "Midfield")
)

split = train_test_split(
    X, y, goalkeepers, attackers, defenders, midfielders,
    test_size=0.2, random_state=42,
)
# train_test_split returns a (train, test) pair for every array passed in, in order.
(
    X_train, X_test,
    y_train, y_test,
    goalkeepers_train, goalkeepers_test,
    attackers_train, attackers_test,
    defenders_train, defenders_test,
    midfielders_train, midfielders_test,
) = split

In [None]:
# Chance-level baseline for the bucket-change task: predicts classes at random according
# to the training class distribution. The seed makes the baseline reproducible — without
# it every run, and every later `random.predict(...)` call, gave different predictions.
random = Pipeline(
    steps=[
        ("preprocessor", preprocessor_ba),
        ("classifier", DummyClassifier(strategy="stratified", random_state=42)),
    ]
)
random.fit(X_train, y_train)

balanced_accuracy_score(y_test, random.predict(X_test))

In [None]:
# Balanced accuracy of the random baseline on each position's share of the test split.
for name, selector in zip(
    ("goalkeepers", "attackers", "defenders", "midfielders"),
    (goalkeepers_test, attackers_test, defenders_test, midfielders_test),
):
    print(name, balanced_accuracy_score(y_test[selector], random.predict(X_test[selector])))

In [None]:
# Logistic regression on the before/after features.
clf = Pipeline([
    ("preprocessor", preprocessor_ba),
    ("classifier", LogisticRegression()),
]).fit(X_train, y_train)

y_clf = clf.predict(X_test)

# Confusion matrix with rows/columns ordered as: up (1), unchanged (0), down (-1).
print(confusion_matrix(y_test, y_clf, labels=[1, 0, -1]))

balanced_accuracy_score(y_test, y_clf)

In [None]:
# Balanced accuracy of the logistic regression on each position's share of the test split.
for name, selector in zip(
    ("goalkeepers", "attackers", "defenders", "midfielders"),
    (goalkeepers_test, attackers_test, defenders_test, midfielders_test),
):
    print(name, balanced_accuracy_score(y_test[selector], clf.predict(X_test[selector])))

The "all positions" model performs better on all positions.

In [None]:
from sklearn.neural_network import MLPClassifier

# Two-layer MLP (32 and 16 hidden units) trained with L-BFGS on the before/after features.
# The weights are initialised randomly, so the seed is fixed to make the confusion
# matrix and score reproducible across runs.
mlp = Pipeline(
    steps=[
        ("preprocessor", preprocessor_ba), 
        ("classifier", MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(32, 16), random_state=42)),
    ]
)
mlp.fit(X_train, y_train)

# Confusion matrix with rows/columns ordered as: up (1), unchanged (0), down (-1).
print(confusion_matrix(y_test, mlp.predict(X_test), labels=[1, 0, -1]))

balanced_accuracy_score(y_test, mlp.predict(X_test))

In [None]:
# Balanced accuracy of the MLP on each position's share of the test split.
for name, selector in zip(
    ("goalkeepers", "attackers", "defenders", "midfielders"),
    (goalkeepers_test, attackers_test, defenders_test, midfielders_test),
):
    print(name, balanced_accuracy_score(y_test[selector], mlp.predict(X_test[selector])))

Only the midfielder-specific model performs marginally better.

In [None]:
from xgboost import XGBClassifier

# XGBoost on the before/after features. The target is shifted by +1 so the classes
# -1 / 0 / 1 become 0 / 1 / 2 when fitting; predictions are therefore in 0..2 as well
# and are compared against the equally shifted test labels.
xgboost = Pipeline([
    ("preprocessor", preprocessor_ba),
    ("classifier", XGBClassifier()),
]).fit(X_train, y_train + 1)

y_xgboost = xgboost.predict(X_test)
balanced_accuracy_score(y_test + 1, y_xgboost)

In [None]:
# Balanced accuracy of XGBoost per position; test labels are shifted by +1 to match
# the 0..2 encoding the model was fitted with.
for name, selector in zip(
    ("goalkeepers", "attackers", "defenders", "midfielders"),
    (goalkeepers_test, attackers_test, defenders_test, midfielders_test),
):
    print(name, balanced_accuracy_score(y_test[selector]+1, xgboost.predict(X_test[selector])))

Here XGBoost performs better than the MLP, and comparably to the position-specific models.

## Feature importance

In [None]:
# Permutation importance of every X_test column for the XGBoost bucket-change classifier.
# The model was fitted on `y_train + 1` (classes 0..2), so the reference labels must be
# shifted the same way. Scoring against the unshifted y_test (-1..1) compared predictions
# with the wrong classes and made the reported importances meaningless.
result = permutation_importance(
    xgboost, X_test, y_test + 1, n_repeats=10, random_state=42, n_jobs=2
)

forest_importances = pd.Series(result.importances_mean, index=X_test.columns)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()