**João Lucas Lage Gonçalves**

**23052002**

# Descrição


# Carregar os dados

In [3]:
!pip install socceraction
!pip install statsbombpy
!pip install --upgrade bottleneck

Collecting bottleneck
  Obtaining dependency information for bottleneck from https://files.pythonhosted.org/packages/76/ab/3e95d162d356c853b7c0c084871900d5bdce5e9ad5479396d9641c2dee99/Bottleneck-1.4.0-cp311-cp311-win_amd64.whl.metadata
  Downloading Bottleneck-1.4.0-cp311-cp311-win_amd64.whl.metadata (8.1 kB)
Downloading Bottleneck-1.4.0-cp311-cp311-win_amd64.whl (111 kB)
   ---------------------------------------- 0.0/111.6 kB ? eta -:--:--
   --- ------------------------------------ 10.2/111.6 kB ? eta -:--:--
   ---------- ---------------------------- 30.7/111.6 kB 435.7 kB/s eta 0:00:01
   -------------------------------------- 111.6/111.6 kB 921.6 kB/s eta 0:00:00
Installing collected packages: bottleneck
  Attempting uninstall: bottleneck
    Found existing installation: Bottleneck 1.3.5
    Uninstalling Bottleneck-1.3.5:
      Successfully uninstalled Bottleneck-1.3.5
Successfully installed bottleneck-1.4.0


In [5]:
from statsbombpy import sb
import pandas as pd
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import brier_score_loss, log_loss, roc_auc_score, accuracy_score
import xgboost as xgb

import socceraction.spadl as spadl
from socceraction.data.wyscout import PublicWyscoutLoader
from socceraction.vaep import features as ft
import socceraction.vaep.labels as lb
import socceraction.vaep.formula as vaepformula


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [6]:
DATA_DIR = 'data'
WYL = PublicWyscoutLoader(root=DATA_DIR)
TRAIN_COMPETITIONS = ['German first division', 'Italian first division', 'French first division', 'Spanish first division', 'English first division']
TEST_COMPETITIONS = ['World Cup', 'European Championship']

competitions = WYL.competitions()

train_competions = competitions[competitions.competition_name.isin(TRAIN_COMPETITIONS)]
test_competions = competitions[competitions.competition_name.isin(TEST_COMPETITIONS)]

train_games = pd.concat([
    WYL.games(competition_id=competition.competition_id, season_id=competition.season_id)
    for competition in train_competions.itertuples()
])

test_games = pd.concat([
    WYL.games(competition_id=competition.competition_id, season_id=competition.season_id)
    for competition in test_competions.itertuples()
])

test_games.head()


Unnamed: 0,game_id,competition_id,season_id,game_date,game_day,home_team_id,away_team_id
0,1694440,102,9291,2016-07-10 19:00:00,0,9905,4418
1,1694439,102,9291,2016-07-07 19:00:00,0,3148,4418
2,1694438,102,9291,2016-07-06 19:00:00,0,9905,10682
3,1694437,102,9291,2016-07-03 19:00:00,0,4418,7839
4,1694436,102,9291,2016-07-02 19:00:00,0,3148,3757


In [7]:
games_verbose = list(train_games.itertuples())

training_actions = []
for game in tqdm(games_verbose, desc="Converting training games to SPADL ({} games)".format(len(games_verbose)), total=len(games_verbose)):
    events = WYL.events(game.game_id)
    events = events.rename(columns={'id': 'event_id', 'eventId': 'type_id', 'subEventId': 'subtype_id',
                            'teamId': 'team_id', 'playerId': 'player_id', 'matchId': 'game_id'})
    actions_game = spadl.wyscout.convert_to_actions(events, game.home_team_id)
    actions_game = spadl.play_left_to_right(actions=actions_game, home_team_id=game.home_team_id)
    actions_game = spadl.add_names(actions_game)
    actions_game['home_team_id'] = game.home_team_id
    training_actions.append(actions_game)

training_df = pd.concat(training_actions).reset_index(drop=True)

Converting training games to SPADL (1826 games):  10%|▉         | 179/1826 [02:38<24:22,  1.13it/s]


KeyboardInterrupt: 

In [None]:
games_verbose = list(test_games.itertuples())

test_actions = []
for game in tqdm(games_verbose, desc="Converting test games to SPADL ({} games)".format(len(games_verbose)), total=len(games_verbose)):
    events = WYL.events(game.game_id)
    events = events.rename(columns={'id': 'event_id', 'eventId': 'type_id', 'subEventId': 'subtype_id',
                            'teamId': 'team_id', 'playerId': 'player_id', 'matchId': 'game_id'})
    actions_game = spadl.wyscout.convert_to_actions(events, game.home_team_id)
    actions_game = spadl.play_left_to_right(actions=actions_game, home_team_id=game.home_team_id)
    actions_game = spadl.add_names(actions_game)
    actions_game['home_team_id'] = game.home_team_id
    test_actions.append(actions_game)

test_df = pd.concat(test_actions).reset_index(drop=True)

In [None]:
players = pd.read_json(DATA_DIR+'\players.json')
players.head()

In [None]:
teams = pd.read_json(DATA_DIR+'\\teams.json')
teams.head()

# Tratamento de dados

In [None]:
test_games['game_date'] = pd.to_datetime(test_games['game_date'])
train_games['game_date'] = pd.to_datetime(train_games['game_date'])

test_date = test_games['game_date'].min()
train_date = train_games['game_date'].min()

print(f"Train date: {train_date}")
print(f"Test date: {test_date}")

In [None]:
players['player_name'] = players['firstName'] + ' ' + players['lastName']
players['player_country'] = players['passportArea'].apply(lambda x: x['name'] if x is not None else None)
players['birthDate'] = pd.to_datetime(players['birthDate'])
players['train_age'] = (train_date - players['birthDate']).dt.days // 365
players['test_age'] = (test_date - players['birthDate']).dt.days // 365

players['player_name'] = players['player_name'].str.decode('unicode-escape')
players['player_country'] = players['player_country'].str.decode('unicode-escape')
players['shortName'] = players['shortName'].str.decode('unicode-escape')
players = players.rename(columns={'wyId': 'player_id', 'currentTeamId': 'team_id'})

players = players[['player_id', 'player_name', 'shortName', 'player_country', 'train_age', 'test_age', 'team_id']]

players.head()

In [None]:
teams['team_country'] = teams['area'].apply(lambda x: x['name'] if x is not None else None)
teams['name'] = teams['name'].str.decode('unicode-escape')
teams['team_country'] = teams['team_country'].str.decode('unicode-escape')

teams = teams.rename(columns={'wyId': 'team_id', 'name': 'team_name'})

teams = teams[['team_id', 'team_name', 'team_country']]

teams.head()

# Feature Engeneering

In [None]:
def createFeatures(actions):
    xfns = [
        ft.actiontype_onehot,
        ft.bodypart_onehot,
        ft.result_onehot,
        ft.goalscore,
        ft.startlocation,
        ft.endlocation,
        ft.movement,
        ft.space_delta,
        ft.startpolar,
        ft.endpolar,
        ft.team,
        ft.time,
        ft.time_delta
    ]

    features = []
    for game in tqdm(actions.game_id.unique(), desc="Creating features"):
        actions_game = actions[actions.game_id==game].reset_index(drop=True)
        match_states = ft.gamestates(actions=actions_game)
        match_features = pd.concat([fn(match_states) for fn in xfns], axis=1)
        features.append(match_features)

    features = pd.concat(features).reset_index(drop=True)
    return features

In [None]:
training_features = createFeatures(training_df)
test_features = createFeatures(test_df)

test_features.head()

# Gerando Labels

In [None]:
def createLabels(actions):
    yfns = [
        lb.scores,
        lb.concedes
    ]

    labels = []
    for game in tqdm(actions.game_id.unique(), desc="Creating labels"):
        actions_game = actions[actions.game_id==game].reset_index(drop=True)
        labels.append(pd.concat([fn(actions=actions_game) for fn in yfns], axis=1))

    labels = pd.concat(labels).reset_index(drop=True)
    return labels

In [None]:
training_labels = createLabels(actions=training_df)
test_labels = createLabels(actions=test_df)

test_labels.head()

# Model Creation   

In [None]:
class XGBoostClassifier():
    def __init__(self, n_estimators, max_depth, n_jobs, verbosity):
        self.model = xgb.XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, n_jobs=n_jobs, verbosity=verbosity)
    
    def fit(self, X, y):    
        self.model.fit(X, y)
    
    def predict_proba(self, X):
        return self.model.predict_proba(X)[:, 1].reshape(-1, 1)
    
    def predict(self, X):
        return self.model.predict(X).reshape(-1, 1)
    
    def evaluate(self, X, y):
        y_pred = self.predict(X)
        y_proba = self.predict_proba(X)
        y_true = y.values
        
        metrics = {}
        metrics = {
            "brier": brier_score_loss(y_true, y_proba),
            "log_loss": log_loss(y_true, y_proba),
            "roc_auc": roc_auc_score(y_true, y_proba),
            "accuracy": accuracy_score(y_true, y_pred)
        }
        return metrics

# Execução

In [None]:
list_n_esn_estimators = [25,50,100]
list_max_depth = [3,5,9]
verbosity=1
n_jobs=-3

In [None]:
metrics_list = []

In [None]:
for n_estimators in list_n_esn_estimators:
    for max_depth in list_max_depth:
        for label in test_labels.columns:

            model_path = f"./models/xgbc_{label}_est{n_estimators}_dph{max_depth}.pkl"

            if os.path.exists(model_path):
                xgbc = XGBoostClassifier(n_estimators=n_estimators, max_depth=max_depth, n_jobs=n_jobs, verbosity=verbosity)
                xgbc.model.load_model(model_path)
                metrics = xgbc.evaluate(test_features, test_labels[label])
            else:
                xgbc = XGBoostClassifier(n_estimators=n_estimators, max_depth=max_depth, n_jobs=n_jobs, verbosity=verbosity)
                xgbc.fit(training_features, training_labels[label])
                metrics = xgbc.evaluate(test_features, test_labels[label])

                #xgbc.model.save_model(f"xgbc_{label}_est{n_estimators}_dph{max_depth}.pkl")

            metrics_list.append({
                "model": f"xgbc_{label}_est{n_estimators}_dph{max_depth}",
                "brier": metrics["brier"],
                "log_loss": metrics["log_loss"],
                "roc_auc": metrics["roc_auc"],
                "accuracy": metrics["accuracy"]
            })       

# Avaliação

In [None]:
# plot each metric in a bar chart
metrics_df = pd.DataFrame(metrics_list)

scores_metrics = metrics_df[metrics_df.model.str.contains('scores')].reset_index(drop=True)
concedes_metrics = metrics_df[metrics_df.model.str.contains('concedes')].reset_index(drop=True)

scores_metrics


In [None]:
def plot_metrics(df, label, metric):
    plt.figure(figsize=(14, 7))

    color = "blue" if label=="scores" else "orange"

    sns.barplot(x="model", y=metric, data=df, color=color, label=label)
    plt.xticks(rotation=90)
    plt.title(f'Comparison of {metric} between Models')


    plt.yscale('log', base=10)

    plt.legend()
    plt.show()

In [None]:
metrics = ["brier", "log_loss", "roc_auc", "accuracy"]

for metric in metrics:
    plot_metrics(scores_metrics, "scores", metric)
    plot_metrics(concedes_metrics, "concedes", metric)

In [None]:
metrics = ["brier", "log_loss", "roc_auc", "accuracy"]

for metric in metrics:
    ascending = True if metric=="brier" or metric=="log_loss" else False
    top3 = scores_metrics.sort_values(by=metric).reset_index(drop=True).head(3)
    print(f"Top 3 models for {metric}")
    print(top3['model'])

In [None]:
metrics = ["brier", "log_loss", "roc_auc", "accuracy"]

for metric in metrics:
    ascending = True if metric=="brier" or metric=="log_loss" else False
    top3 = concedes_metrics.sort_values(by=metric).reset_index(drop=True).head(3)
    print(f"Top 3 models for {metric}")
    print(top3['model'])

In [None]:
selected_params =  {
        "n_estimators": 100,
        "max_depth": 9,
        "n_jobs": -3,
        "verbosity": 1
}

models = {}
features = training_features.columns

features_df = pd.DataFrame()

for label in test_labels.columns:
        model_path = f"./models/xgbc_{label}_est{n_estimators}_dph{max_depth}.pkl"
        xgbc = XGBoostClassifier(n_estimators=n_estimators, max_depth=max_depth, n_jobs=n_jobs, verbosity=verbosity)
        xgbc.model.load_model(model_path)

        models[label] = xgbc

        metrics = xgbc.evaluate(test_features, test_labels[label])
        print(f"Metrics for {label}")
        print(metrics)

        print("##################################################################################")

        features_importance =  xgbc.model.feature_importances_
        
        feature_importance_df = pd.DataFrame({'label':label, 'feature': features, 'importance': features_importance})
        features_df = pd.concat([features_df, feature_importance_df])

In [None]:
scores_df = features_df[features_df['label'] == 'scores']
concedes_df = features_df[features_df['label'] == 'concedes']

scores_df = scores_df.sort_values(by='importance', ascending=False)
concedes_df = concedes_df.sort_values(by='importance', ascending=False)

def plot_feature_importance(df, title):
    plt.figure(figsize=(10, 6))
    sns.barplot(x='importance', y='feature', data=df)
    plt.title(title)
    plt.xlabel('Importância')
    plt.ylabel('Feature')
    plt.show()

plot_feature_importance(scores_df.head(10), 'Importância das Features - Scores')
plot_feature_importance(concedes_df.head(10), 'Importância das Features - Concedes')

# Calculo do VAEP

In [None]:

def calculateVaep(models, actions):
    
    predictions = {}
    for model in tqdm(['scores', 'concedes'], desc="Predicting scores and concedes"):
        predictions[model] = models[model].predict_proba(test_features)[:, 0]
    
    predictions = pd.DataFrame(predictions)
    predictions = vaepformula.value(actions, predictions['scores'], predictions['concedes'])
    return predictions

In [None]:
preds = calculateVaep(models= models, actions=test_df)

actions_vaep = pd.concat([test_df, preds], axis=1).reset_index(drop=True)
actions_vaep = actions_vaep.sort_values(by=['game_id','period_id','time_seconds'])
actions_vaep

# Explorando os resultados

In [None]:
actions_vaep.sort_values(by='vaep_value', ascending=False).head(10)

In [None]:
actions_vaep.sort_values(by='offensive_value', ascending=False).head(10)

In [None]:
actions_vaep.sort_values(by='offensive_value', ascending=True).head(10)

In [None]:
actions_vaep.sort_values(by='defensive_value', ascending=False).head(10)

In [None]:
actions_vaep.sort_values(by='defensive_value', ascending=True).head(10)

In [None]:
df_vaep = actions_vaep[['game_id','type_name','team_id', 'player_id','vaep_value', 'offensive_value', 'defensive_value']]
df_vaep

In [None]:
vaep_players = df_vaep.groupby(['player_id','team_id']).sum().reset_index()

vaep_players = vaep_players.merge(players, how='left', on='player_id', suffixes=('', '_player'))
vaep_players = vaep_players.drop(columns=['team_id_player'])

game_counts = df_vaep.groupby('player_id')['game_id'].nunique().reset_index()
game_counts = game_counts.rename(columns={'game_id': 'games_played'})

vaep_players = pd.merge(vaep_players, game_counts, on='player_id')

vaep_players['vaep_per_game'] = vaep_players['vaep_value'] / vaep_players['games_played']

vaep_players = vaep_players.sort_values(by=['vaep_per_game','vaep_value'], ascending=False).reset_index(drop=True)
vaep_players[vaep_players['games_played'] > 5].head(10)

In [None]:
vaep_teams = df_vaep.groupby(['team_id']).sum().reset_index()

vaep_teams = vaep_teams.merge(teams, how='left', on='team_id', suffixes=('', '_team'))
#vaep_teams = vaep_teams.drop(columns=['team_id_team'])

game_counts = df_vaep.groupby('team_id')['game_id'].nunique().reset_index()
game_counts = game_counts.rename(columns={'game_id': 'games_played'})

vaep_teams = pd.merge(vaep_teams, game_counts, on='team_id')

vaep_teams['vaep_per_game'] = vaep_teams['vaep_value'] / vaep_teams['games_played']

vaep_teams = vaep_teams.sort_values(by=['vaep_per_game','vaep_value'], ascending=False).reset_index(drop=True)
vaep_teams[vaep_teams['games_played'] > 5].head(10)

In [None]:
vaep_actions = df_vaep.groupby(['type_name']).sum().reset_index()

action_counts = df_vaep.groupby('type_name').size().reset_index(name='action_count')
vaep_actions = pd.merge(vaep_actions, action_counts, on='type_name')

vaep_actions['vaep_per_game'] = vaep_actions['vaep_value'] / vaep_actions['action_count']

vaep_actions = vaep_actions.sort_values(by=['vaep_per_game','vaep_value'], ascending=False).reset_index(drop=True)
vaep_actions[vaep_teams['games_played'] > 5].head(10)