In [1]:
import pandas as pd
from mplsoccer import Sbopen

# Single parser instance; the loader functions in this notebook read it as a global
parser = Sbopen()

# Combine all matches in one dataframe

In [31]:
def get_all_match_ids(competition_id, season_id):
    """
    Look up the ids of every match in one season of a competition.

    Parameters
    ----------
    competition_id: int
        The id of the competition
    season_id: int
        The id of the season

    Returns
    -------
    match_ids: list
        The values of the ``match_id`` column of the match table returned by
        the parser, in the order the parser returns them
    """
    # The match table is only needed for its id column, so it is not kept around
    return parser.match(
        competition_id=competition_id,
        season_id=season_id,
    )['match_id'].tolist()

In [32]:
# Example usage
# NOTE(review): 55 / 282 are presumably the StatsBomb ids for UEFA Euro 2024 — confirm against the competitions table
match_ids = get_all_match_ids(competition_id=55, season_id=282)

In [22]:
# Peek at the first ten ids to sanity-check the lookup
match_ids[0:10]

[3942819,
 3943043,
 3942752,
 3942382,
 3942349,
 3930180,
 3930171,
 3942227,
 3942226,
 3938645]

In [33]:

def load_all_events(match_ids):
    """
    Load the events of every given match and stack them into one dataframe.

    Parameters
    ----------
    match_ids: list
        A list of all match ids for a given competition and season

    Returns
    -------
    df_all_events: pd.DataFrame
        A dataframe with all Statsbomb event data for all matches of a given competition and season,
        re-indexed from 0. An empty dataframe (no rows, no columns) when ``match_ids`` is empty.
    """

    # Only the first element of what parser.event returns (the events table) is used;
    # the remaining elements are ignored here.
    match_events = [parser.event(match_id)[0] for match_id in match_ids]

    # pd.concat raises "No objects to concatenate" on an empty list,
    # so "no matches" is mapped to "no events" explicitly.
    if not match_events:
        return pd.DataFrame()

    # ignore_index avoids duplicate row labels across matches
    return pd.concat(match_events, ignore_index=True)

In [83]:
# Example usage load all events
# One parser call is made per match id, so this cell may be slow for a full tournament
df_all_events = load_all_events(match_ids)

In [84]:
# Overview of the combined events: row count, column names, non-null counts and dtypes
df_all_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187858 entries, 0 to 187857
Data columns (total 87 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   id                              187858 non-null  object 
 1   index                           187858 non-null  int64  
 2   period                          187858 non-null  int64  
 3   timestamp                       187858 non-null  object 
 4   minute                          187858 non-null  int64  
 5   second                          187858 non-null  int64  
 6   possession                      187858 non-null  int64  
 7   duration                        136240 non-null  float64
 8   match_id                        187858 non-null  int64  
 9   type_id                         187858 non-null  int64  
 10  type_name                       187858 non-null  object 
 11  possession_team_id              187858 non-null  int64  
 12  possession_team_

# Calculate goals and assists

In [122]:
def calculate_goals_assists(df):
    """
    Calculate the number of goals and assists for each player.

    A goal is any event whose ``outcome_id`` is 97 and whose ``period`` is not 5,
    i.e. goals scored in a penalty shoot-out are left out while penalties taken
    during the match still count. Own goals are not counted (per the original
    author's note they carry no ``outcome_id``). An assist is any event whose
    ``pass_goal_assist`` is True.

    Parameters
    ----------
    df: pd.DataFrame
        A dataframe with all Statsbomb event data for all matches of a given competition and season.
        Must contain ``player_id``, ``outcome_id`` and ``period``; ``pass_goal_assist`` is
        optional and, when absent, every player gets 0 assists.

    Returns
    -------
    df: pd.DataFrame
        A dataframe with the number of goals and assists for each player
        (columns ``player_id``, ``goals``, ``assists``). Only players with at
        least one goal or assist are listed.
    """
    goal_outcome_id = 97        # outcome id that marks a goal
    shootout_period = 5         # period number of the penalty shoot-out

    # Count goals per player (events without a player_id are dropped by groupby)
    goals_mask = (df["outcome_id"] == goal_outcome_id) & (df["period"] != shootout_period)
    df_goals_count = (
        df.loc[goals_mask, ["player_id"]]
        .groupby("player_id")
        .size()
        .reset_index(name="goals")
    )

    # The assist flag column can be missing when the loaded events contain no assist
    # at all; treat that as "no assists" instead of failing with a KeyError.
    if "pass_goal_assist" in df.columns:
        assists_mask = (df["pass_goal_assist"] == True)
    else:
        assists_mask = pd.Series(False, index=df.index)

    # Count assists per player
    df_assists_count = (
        df.loc[assists_mask, ["player_id"]]
        .groupby("player_id")
        .size()
        .reset_index(name="assists")
    )

    # Outer merge keeps players that only scored or only assisted
    df_goals_assists = pd.merge(df_goals_count, df_assists_count, on="player_id", how="outer")

    # The outer merge leaves NaN (and float dtype) where a player has no entry on one side
    counts = {"goals": 0, "assists": 0}
    df_goals_assists = df_goals_assists.fillna(counts).astype({"goals": int, "assists": int})

    return df_goals_assists

In [123]:
# Example usage: goals and assists per player over all loaded events
df_example = calculate_goals_assists(df_all_events)

In [124]:
# Top ten scorers; players tied on goals appear in no particular order
df_example.sort_values(by="goals", ascending=False).head(10)

Unnamed: 0,player_id,goals,assists
98,33234.0,3,1
62,10955.0,3,0
74,16532.0,3,2
78,20750.0,3,1
89,28032.0,3,0
103,39565.0,3,0
24,5545.0,2,0
54,8966.0,2,1
70,15582.0,2,0
73,16344.0,2,0


# Calculate shots and total xG

In [156]:
def calculate_shots_xg(df):
    """
    Count each player's shots and add up the xG of those shots.

    Parameters
    ----------
    df: pd.DataFrame
        Dataframe with StatsBomb event data (needs the columns ``player_id``,
        ``type_name`` and ``shot_statsbomb_xg``).

    Returns
    -------
    df_xg_shots: pd.DataFrame
        One row per player that took at least one shot, with the columns
        ``player_id``, ``shots`` (number of shots) and ``shots_xg`` (summed xG).
    """

    # Keep only shot events and the columns needed for the aggregation
    shot_events = df.loc[df['type_name'] == 'Shot', ['player_id', 'type_name', 'shot_statsbomb_xg']]

    # Per player: number of shot rows and total xG, then give both columns their output names
    return (
        shot_events
        .groupby('player_id')
        .agg({'type_name': 'count', 'shot_statsbomb_xg': 'sum'})
        .rename(columns={'type_name': 'shots', 'shot_statsbomb_xg': 'shots_xg'})
        .reset_index()
    )

In [157]:
# Usage example
# NOTE: this rebinds df_example, which held the goals/assists table in an earlier cell
df_example = calculate_shots_xg(df_all_events)
df_example.head(10)

Unnamed: 0,player_id,shots,shots_xg
0,2954.0,4,0.153319
1,2972.0,12,1.178654
2,2988.0,18,1.951101
3,3009.0,24,2.506847
4,3026.0,3,0.26204
5,3042.0,3,0.326048
6,3043.0,12,0.835043
7,3053.0,7,0.282124
8,3076.0,3,0.323953
9,3077.0,1,0.104913


# Get player info

The events store each player's full name, but we want the short name in our visualisations, so we take it from the lineups data.

In [40]:
# Get lineups info for a game
# 3942819 is the first id in the match_ids preview shown earlier
test = parser.lineup(3942819)
test.head()

Unnamed: 0,player_id,player_name,player_nickname,jersey_number,match_id,team_id,team_name,country_id,country_name
0,2988,Memphis Depay,Memphis Depay,10,3942819,941,Netherlands,160,Netherlands
1,3306,Nathan Aké,Nathan Aké,5,3942819,941,Netherlands,160,Netherlands
2,3311,Daley Blind,Daley Blind,17,3942819,941,Netherlands,160,Netherlands
3,3567,Georginio Wijnaldum,Georginio Wijnaldum,8,3942819,941,Netherlands,160,Netherlands
4,3669,Virgil van Dijk,Virgil van Dijk,4,3942819,941,Netherlands,160,Netherlands


In [168]:
def get_player_info(match_ids):
    """
    Get player info (id, full name, short name and team) from the Statsbomb lineups data.

    Parameters
    ----------
    match_ids: list
        A list of all match ids for a given competition and season

    Returns
    -------
    df_unique_players: pd.DataFrame
        One row per ``player_id`` with the columns ``player_id``, ``player_name``,
        ``player_short_name`` and ``team_name``. When a player appears in several
        lineups, the row from the first match in ``match_ids`` is kept. The row
        labels are those left over after dropping duplicates (not re-numbered).
        An empty dataframe with the same columns when ``match_ids`` is empty.
    """
    lineup_columns = ['player_id', 'player_name', 'player_nickname', 'team_name']
    output_names = {'player_nickname': 'player_short_name'}

    # Collect the relevant lineup columns of every match
    lineups = [parser.lineup(match_id)[lineup_columns] for match_id in match_ids]

    # pd.concat raises "No objects to concatenate" on an empty list,
    # so return an empty table with the documented columns instead.
    if not lineups:
        return pd.DataFrame(columns=[output_names.get(col, col) for col in lineup_columns])

    # Stack all lineups and expose the nickname under its output name
    df_all_players = pd.concat(lineups, ignore_index=True).rename(columns=output_names)

    # A player shows up once per match played; keep the first occurrence only
    df_unique_players = df_all_players.drop_duplicates(subset=['player_id'])

    return df_unique_players

In [169]:
# Example usage: one row per player across the lineups of all matches in match_ids
players = get_player_info(match_ids)

In [170]:
# Spot-check one team to compare full names with short names
players[players["team_name"] == "Spain"].head()

Unnamed: 0,player_id,player_name,player_short_name,team_name
52,3042,Mikel Merino Zazón,Mikel Merino,Spain
53,3265,José Luis Sanmartín Mato,Joselu,Spain
54,3477,Álvaro Borja Morata Martín,Álvaro Morata,Spain
55,4127,David Raya Martin,David Raya,Spain
56,4353,Aymeric Laporte,Aymeric Laporte,Spain


# Calculate player stats

Now we'll loop through all games in the tournament and combine the previous functions to build a full player stats table.

In [171]:
def create_player_stats(competition_id, season_id):
    """
    Create a dataframe with all player stats needed for the radar plot.

    Loads the matches, events and lineups of the given competition season and
    joins the per-player goals, assists, shots and summed xG onto the player
    table. Players without any recorded goal, assist or shot get 0 for those
    stats.

    Parameters
    ----------
    competition_id: int
        The id of the competition
    season_id: int
        The id of the season

    Returns
    -------
    df_player_stats: pd.DataFrame
        One row per player with the columns ``player_id``, ``player_name``,
        ``player_short_name``, ``team_name``, ``goals``, ``assists``, ``shots``
        (integers) and ``shots_xg`` (float).
    """

    # Get all match ids
    match_ids = get_all_match_ids(competition_id, season_id)

    # Combine all events for all matches
    df_all_events = load_all_events(match_ids)

    # Get all players in the tournament
    df_player_info = get_player_info(match_ids)

    # Per-player stats derived from the events
    df_goals_assists = calculate_goals_assists(df_all_events)
    df_xg_shots = calculate_shots_xg(df_all_events)

    # Left merges keep every player from the lineups, also those without any stat
    # (merge returns a new frame, so df_player_info itself is not modified)
    df_player_stats = (
        df_player_info
            .merge(df_goals_assists, on='player_id', how='left')
            .merge(df_xg_shots, on='player_id', how='left')
    )

    # Fill only the stat columns: a blanket fillna(0) would also write the number 0
    # into a text column (e.g. the short name) if one of its values were missing.
    stat_defaults = {'goals': 0, 'assists': 0, 'shots': 0, 'shots_xg': 0}
    df_player_stats = df_player_stats.fillna(stat_defaults)

    # The left merges turned the count columns into floats; restore integer dtypes
    df_player_stats = df_player_stats.astype({'goals': int, 'assists': int, 'shots': int})

    return df_player_stats

In [172]:
# Build the full stats table for the same competition/season as in the examples above.
# Note: create_player_stats loads matches, events and lineups again itself;
# it does not reuse df_all_events or players from the earlier cells.
df_player_stats = create_player_stats(competition_id=55, season_id=282)

In [173]:
# Check that every column is filled and that the count columns are integers
df_player_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 621 entries, 0 to 620
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   player_id          621 non-null    int64  
 1   player_name        621 non-null    object 
 2   player_short_name  621 non-null    object 
 3   team_name          621 non-null    object 
 4   goals              621 non-null    int64  
 5   assists            621 non-null    int64  
 6   shots              621 non-null    int64  
 7   shots_xg           621 non-null    float64
dtypes: float64(1), int64(4), object(3)
memory usage: 38.9+ KB


In [174]:
# Ten players with the most shots
df_player_stats.sort_values(by='shots', ascending=False).head(10)

Unnamed: 0,player_id,player_name,player_short_name,team_name,goals,assists,shots,shots_xg
129,5207,Cristiano Ronaldo dos Santos Aveiro,Cristiano Ronaldo,Portugal,0,1,25,4.536313
77,3009,Kylian Mbappé Lottin,Kylian Mbappé,France,1,1,24,2.506847
242,8966,Kai Havertz,Kai Havertz,Germany,2,1,19,3.778439
0,2988,Memphis Depay,Memphis Depay,Netherlands,1,1,18,1.951101
42,10955,Harry Kane,Harry Kane,England,3,0,18,3.053848
75,316046,Lamine Yamal Nasraoui Ebana,Lamine Yamal,Spain,1,4,18,1.587233
66,16532,Daniel Olmo Carvajal,Daniel Olmo,Spain,3,2,17,1.378103
59,6655,Fabián Ruiz Peña,Fabián Ruiz,Spain,2,2,17,1.154685
224,45190,Dan Ndoye,Dan Ndoye,Switzerland,1,0,14,1.240358
54,3477,Álvaro Borja Morata Martín,Álvaro Morata,Spain,1,0,14,1.566969


In [175]:
# Look up a single player. player_name and player_short_name can be spelled differently
# (here "Jeremy Doku" vs "Jérémy Doku"), so match on the exact spelling of the column used.
df_player_stats[df_player_stats["player_name"] == "Jeremy Doku"]

Unnamed: 0,player_id,player_name,player_short_name,team_name,goals,assists,shots,shots_xg
267,23650,Jeremy Doku,Jérémy Doku,Belgium,0,0,5,0.255422
