In [1]:
import pandas as pd
from mplsoccer import Sbopen

# Single Sbopen parser instance; the loader functions in this notebook read it as a global
parser = Sbopen()

# Combine all matches in one dataframe

In [31]:
def get_all_match_ids(competition_id, season_id):
    """
    Return the id of every match in one season of a competition.

    Parameters
    ----------
    competition_id: int
        Identifier of the competition to look up
    season_id: int
        Identifier of the season within that competition

    Returns
    -------
    match_ids: list
        The values of the 'match_id' column of the match table, one per match
    """
    # Fetch the match table through the shared parser and keep only the ids
    return (
        parser
        .match(competition_id=competition_id, season_id=season_id)['match_id']
        .tolist()
    )

In [32]:
# Example usage: collect the match ids for competition 55, season 282
# NOTE(review): presumably UEFA Euro 2024 in the StatsBomb open data — confirm
match_ids = get_all_match_ids(competition_id=55, season_id=282)

In [22]:
# Peek at the first ten match ids
match_ids[0:10]

[3942819,
 3943043,
 3942752,
 3942382,
 3942349,
 3930180,
 3930171,
 3942227,
 3942226,
 3938645]

In [33]:

def load_all_events(match_ids):
    """
    Load the events of every given match and stack them into one dataframe.

    Parameters
    ----------
    match_ids: list
        Ids of the matches to load (any iterable of ids works)

    Returns
    -------
    df_all_events: pd.DataFrame
        The event tables of all requested matches concatenated row-wise, with
        a fresh 0..n-1 index. When match_ids is empty, an empty dataframe
        (no rows, no columns) is returned instead of raising.
    """
    # Only the first element of parser.event's result is kept per match;
    # it is used as that match's event table
    all_events = [parser.event(match_id)[0] for match_id in match_ids]

    # pd.concat raises "No objects to concatenate" on an empty list, so an
    # empty request is answered with an empty dataframe
    if not all_events:
        return pd.DataFrame()

    return pd.concat(all_events, ignore_index=True)

In [25]:
# Example usage: load the events of the first five matches only
df_all_events = load_all_events(match_ids[0:5])

In [28]:
# Summarise the columns, non-null counts and dtypes of the combined events
df_all_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18730 entries, 0 to 18729
Data columns (total 81 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              18730 non-null  object 
 1   index                           18730 non-null  int64  
 2   period                          18730 non-null  int64  
 3   timestamp                       18730 non-null  object 
 4   minute                          18730 non-null  int64  
 5   second                          18730 non-null  int64  
 6   possession                      18730 non-null  int64  
 7   duration                        13473 non-null  float64
 8   match_id                        18730 non-null  int64  
 9   type_id                         18730 non-null  int64  
 10  type_name                       18730 non-null  object 
 11  possession_team_id              18730 non-null  int64  
 12  possession_team_name            

# Calculate xG from shots

In [13]:
def calculate_xg_shots(df):
    """
    Sum the xG of every player's shots.

    Parameters
    ----------
    df: pd.DataFrame
        Statsbomb event data (one or more matches) with at least the columns
        'type_name', 'player_id' and 'shot_statsbomb_xg'

    Returns
    -------
    df_xg_shots: pd.DataFrame
        One row per player with the columns 'player_id' and 'shots_xg',
        ordered from the highest to the lowest total xG
    """
    # Keep only the shot events and the two columns the aggregation needs
    is_shot = df['type_name'] == 'Shot'
    shots = df.loc[is_shot, ['player_id', 'shot_statsbomb_xg']]

    # Total xG per player, highest first, published as 'shots_xg'
    xg_per_player = (
        shots
        .groupby('player_id')['shot_statsbomb_xg']
        .sum()
        .sort_values(ascending=False)
        .rename('shots_xg')
        .reset_index()
    )

    return xg_per_player

In [14]:
# Usage example: top ten players by summed shot xG
# NOTE(review): the saved output below matches the full-tournament figures shown
# at the end of the notebook, while the earlier example cell loads only five
# matches into df_all_events — looks like stale kernel state; re-run top to bottom
df_example = calculate_xg_shots(df_all_events)
df_example.head(10)

Unnamed: 0,player_id,shots_xg
0,5207.0,4.536313
1,8966.0,3.778439
2,10955.0,3.053848
3,3009.0,2.506847
4,3193.0,2.31207
5,33234.0,2.025069
6,5204.0,2.005059
7,2988.0,1.951101
8,3289.0,1.751173
9,316046.0,1.587233


# Get player info

The events only contain each player's full name, but we want the short name (nickname) in our visualisations, so we take it from the lineup data.

In [40]:
# Inspect the lineup table of a single match (3942819, the first id in match_ids)
# to see which player columns are available
test = parser.lineup(3942819)
test.head()

Unnamed: 0,player_id,player_name,player_nickname,jersey_number,match_id,team_id,team_name,country_id,country_name
0,2988,Memphis Depay,Memphis Depay,10,3942819,941,Netherlands,160,Netherlands
1,3306,Nathan Aké,Nathan Aké,5,3942819,941,Netherlands,160,Netherlands
2,3311,Daley Blind,Daley Blind,17,3942819,941,Netherlands,160,Netherlands
3,3567,Georginio Wijnaldum,Georginio Wijnaldum,8,3942819,941,Netherlands,160,Netherlands
4,3669,Virgil van Dijk,Virgil van Dijk,4,3942819,941,Netherlands,160,Netherlands


In [41]:
def get_player_info(match_ids):
    """
    Get player info (id, full name, short name and team) from the Statsbomb lineups data.

    Parameters
    ----------
    match_ids: list
        Ids of the matches whose lineups should be read (any iterable of ids works)

    Returns
    -------
    df_unique_players: pd.DataFrame
        One row per player_id with the columns 'player_id', 'player_name',
        'player_nickname' and 'team_name'. The first occurrence of each player
        is kept and the index is not reset after dropping duplicates. When
        match_ids is empty, an empty dataframe with these four columns is
        returned instead of raising.
    """
    columns = ['player_id', 'player_name', 'player_nickname', 'team_name']

    # Collect the relevant lineup columns of every match
    all_players = [parser.lineup(match_id)[columns] for match_id in match_ids]

    # pd.concat raises "No objects to concatenate" on an empty list, so an
    # empty request is answered with an empty, correctly-shaped dataframe
    if not all_players:
        return pd.DataFrame(columns=columns)

    df_all_players = pd.concat(all_players, ignore_index=True)

    # A player appears once per match lineup; keep a single row per player_id
    df_unique_players = df_all_players.drop_duplicates(subset=['player_id'])

    return df_unique_players

In [43]:
# Example usage: unique players across all matches in match_ids
players = get_player_info(match_ids)

In [46]:
# Show the first few players whose team_name is Spain
players[players["team_name"] == "Spain"].head()

Unnamed: 0,player_id,player_name,player_nickname,team_name
52,3042,Mikel Merino Zazón,Mikel Merino,Spain
53,3265,José Luis Sanmartín Mato,Joselu,Spain
54,3477,Álvaro Borja Morata Martín,Álvaro Morata,Spain
55,4127,David Raya Martin,David Raya,Spain
56,4353,Aymeric Laporte,Aymeric Laporte,Spain


# Calculate player stats

Now we'll load every game in the tournament and combine the functions defined above into a single table of player stats.

In [None]:
def create_player_stats(competition_id, season_id):
    """
    Build the player stats table used for the radar plot.

    Parameters
    ----------
    competition_id: int
        The id of the competition
    season_id: int
        The id of the season

    Returns
    -------
    df_player_stats: pd.DataFrame
        The player info table (one row per player) with the 'shots_xg'
        column joined on. Players without a row in the shot xG table keep
        a missing value there because the join is a left merge.
    """
    # Every match of the requested competition season
    match_ids = get_all_match_ids(competition_id, season_id)

    # Raw inputs: all events, and the players taken from the lineups
    events = load_all_events(match_ids)
    players = get_player_info(match_ids)

    # Per-player stat computed from the events
    shots_xg = calculate_xg_shots(events)

    # merge returns a new frame, so the player info table itself is left untouched
    return players.merge(shots_xg, on='player_id', how='left')

In [50]:
# Build the player stats table for competition 55, season 282 (the same ids as the earlier example)
df_player_stats = create_player_stats(competition_id=55, season_id=282)

In [51]:
# Ten players with the highest total shot xG
df_player_stats.sort_values(by='shots_xg', ascending=False).head(10)

Unnamed: 0,player_id,player_name,player_nickname,team_name,shots_xg
129,5207,Cristiano Ronaldo dos Santos Aveiro,Cristiano Ronaldo,Portugal,4.536313
242,8966,Kai Havertz,Kai Havertz,Germany,3.778439
42,10955,Harry Kane,Harry Kane,England,3.053848
77,3009,Kylian Mbappé Lottin,Kylian Mbappé,France,2.506847
125,3193,Bernardo Mota Veiga de Carvalho e Silva,Bernardo Silva,Portugal,2.31207
392,33234,Georges Mikautadze,Georges Mikautadze,Georgia,2.025069
126,5204,Bruno Miguel Borges Fernandes,Bruno Fernandes,Portugal,2.005059
0,2988,Memphis Depay,Memphis Depay,Netherlands,1.951101
258,3289,Romelu Lukaku Menama,Romelu Lukaku,Belgium,1.751173
75,316046,Lamine Yamal Nasraoui Ebana,Lamine Yamal,Spain,1.587233
