In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mplsoccer import Pitch, Sbopen, VerticalPitch
from matplotlib.text import Text

In [54]:
# Get Euros 2024 games
parser = Sbopen()
df_games = parser.match(competition_id=55, season_id=282)
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 52 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   match_id                         51 non-null     int64         
 1   match_date                       51 non-null     datetime64[ns]
 2   kick_off                         51 non-null     datetime64[ns]
 3   home_score                       51 non-null     int64         
 4   away_score                       51 non-null     int64         
 5   match_status                     51 non-null     object        
 6   match_status_360                 51 non-null     object        
 7   last_updated                     51 non-null     datetime64[ns]
 8   last_updated_360                 51 non-null     datetime64[ns]
 9   match_week                       51 non-null     int64         
 10  competition_id                   51 non-null     int64         


In [81]:
def get_dribbles_and_playing_time_df(match_id): 
    df = parser.event(match_id)[0]

    # Get dribbles
    dribbles_mask = (df['type_name'] == 'Dribble')
    df_dribbles = df.loc[dribbles_mask]

    # Get players who made dribbles and init playing time df
    df_playing_time = df_dribbles[['match_id', 'player_name', 'team_name']].drop_duplicates()
    df_playing_time['playing_time'] = 0

    # Get first and second half extra time
    df_end = df[df['type_name'] == 'Half End']
    end_first_half = df_end.iloc[0]['minute'] * 60 + df_end.iloc[0]['second']
    first_half_extra_time = end_first_half - (45 * 60)
    end_second_half = df_end.iloc[-1]['minute'] * 60 + df_end.iloc[-1]['second']
    second_half_extra_time = end_second_half - (90 * 60)
    full_game_time = (90 * 60) + first_half_extra_time + second_half_extra_time

    # Get all substitutions
    subs_mask = (df['type_name'] == 'Substitution')
    df_subs = df.loc[subs_mask, ['type_name', 'period', 'minute', 'second', 'player_name', 'team_name', 'sub_type_name', 'substitution_replacement_name']]

    # Update playing time for players and subs
    for i, player in df_playing_time.iterrows():
        player_name = player['player_name']
        playing_time = 0
        
        # Case 1: Player played the full game (not in any substitution list)
        if player_name not in pd.concat([df_subs['player_name'], df_subs['substitution_replacement_name']]).values:
            playing_time = full_game_time
            
        # Case 2: Player was both subbed in and out
        elif (player_name in df_subs['player_name'].values) and (player_name in df_subs['substitution_replacement_name'].values):
            player_in = df_subs[df_subs['substitution_replacement_name'] == player_name]
            player_out = df_subs[df_subs['player_name'] == player_name]

            # Player was subbed in and out in the same half
            if player_in['period'].iloc[0] == player_out['period'].iloc[0]:
                playing_time = player_out['minute'].iloc[0] * 60 + player_out['second'].iloc[0] - (player_in['minute'].iloc[0] * 60 + player_in['second'].iloc[0])
            
            # Player was subbed in in the first half and out in the second half
            elif player_in['period'].iloc[0] == 1:
                playing_time = (player_out['minute'].iloc[0] * 60 + player_out['second'].iloc[0]) - (player_in['minute'].iloc[0] * 60 + player_in['second'].iloc[0]) + first_half_extra_time
            
        # Case 3: Player was subbed out
        elif player_name in df_subs['player_name'].values:
            player_out = df_subs[df_subs['player_name'] == player_name]
            # Player was subbed out in the first half
            if player_out['period'].iloc[0] == 1:
                playing_time = player_out['minute'].iloc[0] * 60 + player_out['second'].iloc[0]
            
            # Player was subbed out in the second half
            elif player_out['period'].iloc[0] == 2:
                playing_time = first_half_extra_time + (player_out['minute'].iloc[0] * 60 + player_out['second'].iloc[0])


        # Case 4: Player was subbed in and finished the game
        elif player_name in df_subs['substitution_replacement_name'].values:
            player_in = df_subs[df_subs['substitution_replacement_name'] == player_name]

            # Player was subbed in in the first half
            if player_in['period'].iloc[0] == 1:
                playing_time = full_game_time - (player_in['minute'].iloc[0] * 60 + player_in['second'].iloc[0])

            # Player was subbed in in the second half
            elif player_in['period'].iloc[0] == 2:
                playing_time = full_game_time - first_half_extra_time - (player_in['minute'].iloc[0] * 60 + player_in['second'].iloc[0])

        df_playing_time.loc[
            (df_playing_time['match_id'] == match_id) & 
            (df_playing_time['player_name'] == player_name), 
            'playing_time'
        ] = playing_time

    return df_dribbles.reset_index(drop=True), df_playing_time.reset_index(drop=True)

In [84]:
match_id = 3942819
test = get_dribbles_and_playing_time_df(match_id)
test[1].head(20)

Unnamed: 0,match_id,player_name,team_name,playing_time
0,3942819,Kyle Walker,England,5899
1,3942819,Phil Foden,England,4958
2,3942819,Jude Bellingham,England,5899
3,3942819,Cody Mathès Gakpo,Netherlands,5899
4,3942819,Denzel Dumfries,Netherlands,5752
5,3942819,Xavi Simons,Netherlands,5750
6,3942819,Nathan Aké,Netherlands,5899
7,3942819,Harry Kane,England,4971
8,3942819,Wout Weghorst,Netherlands,3009
9,3942819,Declan Rice,England,5899


In [89]:
# Create empty dribbles dataframe
driblles_columns = ['match_id', 'period', 'timestamp', 'minute', 'second', 'possession', 'duration', 'match_id', 'type_name', 'outcome_name', 'team_name', 'player_name', 'x', 'y']
df_dribbles = pd.DataFrame(columns=driblles_columns)

# Create empty players dataframe to collect player names and minutes played
players_columns = ['match_id', 'player_name', 'team_name', 'playing_time']
df_players = pd.DataFrame(columns=players_columns)

# Loop through matches
for match_id in df_games['match_id']:
    dribbles_df, playing_time_df = get_dribbles_and_playing_time_df(match_id)

    df_dribbles = pd.concat([df_dribbles, dribbles_df[driblles_columns]], ignore_index=True)
    df_players = pd.concat([df_players, playing_time_df[players_columns]], ignore_index=True)

  df_dribbles = pd.concat([df_dribbles, dribbles_df[driblles_columns]], ignore_index=True)


In [92]:
df_players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 645 entries, 0 to 644
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   match_id      645 non-null    object
 1   player_name   645 non-null    object
 2   team_name     645 non-null    object
 3   playing_time  645 non-null    object
dtypes: object(4)
memory usage: 20.3+ KB


In [93]:
df_dribbles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1280 entries, 0 to 1279
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   match_id      1280 non-null   object 
 1   period        1280 non-null   object 
 2   timestamp     1280 non-null   object 
 3   minute        1280 non-null   object 
 4   second        1280 non-null   object 
 5   possession    1280 non-null   object 
 6   duration      1280 non-null   float64
 7   match_id      1280 non-null   object 
 8   type_name     1280 non-null   object 
 9   outcome_name  1280 non-null   object 
 10  team_name     1280 non-null   object 
 11  player_name   1280 non-null   object 
 12  x             1280 non-null   float64
 13  y             1280 non-null   float64
dtypes: float64(3), object(11)
memory usage: 140.1+ KB
