In [1]:
# statsbombpy is a Python package that provides access to the StatsBomb API.
# The StatsBomb API is a free API that provides access to football data.
# It can be installed with: pip install statsbombpy
import pandas as pd
import numpy as np
from statsbombpy import sb
# Improve output visibility (silence the auth warning, show all columns)
import warnings
from statsbombpy.api_client import NoAuthWarning
# Stop the no auth warning
warnings.filterwarnings("ignore", category=NoAuthWarning)
# show all pd columns
pd.set_option('display.max_columns', None)

In [2]:
# Build the competition list in one chain:
#   1. keep only men's competitions,
#   2. sort by season_name (descending) and keep the first 27 rows,
#   3. drop the 'Indian Super league' rows,
#   4. renumber the index from 0.
competitions = (
    sb.competitions()
    .loc[lambda frame: frame['competition_gender'] == 'male']
    .sort_values('season_name', ascending=False)
    .head(27)
    .loc[lambda frame: frame['competition_name'] != 'Indian Super league']
    .reset_index(drop=True)
)

In [3]:
# Columns that are not needed downstream.
cols_c = [
    'competition_youth',
    'match_updated',
    'competition_gender',
    'match_updated_360',
    'match_available_360',
    'match_available',
]

# Drop the columns listed in cols_c.
competitions = competitions.drop(columns=cols_c)

In [4]:
# Preview the first rows of the competitions table.
competitions.head()

Unnamed: 0,competition_id,season_id,country_name,competition_name,competition_international,season_name
0,223,282,South America,Copa America,True,2024
1,55,282,Europe,UEFA Euro,True,2024
2,9,281,Germany,1. Bundesliga,False,2023/2024
3,44,107,United States of America,Major League Soccer,False,2023
4,1267,107,Africa,African Cup of Nations,True,2023


In [5]:
# Write the competitions table to competitions.csv (index column omitted).
competitions.to_csv('competitions.csv',index=False)

In [5]:
# Download the match list for every (competition_id, season_id) pair in
# `competitions` and stack the results into a single frame.
#
# The per-season frames are collected in a list and concatenated once:
# calling pd.concat inside the loop re-copies everything gathered so far
# on each iteration.
match_frames = []
for c, s in zip(competitions['competition_id'], competitions['season_id']):
    season_matches = sb.matches(competition_id=c, season_id=s)
    # Empty results contribute nothing, so leave them out of the concat.
    if not season_matches.empty:
        match_frames.append(season_matches)

# ignore_index=True yields a fresh 0..n-1 index, which the later cells rely
# on when they look rows up with df_matches.loc[i, ...].
df_matches = (pd.concat(match_frames, ignore_index=True)
              if match_frames else pd.DataFrame())

df_matches.head()

Unnamed: 0,match_id,match_date,kick_off,competition,season,home_team,away_team,home_score,away_score,match_status,match_status_360,last_updated,last_updated_360,match_week,competition_stage,stadium,referee,home_managers,away_managers,data_version,shot_fidelity_version,xy_fidelity_version
0,3943077,2024-07-15,04:15:00.000,South America - Copa America,2024,Argentina,Colombia,1,0,available,unscheduled,2024-07-15T15:50:08.671355,,6,Final,Hard Rock Stadium,Raphael Claus,Lionel Sebastián Scaloni,Néstor Gabriel Lorenzo,1.1.0,2,2
1,3943076,2024-07-14,03:00:00.000,South America - Copa America,2024,Canada,Uruguay,2,2,available,unscheduled,2024-07-15T07:57:02.660641,,6,3rd Place Final,Bank of America Stadium,Alexis Herrera,Jesse Marsch,Marcelo Alberto Bielsa Caldera,1.1.0,2,2
2,3942852,2024-07-11,03:00:00.000,South America - Copa America,2024,Uruguay,Colombia,0,1,available,unscheduled,2024-07-15T18:00:33.653673,,5,Semi-finals,Bank of America Stadium,César Arturo Ramos Palazuelos,Marcelo Alberto Bielsa Caldera,Néstor Gabriel Lorenzo,1.1.0,2,2
3,3942785,2024-07-10,03:00:00.000,South America - Copa America,2024,Argentina,Canada,2,0,available,unscheduled,2024-07-14T15:55:49.351182,,5,Semi-finals,MetLife Stadium,Piero Maza Gomez,Lionel Sebastián Scaloni,Jesse Marsch,1.1.0,2,2
4,3942416,2024-07-07,01:00:00.000,South America - Copa America,2024,Colombia,Panama,5,0,available,unscheduled,2024-07-10T06:49:40.099252,,4,Quarter-finals,State Farm Stadium,Maurizio Mariani,Néstor Gabriel Lorenzo,Thomas Christiansen Tarín,1.1.0,2,2


In [6]:
# drop columns
cols_m = ['match_status','match_status_360','last_updated','last_updated_360',
          'shot_fidelity_version','xy_fidelity_version']
df_matches.drop(cols_m,axis=1,inplace=True)

In [12]:
# Write the matches table to matches.csv (index column omitted).
df_matches.to_csv('matches.csv',index=False)

In [55]:
def concat_lineups(lineups_dict,mid):
    """Stack the per-team lineup frames of one match and tag them with its id.

    Parameters
    ----------
    lineups_dict : dict
        Mapping whose values are DataFrames (one per key). The frames are
        stacked in the dict's iteration order.
    mid : scalar
        Value written to the ``match_id`` column of every row.

    Returns
    -------
    pandas.DataFrame
        All frames stacked vertically (original row indices kept) with a
        ``match_id`` column set to ``mid``. An empty dict yields an empty
        DataFrame with no columns.
    """
    frames = list(lineups_dict.values())
    if not frames:
        # pd.concat raises on an empty list; keep the empty-frame result.
        return pd.DataFrame()
    # Concatenate once and assign match_id once, instead of growing a frame
    # and re-assigning the column on every iteration. assign() returns a new
    # object, so the input frames are never modified.
    return pd.concat(frames).assign(match_id=mid)

In [59]:
# Download the lineups of every match in df_matches and stack them.
#
# Frames are gathered in a list and concatenated once after the loop;
# concatenating inside the loop re-copies all rows collected so far on each
# of the iterations. sb.lineups is also called in one place only.
lineup_frames = []
for i in range(0, len(df_matches)):
    mid = df_matches.loc[i, 'match_id']
    match_lineups = concat_lineups(sb.lineups(mid), mid)
    if not match_lineups.empty:
        lineup_frames.append(match_lineups)
    # Progress marker every 50 matches.
    if i % 50 == 0:
        print(i)

# Row indices are left as returned per match; a later cell resets them.
df_lineups = pd.concat(lineup_frames) if lineup_frames else pd.DataFrame()

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400


In [60]:
# Replace the repeated per-match row labels with a clean 0..n-1 index.
df_lineups = df_lineups.reset_index(drop=True)

In [70]:
# Write the stacked lineups to lineups.csv (index column omitted).
df_lineups.to_csv('lineups.csv',index=False)

In [8]:
# Fixed list of event columns. get_export_events reindexes every match's
# events to exactly these columns (missing ones are filled with NaN), so all
# exported CSV chunks share the same columns in the same order.
std_cols =['50_50', 'ball_receipt_outcome', 'ball_recovery_recovery_failure', 'carry_end_location', 'clearance_aerial_won',
           'counterpress', 'dribble_nutmeg', 'dribble_outcome', 'dribble_overrun', 'duel_outcome', 'duel_type', 'duration',
           'foul_committed_advantage', 'foul_committed_card', 'foul_won_advantage', 'foul_won_defensive', 'goalkeeper_body_part',
           'goalkeeper_end_location', 'goalkeeper_outcome', 'goalkeeper_position', 'goalkeeper_technique', 'goalkeeper_type', 'id',
           'index', 'interception_outcome', 'location', 'match_id', 'minute', 'pass_aerial_won', 'pass_angle', 'pass_assisted_shot_id',
           'pass_backheel', 'pass_body_part', 'pass_cross', 'pass_cut_back', 'pass_deflected', 'pass_end_location', 'pass_goal_assist',
           'pass_height', 'pass_length', 'pass_outcome', 'pass_recipient', 'pass_recipient_id', 'pass_shot_assist', 'pass_switch', 'pass_type',
           'period', 'play_pattern', 'player', 'player_id', 'position', 'possession', 'possession_team', 'possession_team_id', 'related_events',
           'second', 'shot_aerial_won', 'shot_body_part', 'shot_end_location', 'shot_first_time', 'shot_freeze_frame', 'shot_key_pass_id',
           'shot_one_on_one', 'shot_outcome', 'shot_statsbomb_xg', 'shot_technique', 'shot_type', 'substitution_outcome', 'substitution_outcome_id',
           'substitution_replacement', 'substitution_replacement_id', 'tactics', 'team', 'team_id', 'timestamp', 'type', 'under_pressure',
           'competition', 'season', 'block_offensive', 'foul_committed_offensive', 'foul_committed_type', 'injury_stoppage_in_chain',
           'pass_miscommunication', 'pass_technique', 'pass_through_ball', 'block_deflection', 'ball_recovery_offensive', 'foul_committed_penalty',
           'foul_won_penalty', 'shot_open_goal', 'shot_redirect', 'bad_behaviour_card', 'block_save_block', 'shot_deflected', 'miscontrol_aerial_won',
           'clearance_body_part', 'clearance_head', 'clearance_left_foot', 'clearance_right_foot', 'goalkeeper_punched_out', 'off_camera', 'out',
           'pass_inswinging', 'pass_outswinging', 'pass_straight', 'goalkeeper_shot_saved_to_post', 'pass_no_touch', 'shot_saved_to_post',
           'goalkeeper_success_in_play', 'clearance_other', 'player_off_permanent', 'goalkeeper_shot_saved_off_target', 'shot_saved_off_target',
           'shot_follows_dribble', 'dribble_no_touch', 'goalkeeper_lost_out', 'half_start_late_video_start', 'goalkeeper_lost_in_play',
           'goalkeeper_penalty_saved_to_post', 'goalkeeper_saved_to_post', 'goalkeeper_success_out']

In [9]:
def get_export_events(df_matches,std_cols,start_i,end_i,out_dir='/kaggle/working'):
    """Download events for a slice of matches and export them to one CSV file.

    For every row label ``i`` in ``range(start_i, end_i)`` the events of
    ``df_matches.loc[i, 'match_id']`` are fetched with ``sb.events``,
    reindexed to ``std_cols`` (missing columns become NaN) and tagged with
    the match's ``competition`` and ``season``. All matches are then written
    to ``{out_dir}/events_{start_i}_{end_i}.csv``.

    Parameters
    ----------
    df_matches : pandas.DataFrame
        Must have the columns ``match_id``, ``competition`` and ``season``
        and be addressable by the integer labels ``start_i .. end_i - 1``.
    std_cols : list of str
        Column names (and order) of the exported file. Expected to contain
        ``match_id``, ``competition`` and ``season``.
    start_i, end_i : int
        Half-open range of row labels to process.
    out_dir : str, optional
        Directory of the output file. Defaults to ``'/kaggle/working'``.

    Returns
    -------
    None
        The result is written to disk; progress is printed.
    """
    tot = len(df_matches)
    # One schema-aligned frame per match. They are concatenated once after
    # the loop; concatenating inside the loop re-copies every row gathered so
    # far for each match.
    chunks = []
    n_rows = 0  # running row count, shown in the progress line

    for i in range(start_i, end_i):
        match_id = df_matches.loc[i, 'match_id']
        competition = df_matches.loc[i, 'competition']
        season = df_matches.loc[i, 'season']

        current_events = sb.events(match_id=match_id)

        # Force the common schema: drop unknown columns, add missing ones as NaN.
        current_events = current_events.reindex(columns=std_cols, fill_value=np.nan)

        current_events['competition'] = competition
        current_events['season'] = season

        if not current_events.empty:
            chunks.append(current_events)
            n_rows += len(current_events)

        if i % 50 == 0:
            print(f'{i} / {tot} matches done. ({n_rows})')

        # Check only the match just fetched for missing match_id values, so
        # the message names the offending match and the check stays cheap.
        if current_events['match_id'].isna().any():
            print(f'Error at match {i} (match_id={match_id})')

    if chunks:
        events = pd.concat(chunks, ignore_index=True)
    else:
        # Empty range or no events at all: still emit a well-formed file.
        events = pd.DataFrame(columns=std_cols)

    # Final output
    print(f'{end_i} matches done. ({len(events)})')

    print(f'event columns size : {len(events.columns)}')
    print(f'events data size : {len(events)}')

    print(f'Null match_id values : {events["match_id"].isna().sum()}')
    events.to_csv(f'{out_dir}/events_{start_i}_{end_i}.csv',index=False)

In [10]:
# Export the events in chunks of 500 matches, one CSV file per chunk.
# The chunk boundaries are derived from len(df_matches) rather than a
# hard-coded match count, so the last (shorter) chunk is always covered and
# no out-of-range row is requested if the number of matches changes.
chunk_size = 500
n_matches = len(df_matches)
for start in range(0, n_matches, chunk_size):
    end = min(start + chunk_size, n_matches)
    print(f'current section : {start},{end}')
    get_export_events(df_matches,std_cols,start,end)

current section : 0,500
0 / 2450 matches done. (4108)
50 / 2450 matches done. (173093)
100 / 2450 matches done. (360384)
150 / 2450 matches done. (537693)
200 / 2450 matches done. (715847)
250 / 2450 matches done. (903050)
300 / 2450 matches done. (1092409)
350 / 2450 matches done. (1288844)
400 / 2450 matches done. (1479180)
450 / 2450 matches done. (1672181)
500 matches done. (1846984)
event columns size : 122
events data size : 1846984
Null match_id values : 0
current section : 500,1000
500 / 2450 matches done. (3671)
550 / 2450 matches done. (189367)
600 / 2450 matches done. (369635)
650 / 2450 matches done. (539986)
700 / 2450 matches done. (710792)
750 / 2450 matches done. (880957)
800 / 2450 matches done. (1055671)
850 / 2450 matches done. (1228175)
900 / 2450 matches done. (1402724)
950 / 2450 matches done. (1579690)
1000 matches done. (1749006)
event columns size : 122
events data size : 1749006
Null match_id values : 0
current section : 1000,1500
1000 / 2450 matches done. (32

In [11]:
# Chunk files (without the .csv extension) to be appended to events.csv.
files = ['events_500_1000','events_1000_1500','events_1500_2000','events_2000_2450']
def export_to_csv(file, base_dir='/kaggle/working'):
    """Append one chunk CSV to the combined ``events.csv`` file.

    Parameters
    ----------
    file : str
        Name of the chunk file without the ``.csv`` extension.
    base_dir : str, optional
        Directory that holds both the chunk file and ``events.csv``.
        Defaults to ``'/kaggle/working'``.

    Returns
    -------
    None
        Prints the path of the chunk file, then appends its rows (no header
        row, no index column) to ``{base_dir}/events.csv``.
    """
    src_path = f'{base_dir}/{file}.csv'
    print(src_path)
    df = pd.read_csv(src_path,low_memory=False)
    # mode='a' + header=False: add rows below the existing content only.
    df.to_csv(f'{base_dir}/events.csv',index=False,mode='a',header=False)

In [12]:
# Start events.csv from the first chunk (this write includes the header row),
# then append each remaining chunk listed in `files`; export_to_csv appends
# rows without a header.
df = pd.read_csv('/kaggle/working/events_0_500.csv',low_memory=False)
df.to_csv('/kaggle/working/events.csv',index=False)
for file in files:
    export_to_csv(file)

/kaggle/working/events_500_1000.csv
/kaggle/working/events_1000_1500.csv
/kaggle/working/events_1500_2000.csv
/kaggle/working/events_2000_2450.csv


In [None]:
# Download the frame data of every match in df_matches and stack it.
#
# Matches whose frames cannot be loaded are skipped, but they are recorded
# in `failed_frames` instead of being silently ignored.
# NOTE(review): presumably the failures are matches without 360 data - confirm.
frame_chunks = []
failed_frames = []  # (match_id, error) for every match that could not be loaded

for i in range(0, len(df_matches)):
    mid = df_matches.loc[i,'match_id']
    try:
        df = sb.frames(mid)
    except Exception as err:
        # `except Exception` (not a bare except) keeps Ctrl-C / kernel
        # interrupts working during this long loop.
        failed_frames.append((mid, repr(err)))
    else:
        if not df.empty:
            frame_chunks.append(df)
    if i%50==0:
        print(i)

# Concatenate once; doing it inside the loop re-copies all rows each time.
df_frames = pd.concat(frame_chunks) if frame_chunks else pd.DataFrame()
print(f'frames unavailable for {len(failed_frames)} of {len(df_matches)} matches')

In [None]:
# Replace the repeated per-match row labels with a clean 0..n-1 index.
df_frames = df_frames.reset_index(drop=True)

In [None]:
# Write the stacked frame data to frames.csv (index column omitted).
df_frames.to_csv('frames.csv',index=False)