https://github.com/statsbomb/open-data dataset


In [1]:
#Necessary libraries
from mplsoccer import Sbopen
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:

parser = Sbopen() #To read JSON files

competitions = parser.competition()
competitions.head()

Unnamed: 0,competition_id,season_id,country_name,competition_name,competition_gender,competition_youth,competition_international,season_name,match_updated,match_updated_360,match_available_360,match_available
0,9,281,Germany,1. Bundesliga,male,False,False,2023/2024,2024-09-28T20:46:38.893391,2025-07-06T04:26:07.636270,2025-07-06T04:26:07.636270,2024-09-28T20:46:38.893391
1,9,27,Germany,1. Bundesliga,male,False,False,2015/2016,2024-05-19T11:11:14.192381,,,2024-05-19T11:11:14.192381
2,1267,107,Africa,African Cup of Nations,male,False,True,2023,2024-09-28T01:57:35.846538,,,2024-09-28T01:57:35.846538
3,16,4,Europe,Champions League,male,False,False,2018/2019,2025-05-08T15:10:50.835274,2021-06-13T16:17:31.694,,2025-05-08T15:10:50.835274
4,16,1,Europe,Champions League,male,False,False,2017/2018,2024-02-13T02:35:28.134882,2021-06-13T16:17:31.694,,2024-02-13T02:35:28.134882


In [3]:
df_match = parser.match(competition_id=2, season_id=27) #Season 15/16 premier league

df_match.shape

(380, 52)

In [10]:
match_ids = df_match['match_id']

In [11]:
for match_id in match_ids:
    # A. Fetch data (this is the slowest part due to internet download)
    events, _, _, _ = parser.event(match_id)
    
    # B. Filter IMMEDIATELY (Reduces memory usage)
    # We create a copy to ensure we aren't keeping the huge 'events' dataframe in memory
    match_shots = events[events['type_name'] == 'Shot'].copy()
    
    # C. Add match_id for reference
    match_shots['match_id'] = match_id
    
    # D. Append the small dataframe to our list
    shots_list.append(match_shots)

In [47]:
df_season_shots = pd.concat(shots_list, ignore_index=True)

In [48]:
df_season_shots["outcome_name"]

0        Blocked
1        Blocked
2          Off T
3          Off T
4        Wayward
          ...   
19811      Saved
19812      Saved
19813    Blocked
19814      Saved
19815      Saved
Name: outcome_name, Length: 19816, dtype: object

In [56]:
x = df_season_shots[df_season_shots["outcome_name"] == "Goal"]
x["outcome_name"]

39       Goal
49       Goal
57       Goal
64       Goal
65       Goal
         ... 
19758    Goal
19785    Goal
19795    Goal
19798    Goal
19806    Goal
Name: outcome_name, Length: 1976, dtype: object

In [57]:
filtered_shots = df_season_shots[['outcome_name','under_pressure','sub_type_name','body_part_name','shot_first_time','x','y','pass_deflected','shot_one_on_one','play_pattern_name','type_name','goalkeeper_position_name','technique_name']] #We select the usefull variables to manage our model and we omit the ones we don't think that are necessary

In [16]:
filtered_shots.head(15)

Unnamed: 0,under_pressure,sub_type_name,body_part_name,shot_first_time,x,y,pass_deflected,shot_one_on_one,play_pattern_name,type_name,goalkeeper_position_name,technique_name
0,,Open Play,Right Foot,,107.1,26.6,,,Regular Play,Shot,,Normal
1,,Open Play,Right Foot,,113.7,54.0,,,Regular Play,Shot,,Normal
2,1.0,Open Play,Left Foot,True,115.7,50.5,,,Regular Play,Shot,,Overhead Kick
3,,Open Play,Head,,113.9,38.6,,,From Throw In,Shot,,Normal
4,,Open Play,Right Foot,,96.7,47.2,,,From Free Kick,Shot,,Half Volley
5,,Open Play,Right Foot,,111.3,38.7,,True,From Free Kick,Shot,,Normal
6,,Open Play,Right Foot,,112.6,28.7,,,From Throw In,Shot,,Normal
7,1.0,Open Play,Head,,112.1,32.8,,,From Free Kick,Shot,,Normal
8,,Open Play,Right Foot,True,108.4,43.0,,,From Free Kick,Shot,,Normal
9,,Open Play,Left Foot,,114.2,27.2,,True,From Goal Kick,Shot,,Normal


In [58]:
# We save the filtered data so as to avoid the process each time

filtered_shots.to_pickle("./filtered_data.pkl")  