Data source: StatsBomb Open Data — https://github.com/statsbomb/open-data


In [1]:
#Necessary libraries
from mplsoccer import Sbopen
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:

# Sbopen (imported from mplsoccer) is the reader used for every data request below.
parser = Sbopen() # To read JSON files

# Table of the available competition/season pairs; preview the first rows.
competitions = parser.competition()
competitions.head()

Unnamed: 0,competition_id,season_id,country_name,competition_name,competition_gender,competition_youth,competition_international,season_name,match_updated,match_updated_360,match_available_360,match_available
0,9,281,Germany,1. Bundesliga,male,False,False,2023/2024,2024-09-28T20:46:38.893391,2025-07-06T04:26:07.636270,2025-07-06T04:26:07.636270,2024-09-28T20:46:38.893391
1,9,27,Germany,1. Bundesliga,male,False,False,2015/2016,2024-05-19T11:11:14.192381,,,2024-05-19T11:11:14.192381
2,1267,107,Africa,African Cup of Nations,male,False,True,2023,2024-09-28T01:57:35.846538,,,2024-09-28T01:57:35.846538
3,16,4,Europe,Champions League,male,False,False,2018/2019,2025-05-08T15:10:50.835274,2021-06-13T16:17:31.694,,2025-05-08T15:10:50.835274
4,16,1,Europe,Champions League,male,False,False,2017/2018,2024-02-13T02:35:28.134882,2021-06-13T16:17:31.694,,2024-02-13T02:35:28.134882


In [3]:
# One row per match of the chosen competition/season.
df_match = parser.match(competition_id=2, season_id=27) # Season 2015/16 Premier League

# (rows, columns) of the match table.
df_match.shape

(380, 52)

In [10]:
# Match identifiers of the season; each one is passed to parser.event() in the download loop.
match_ids = df_match['match_id']

In [11]:
# Collect one small, shots-only DataFrame per match.
# The list is created here (it was not defined anywhere in the notebook) so the cell
# works on a fresh kernel, and re-running it does not append the same shots twice.
shots_list = []

for match_id in match_ids:
    # A. Fetch data (this is the slowest part due to internet download).
    #    parser.event returns four objects; only the first (the events frame) is used.
    events, _, _, _ = parser.event(match_id)

    # B. Filter IMMEDIATELY (reduces memory usage): keep only the shot events.
    #    .copy() makes match_shots independent of `events`, so the column written
    #    below never targets a slice of the full events frame.
    match_shots = events[events['type_name'] == 'Shot'].copy()

    # C. Add match_id for reference
    match_shots['match_id'] = match_id

    # D. Append the small dataframe to our list
    shots_list.append(match_shots)

In [12]:
# Stack the per-match shot frames into a single frame; ignore_index=True renumbers the rows 0..n-1.
df_season_shots = pd.concat(shots_list, ignore_index=True)

In [None]:
# Display the combined shots frame (the rendering shows only the first and last rows/columns).
df_season_shots

Unnamed: 0,id,index,period,timestamp,minute,second,possession,duration,match_id,type_id,...,pass_no_touch,pass_miscommunication,dribble_no_touch,block_deflection,shot_redirect,block_save_block,shot_deflected,player_off_permanent,shot_follows_dribble,half_start_late_video_start
0,72596ffe-393e-4a75-82c6-5fb82ae36d4d,93,1,00:01:33.287000,1,33,4,0.271905,3754058,16,...,,,,,,,,,,
1,b2bae775-5dd8-45bf-9b84-191c7849f707,224,1,00:04:23.399000,4,23,9,0.236075,3754058,16,...,,,,,,,,,,
2,7c4227e5-0759-4633-93ea-bbbe291bbd72,440,1,00:09:20.642000,9,20,16,0.770062,3754058,16,...,,,,,,,,,,
3,8e3a6f10-64e4-49bf-a157-16b15b552713,579,1,00:13:11.270000,13,11,20,0.884091,3754058,16,...,,,,,,,,,,
4,3a5364b3-0b99-4271-a451-0a3033ad195d,1170,1,00:25:47.089000,25,47,47,0.826620,3754058,16,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19811,43694fcd-dc3f-4351-be75-c50edcf8159e,2604,2,00:27:28.723000,72,28,140,0.578476,3754078,16,...,,,,,,,,,,
19812,dd0324cf-8325-4201-9c46-c687bebbb716,2683,2,00:29:15.503000,74,15,146,0.216054,3754078,16,...,,,,,,,,,,
19813,4a2ced59-d48c-477d-9481-45e50f28781d,2704,2,00:30:06.455000,75,6,147,0.310971,3754078,16,...,,,,,,,,,,
19814,554d91bc-74fa-4a12-a186-5f66a9a59fd9,2864,2,00:35:12.328000,80,12,157,0.823980,3754078,16,...,,,,,,,,,,


In [15]:
# Keep only the variables we consider useful for the model and omit the ones
# we do not think are necessary.
shot_feature_columns = [
    'under_pressure',
    'sub_type_name',
    'body_part_name',
    'shot_first_time',
    'x',
    'y',
    'pass_deflected',
    'shot_one_on_one',
    'play_pattern_name',
    'type_name',
    'goalkeeper_position_name',
    'technique_name',
]

# .copy() yields an independent frame: later cells add feature columns to
# filtered_shots, and writing into a bare column-slice of df_season_shots would
# raise pandas' SettingWithCopyWarning.
filtered_shots = df_season_shots[shot_feature_columns].copy()

In [16]:
# Preview the first rows (up to 15) of the selected feature columns.
filtered_shots.head(15)

Unnamed: 0,under_pressure,sub_type_name,body_part_name,shot_first_time,x,y,pass_deflected,shot_one_on_one,play_pattern_name,type_name,goalkeeper_position_name,technique_name
0,,Open Play,Right Foot,,107.1,26.6,,,Regular Play,Shot,,Normal
1,,Open Play,Right Foot,,113.7,54.0,,,Regular Play,Shot,,Normal
2,1.0,Open Play,Left Foot,True,115.7,50.5,,,Regular Play,Shot,,Overhead Kick
3,,Open Play,Head,,113.9,38.6,,,From Throw In,Shot,,Normal
4,,Open Play,Right Foot,,96.7,47.2,,,From Free Kick,Shot,,Half Volley
5,,Open Play,Right Foot,,111.3,38.7,,True,From Free Kick,Shot,,Normal
6,,Open Play,Right Foot,,112.6,28.7,,,From Throw In,Shot,,Normal
7,1.0,Open Play,Head,,112.1,32.8,,,From Free Kick,Shot,,Normal
8,,Open Play,Right Foot,True,108.4,43.0,,,From Free Kick,Shot,,Normal
9,,Open Play,Left Foot,,114.2,27.2,,True,From Goal Kick,Shot,,Normal


In [None]:
def distance(x,y):
  """Return the Euclidean distance from (x, y) to the point (120, 40).

  (120, 40) is the point the goal posts in this notebook are centred on.
  Works element-wise: x and y may be scalars or array-likes supporting
  arithmetic (e.g. pandas Series), and the result has the matching shape.
  """
  offset_x = 120 - x
  offset_y = 40 - y
  squared_length = offset_x**2 + offset_y**2
  return np.sqrt(squared_length)

# Add the distance feature with .assign, which returns a new frame instead of
# writing a column in place. This avoids SettingWithCopyWarning when
# filtered_shots is a slice of another frame and keeps the cell safe to re-run.
filtered_shots = filtered_shots.assign(
    distance_to_goal=distance(filtered_shots['x'], filtered_shots['y'])
)

filtered_shots.head()

In [None]:
# Shot x/y are on the StatsBomb 120 x 80 pitch grid (yards), not metres, so the
# 7.32 m goal is 8 units wide there: posts at (120, 36) and (120, 44).
goal_width = 8.0 # in pitch-coordinate units (yards); equals 7.32 m
gpostx1,gposty1 = (120,40 - goal_width/2)
gpostx2,gposty2 = (120,40 + goal_width/2)
xshooter = filtered_shots['x']
yshooter = filtered_shots['y']

# Direction from the shooter TO each post. With the vectors pointing towards the
# goal line (dx >= 0) both arctan2 values stay within [-pi/2, pi/2], so their
# difference is the opening angle directly. Pointing them from post to shooter
# (dx <= 0) puts the two angles on opposite sides of the +/-pi branch cut for any
# shot taken between the posts, giving 2*pi - angle instead of the angle.
angle_post1 = np.arctan2(gposty1 - yshooter, gpostx1 - xshooter)
angle_post2 = np.arctan2(gposty2 - yshooter, gpostx2 - xshooter)
shooting_angle_rad = np.abs(angle_post2 - angle_post1)

# Safety net for coordinates beyond the goal line (x > 120), where the difference
# could still wrap: fold anything above pi back into [0, pi].
shooting_angle_rad = shooting_angle_rad.where(
    shooting_angle_rad <= np.pi, 2 * np.pi - shooting_angle_rad
)

# .assign returns a new frame rather than writing into a possible slice in place.
filtered_shots = filtered_shots.assign(angle_to_goal=np.degrees(shooting_angle_rad))
filtered_shots.head()

In [13]:
# Save the filtered shot data to a pickle file so the download/filter steps
# do not have to be repeated each time.
# NOTE(review): this cell's execution count (13) is lower than that of the cell
# creating filtered_shots (15) — confirm the file on disk was written after the
# feature columns were added.


filtered_shots.to_pickle("./filtered_data.pkl")  