In [13]:
# Import necessary packages and suppress warnings

import pandas as pd
import sqlite3
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gzip
import json
import os
import time

from statsbombpy import sb
from mplsoccer import Pitch

warnings.filterwarnings('ignore')

In [14]:
# Timer for the whole cell; the elapsed time is reported after the load below.
start_time = time.time()

def concatenate_json_files(directory_path):
    """Merge the top-level lists of every ``.json`` file in a directory.

    Files are visited in sorted filename order so the order of the returned
    records is reproducible (``os.listdir`` gives no ordering guarantee), and
    they are read as UTF-8 regardless of the platform's default encoding.

    Loading is best-effort: a file that cannot be decoded (invalid JSON, or
    bytes that are not valid UTF-8, e.g. a file cut off in the middle of a
    multi-byte character) is reported with ``print`` and skipped; a file whose
    top-level JSON value is not a list is reported and contributes nothing.

    Parameters
    ----------
    directory_path : str or path-like
        Directory to scan. Only entries whose name ends with ``.json`` are read;
        sub-directories are not traversed.

    Returns
    -------
    list
        All elements of all successfully parsed top-level lists, in sorted
        filename order.

    Raises
    ------
    FileNotFoundError
        If ``directory_path`` does not exist.
    """
    if not os.path.exists(directory_path):
        raise FileNotFoundError("The specified directory does not exist.")

    concatenated_data = []

    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(directory_path, filename)

        try:
            # JSON interchange text is UTF-8; do not depend on the locale default.
            with open(file_path, 'r', encoding='utf-8') as file:
                json_data = json.load(file)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f"Failed to decode JSON in file {filename}: {str(e)}")
            continue  # Skip this file and continue with the next one

        if isinstance(json_data, list):
            concatenated_data.extend(json_data)
        else:
            print(f"JSON data in file {filename} is not a list.")

    return concatenated_data

# Directory containing the 360 JSON files. Given relative to the notebook's
# working directory -- the same root the competitions.json read further down
# uses -- instead of a user-specific absolute path, so the notebook runs on
# other machines.
directory_path = os.path.join("open-data", "data", "three-sixty")

# Concatenate the JSON files
# NOTE(review): in the recorded run three files failed to decode, each at a
# character offset that is an exact multiple of 4096 (5193728, 4882432,
# 2637824). That pattern looks like truncated copies rather than bad data --
# re-fetch those files and re-run to confirm.
concatenated_data = concatenate_json_files(directory_path)

end_time = time.time()

# Calculate the elapsed time in seconds
elapsed_time = end_time - start_time

print(f"The cell took {elapsed_time:.6f} seconds to run.")

Failed to decode JSON in file 3835338.json: Expecting value: line 181321 column 20 (char 5193728)
Failed to decode JSON in file 3835342.json: Expecting ',' delimiter: line 171856 column 109 (char 4882432)
Failed to decode JSON in file 3845506.json: Expecting ',' delimiter: line 92794 column 3 (char 2637824)
The cell took 149.241191 seconds to run.


In [15]:
# Convert the list of records returned by concatenate_json_files to a DataFrame
df_360 = pd.DataFrame(concatenated_data)

# Display the first five rows
df_360.head()

Unnamed: 0,event_uuid,visible_area,freeze_frame
0,75d6cc25-b03b-44e0-9c50-99a7e3c47315,"[29.574167858721, 80.0, 47.7992071074168, 0.0,...","[{'teammate': True, 'actor': False, 'keeper': ..."
1,ec457cc8-050c-4884-abbc-1e85bc3c83dc,"[29.5261908068648, 80.0, 47.3846276547738, 0.0...","[{'teammate': True, 'actor': False, 'keeper': ..."
2,246b93aa-3831-4b07-a51e-b6ba578e60d5,"[27.6350829489137, 80.0, 45.4935197968227, 0.0...","[{'teammate': True, 'actor': False, 'keeper': ..."
3,eda20fee-cab0-4094-aba3-ae286ef64004,"[13.8331181325244, 80.0, 40.2628933325614, 6.1...","[{'teammate': True, 'actor': True, 'keeper': F..."
4,e8a3f021-76da-443b-9a1d-c5857c486493,"[13.8331181325244, 80.0, 40.2628933325614, 6.1...","[{'teammate': True, 'actor': True, 'keeper': F..."


In [16]:
# Read the competitions file into `data` (the parsed JSON value).
# The encoding is given explicitly because JSON text is UTF-8 and open() would
# otherwise fall back to the platform's default encoding.
with open('open-data/data/competitions.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [20]:
# Build a DataFrame from the parsed competitions JSON and print a summary of
# its columns, non-null counts and dtypes
df_comp = pd.DataFrame(data)
df_comp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67 entries, 0 to 66
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   competition_id             67 non-null     int64 
 1   season_id                  67 non-null     int64 
 2   country_name               67 non-null     object
 3   competition_name           67 non-null     object
 4   competition_gender         67 non-null     object
 5   competition_youth          67 non-null     bool  
 6   competition_international  67 non-null     bool  
 7   season_name                67 non-null     object
 8   match_updated              67 non-null     object
 9   match_updated_360          51 non-null     object
 10  match_available_360        5 non-null      object
 11  match_available            67 non-null     object
dtypes: bool(2), int64(2), object(8)
memory usage: 5.5+ KB


In [26]:
# Keep the competitions whose 'match_available_360' is non-null, then exclude
# those whose 'competition_gender' is 'female'.
df_comp_360 = (
    df_comp
    .dropna(subset=['match_available_360'])
    .loc[lambda comps: comps['competition_gender'] != 'female']
)

# Display the filtered competitions
df_comp_360

Unnamed: 0,competition_id,season_id,country_name,competition_name,competition_gender,competition_youth,competition_international,season_name,match_updated,match_updated_360,match_available_360,match_available
26,43,106,International,FIFA World Cup,male,False,True,2022,2023-08-12T16:44:27.619465,2023-08-17T15:55:15.164685,2023-08-17T15:55:15.164685,2023-08-12T16:44:27.619465
35,11,90,Spain,La Liga,male,False,False,2020/2021,2023-07-26T14:11:01.312143,2023-07-26T14:15:15.217027,2023-07-26T14:15:15.217027,2023-07-26T14:11:01.312143
62,55,43,Europe,UEFA Euro,male,False,True,2020,2023-02-24T21:26:47.128979,2023-04-27T22:38:34.970148,2023-04-27T22:38:34.970148,2023-02-24T21:26:47.128979


In [44]:
# Matches for competition_id=43 / season_id=106, i.e. the "FIFA World Cup" 2022
# row of df_comp_360 shown above
df_2022WC = sb.matches(competition_id=43, season_id=106)
df_2022WC.head()

Unnamed: 0,match_id,match_date,kick_off,competition,season,home_team,away_team,home_score,away_score,match_status,...,last_updated_360,match_week,competition_stage,stadium,referee,home_managers,away_managers,data_version,shot_fidelity_version,xy_fidelity_version
0,3857256,2022-12-02,21:00:00.000,International - FIFA World Cup,2022,Serbia,Switzerland,2,3,available,...,2023-04-26T23:49:58.956186,3,Group Stage,Stadium 974,Fernando Andrés Rapallini,Dragan Stojković,Murat Yakin,1.1.0,2,2
1,3869151,2022-12-03,21:00:00.000,International - FIFA World Cup,2022,Argentina,Australia,2,1,available,...,2023-07-30T07:48:51.865595,4,Round of 16,Ahmad bin Ali Stadium,Szymon Marciniak,Lionel Sebastián Scaloni,Graham James Arnold,1.1.0,2,2
2,3857257,2022-11-30,17:00:00.000,International - FIFA World Cup,2022,Australia,Denmark,1,0,available,...,2023-06-20T11:04:37.638969,3,Group Stage,Al Janoub Stadium,Mustapha Ghorbal,Graham James Arnold,Kasper Hjulmand,1.1.0,2,2
3,3857258,2022-11-24,21:00:00.000,International - FIFA World Cup,2022,Brazil,Serbia,2,0,available,...,2023-07-11T14:56:31.096588,1,Group Stage,Lusail Stadium,Alireza Faghani,Telê Santana da Silva,Dragan Stojković,1.1.0,2,2
4,3857288,2022-11-26,12:00:00.000,International - FIFA World Cup,2022,Tunisia,Australia,0,1,available,...,2023-04-27T00:30:07.835815,2,Group Stage,Al Janoub Stadium,Daniel Siebert,Jalel Kadri,Graham James Arnold,1.1.0,2,2


In [89]:
# Retrieve the event data for every match in df_2022WC (2022 World Cup) and
# report how long the cell took.
start_time = time.time()


# Match ids to fetch, taken from the 'match_id' column of df_2022WC
match_ids = df_2022WC['match_id'].tolist()

# Collects one events DataFrame per match
all_events = []

# One sb.events call per match id
for match_id in match_ids:
    events_df = sb.events(match_id=match_id)
    all_events.append(events_df)

# Stack the per-match frames into a single frame with a fresh 0..n-1 index
combined_events_df = pd.concat(all_events, ignore_index=True)


end_time = time.time()

# Calculate the elapsed time in seconds
elapsed_time = end_time - start_time

print(f"The cell took {elapsed_time:.6f} seconds to run.")

The cell took 61.160284 seconds to run.


In [90]:
# List the column labels of the combined events frame
combined_events_df.columns

Index(['bad_behaviour_card', 'ball_receipt_outcome',
       'ball_recovery_recovery_failure', 'block_deflection', 'block_offensive',
       'carry_end_location', 'clearance_aerial_won', 'clearance_body_part',
       'clearance_head', 'clearance_left_foot',
       ...
       'shot_follows_dribble', 'block_save_block',
       'goalkeeper_shot_saved_to_post', 'shot_saved_to_post',
       'half_start_late_video_start', 'goalkeeper_shot_saved_off_target',
       'shot_saved_off_target', 'goalkeeper_success_in_play', 'shot_redirect',
       'goalkeeper_lost_in_play'],
      dtype='object', length=110)

In [92]:
# Count how many events there are of each value in the 'type' column
combined_events_df['type'].value_counts()

Pass                 68515
Ball Receipt*        63715
Carry                53764
Pressure             16553
Ball Recovery         5821
Duel                  4389
Clearance             2684
Block                 2386
Dribble               1793
Goal Keeper           1790
Foul Committed        1775
Miscontrol            1755
Foul Won              1693
Shot                  1494
Dispossessed          1431
Interception          1371
Dribbled Past         1036
Substitution           587
Injury Stoppage        403
Half Start             286
Half End               286
Tactical Shift         243
50/50                  236
Referee Ball-Drop      162
Starting XI            128
Shield                 104
Player Off              74
Player On               74
Bad Behaviour           44
Error                   28
Offside                 26
Own Goal Against         3
Own Goal For             3
Name: type, dtype: int64

In [94]:
# Keep only the rows whose 'type' is 'Shot'.
# .copy() makes the result an independent frame, so later in-place edits of it
# are not made on a frame pandas tracks as derived from the full events table
# (pandas reports that case with SettingWithCopyWarning, and this notebook
# silences all warnings).
# Note: 'type' is itself listed in columns_to_drop, so once the drop cell has run
# this cell can no longer be re-run (KeyError on 'type') until the events are
# fetched again.
combined_events_df = combined_events_df[combined_events_df['type'] == 'Shot'].copy()
combined_events_df.head()

Unnamed: 0,bad_behaviour_card,ball_receipt_outcome,ball_recovery_recovery_failure,block_deflection,block_offensive,carry_end_location,clearance_aerial_won,clearance_body_part,clearance_head,clearance_left_foot,...,shot_follows_dribble,block_save_block,goalkeeper_shot_saved_to_post,shot_saved_to_post,half_start_late_video_start,goalkeeper_shot_saved_off_target,shot_saved_off_target,goalkeeper_success_in_play,shot_redirect,goalkeeper_lost_in_play
2674,,,,,,,,,,,...,,,,,,,,,,
2675,,,,,,,,,,,...,,,,,,,,,,
2676,,,,,,,,,,,...,,,,,,,,,,
2677,,,,,,,,,,,...,,,,,,,,,,
2678,,,,,,,,,,,...,,,,,,,,,,


In [95]:
# Columns to remove from each competition's shot table.
# Not every name is present in every events frame, which is why the drop cells
# first narrow this list to the columns the frame actually has.
columns_to_drop = ['50_50', 'bad_behaviour_card', 'ball_receipt_outcome',
       'ball_recovery_offensive', 'ball_recovery_recovery_failure',
       'block_deflection', 'block_offensive', 'carry_end_location',
       'clearance_aerial_won', 'clearance_body_part', 'clearance_head',
       'clearance_left_foot', 'clearance_other', 'clearance_right_foot',
       'counterpress', 'dribble_nutmeg', 'dribble_outcome', 'dribble_overrun',
       'duel_outcome', 'duel_type', 'foul_committed_advantage',
       'foul_committed_card', 'foul_committed_offensive',
       'foul_committed_penalty', 'foul_committed_type', 'foul_won_advantage',
       'foul_won_defensive', 'foul_won_penalty', 'goalkeeper_body_part',
       'goalkeeper_end_location', 'goalkeeper_outcome', 'goalkeeper_position',
       'goalkeeper_technique', 'goalkeeper_type',
       'interception_outcome', 'off_camera', 'pass_aerial_won', 'pass_angle', 'pass_assisted_shot_id',
       'pass_body_part', 'pass_cross', 'pass_deflected', 'pass_end_location',
       'pass_goal_assist', 'pass_height', 'pass_inswinging', 'pass_length',
       'pass_outcome', 'pass_outswinging', 'pass_recipient',
       'pass_shot_assist', 'pass_switch', 'pass_technique',
       'pass_through_ball', 'pass_type', 'possession',
       'possession_team', 'possession_team_id', 'related_events', 'second',
       'substitution_outcome', 'substitution_replacement', 'tactics','under_pressure','period', 'visible_area', 'timestamp',
        'shot_key_pass_id','shot_aerial_won','position', 'out',  'team', 'player', 'injury_stoppage_in_chain','pass_miscommunication',
        'miscontrol_aerial_won', 'pass_no_touch', 'pass_straight',
       'dribble_no_touch', 'goalkeeper_punched_out','block_save_block', 'goalkeeper_shot_saved_to_post',
       'shot_saved_to_post', 'half_start_late_video_start',
       'goalkeeper_shot_saved_off_target', 'duration', 'goalkeeper_lost_in_play', 'goalkeeper_success_in_play','shot_redirect', 'type' ]

In [96]:
# Drop the unwanted columns from the shots frame.
# The list is narrowed to columns that exist in this frame, so drop() needs no
# errors='ignore'. The result is rebound instead of using inplace=True, which
# avoids mutating a frame that was produced by filtering another one.
columns_to_drop_existing = [col for col in columns_to_drop if col in combined_events_df.columns]
combined_events_df = combined_events_df.drop(columns=columns_to_drop_existing)

In [97]:
# Sanity check: list the columns that remain after the drop
combined_events_df.columns

Index(['id', 'index', 'location', 'match_id', 'minute', 'pass_cut_back',
       'play_pattern', 'player_id', 'shot_body_part', 'shot_deflected',
       'shot_end_location', 'shot_first_time', 'shot_freeze_frame',
       'shot_one_on_one', 'shot_open_goal', 'shot_outcome',
       'shot_statsbomb_xg', 'shot_technique', 'shot_type',
       'shot_follows_dribble', 'shot_saved_off_target'],
      dtype='object')

In [100]:
# Label the current shots frame as the World Cup data.
# This binds a second name to the same DataFrame object; no copy is made.
WCevents = combined_events_df

In [101]:
# Show df_comp_360 again; its competition_id / season_id values are the ones
# passed to the sb.matches calls below
df_comp_360

Unnamed: 0,competition_id,season_id,country_name,competition_name,competition_gender,competition_youth,competition_international,season_name,match_updated,match_updated_360,match_available_360,match_available
26,43,106,International,FIFA World Cup,male,False,True,2022,2023-08-12T16:44:27.619465,2023-08-17T15:55:15.164685,2023-08-17T15:55:15.164685,2023-08-12T16:44:27.619465
35,11,90,Spain,La Liga,male,False,False,2020/2021,2023-07-26T14:11:01.312143,2023-07-26T14:15:15.217027,2023-07-26T14:15:15.217027,2023-07-26T14:11:01.312143
62,55,43,Europe,UEFA Euro,male,False,True,2020,2023-02-24T21:26:47.128979,2023-04-27T22:38:34.970148,2023-04-27T22:38:34.970148,2023-02-24T21:26:47.128979


In [102]:
# Matches for La Liga 2020/2021: competition_id=11 / season_id=90, as listed in
# the "La Liga" row of df_comp_360
df_LL = sb.matches(competition_id=11, season_id=90)
df_LL.head()

Unnamed: 0,match_id,match_date,kick_off,competition,season,home_team,away_team,home_score,away_score,match_status,...,last_updated_360,match_week,competition_stage,stadium,referee,home_managers,away_managers,data_version,shot_fidelity_version,xy_fidelity_version
0,3773386,2020-10-31,21:00:00.000,Spain - La Liga,2020/2021,Deportivo Alavés,Barcelona,1,1,available,...,2023-07-25T04:25:41.348202,8,Regular Season,Estadio de Mendizorroza,,Pablo Javier Machín Díez,Ronald Koeman,1.1.0,2,2
1,3773565,2021-01-09,18:30:00.000,Spain - La Liga,2020/2021,Granada,Barcelona,0,4,available,...,2023-07-25T04:30:16.058384,18,Regular Season,Estadio Nuevo Los Cármenes,Ricardo De Burgos Bengoetxea,Diego Martínez Penas,Ronald Koeman,1.1.0,2,2
2,3773457,2021-05-16,18:30:00.000,Spain - La Liga,2020/2021,Barcelona,Celta Vigo,1,2,available,...,2023-04-27T23:03:53.506485,37,Regular Season,Spotify Camp Nou,,Ronald Koeman,Eduardo Germán Coudet,1.1.0,2,2
3,3773631,2021-02-07,21:00:00.000,Spain - La Liga,2020/2021,Real Betis,Barcelona,2,3,available,...,2023-07-25T03:56:34.733180,22,Regular Season,Estadio Benito Villamarín,,Manuel Luis Pellegrini Ripamonti,Ronald Koeman,1.1.0,2,2
4,3773665,2021-03-06,21:00:00.000,Spain - La Liga,2020/2021,Osasuna,Barcelona,0,2,available,...,2023-04-28T02:57:03.412841,26,Regular Season,Estadio El Sadar,Guillermo Cuadra Fernández,Jagoba Arrasate Elustondo,Ronald Koeman,1.1.0,2,2


In [103]:
# Retrieve the event data for every match in df_LL (La Liga 2020/2021) and
# report how long the cell took.
# Note: this rebinds match_ids, all_events and combined_events_df.
start_time = time.time()


# Match ids to fetch, taken from the 'match_id' column of df_LL
match_ids = df_LL['match_id'].tolist()

# Collects one events DataFrame per match
all_events = []

# One sb.events call per match id
for match_id in match_ids:
    events_df = sb.events(match_id=match_id)
    all_events.append(events_df)

# Stack the per-match frames into a single frame with a fresh 0..n-1 index
combined_events_df = pd.concat(all_events, ignore_index=True)


end_time = time.time()

# Calculate the elapsed time in seconds
elapsed_time = end_time - start_time

print(f"The cell took {elapsed_time:.6f} seconds to run.")

The cell took 48.553056 seconds to run.


In [106]:
# Keep only the rows whose 'type' is 'Shot'.
# .copy() makes the result an independent frame, so later in-place edits of it
# are not made on a frame pandas tracks as derived from the full events table
# (pandas reports that case with SettingWithCopyWarning, and this notebook
# silences all warnings).
# Note: 'type' is itself listed in columns_to_drop, so once the drop cell has run
# this cell can no longer be re-run (KeyError on 'type') until the events are
# fetched again.
combined_events_df = combined_events_df[combined_events_df['type'] == 'Shot'].copy()
combined_events_df

Unnamed: 0,50_50,bad_behaviour_card,ball_receipt_outcome,ball_recovery_offensive,ball_recovery_recovery_failure,carry_end_location,clearance_aerial_won,clearance_body_part,clearance_head,clearance_left_foot,...,foul_won_penalty,goalkeeper_punched_out,goalkeeper_shot_saved_off_target,goalkeeper_shot_saved_to_post,shot_saved_off_target,shot_saved_to_post,block_save_block,dribble_no_touch,shot_redirect,shot_follows_dribble
3805,,,,,,,,,,,...,,,,,,,,,,
3806,,,,,,,,,,,...,,,,,,,,,,
3807,,,,,,,,,,,...,,,,,,,,,,
3808,,,,,,,,,,,...,,,,,,,,,,
3809,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138950,,,,,,,,,,,...,,,,,,,,,,
138951,,,,,,,,,,,...,,,,,,,,,,
138952,,,,,,,,,,,...,,,,,,,,,,
138953,,,,,,,,,,,...,,,,,,,,,,


In [107]:
# Drop the unwanted columns from the shots frame.
# The list is narrowed to columns that exist in this frame, so drop() needs no
# errors='ignore'. The result is rebound instead of using inplace=True, which
# avoids mutating a frame that was produced by filtering another one.
columns_to_drop_existing = [col for col in columns_to_drop if col in combined_events_df.columns]
combined_events_df = combined_events_df.drop(columns=columns_to_drop_existing)

In [108]:
# Label the current shots frame as the La Liga data.
# This binds a second name to the same DataFrame object; no copy is made.
LaLiga_events = combined_events_df

In [111]:
# Preview the first five rows of the La Liga shots
LaLiga_events.head()

Unnamed: 0,id,index,location,match_id,minute,pass_cut_back,play_pattern,player_id,shot_body_part,shot_end_location,...,shot_freeze_frame,shot_one_on_one,shot_open_goal,shot_outcome,shot_statsbomb_xg,shot_technique,shot_type,shot_deflected,shot_saved_off_target,shot_follows_dribble
3805,c5341577-e1ca-4742-98fb-dc745cbbe103,575,"[108.6, 28.0]",3773386,12,,From Throw In,30756.0,Right Foot,"[120.0, 47.8, 0.0]",...,"[{'location': [91.6, 30.4], 'player': {'id': 5...",True,,Off T,0.200969,Normal,Open Play,,,
3806,1aedaf9e-bc12-4d0a-953d-bd0f7db3688a,681,"[103.6, 51.0]",3773386,16,,Regular Play,26387.0,Right Foot,"[115.8, 42.1, 0.0]",...,"[{'location': [97.2, 35.3], 'player': {'id': 2...",,,Saved,0.096384,Normal,Open Play,,,
3807,96b28bfc-d174-4b38-86cf-5a43cda4a14f,901,"[104.3, 33.9]",3773386,19,,From Throw In,5487.0,Left Foot,"[120.0, 50.6, 0.0]",...,"[{'location': [104.3, 36.7], 'player': {'id': ...",,,Off T,0.098879,Normal,Open Play,,,
3808,b9ca5464-1f5a-401b-a31d-8101bd61072a,929,"[97.9, 44.3]",3773386,22,,From Free Kick,5503.0,Left Foot,"[119.8, 37.6]",...,"[{'location': [114.1, 43.8], 'player': {'id': ...",,,Blocked,0.078938,Normal,Free Kick,,,
3809,75bdc651-c041-4021-b201-cb9eb8b97837,1282,"[118.3, 42.1]",3773386,30,,Regular Play,24049.0,Left Foot,"[120.0, 40.7, 0.0]",...,"[{'location': [108.6, 55.2], 'player': {'id': ...",True,True,Goal,0.976192,Normal,Open Play,,,


In [112]:
# Matches for Euro 2020: competition_id=55 / season_id=43, as listed in the
# "UEFA Euro" row of df_comp_360
df_Euro = sb.matches(competition_id=55, season_id=43)
df_Euro.head()

Unnamed: 0,match_id,match_date,kick_off,competition,season,home_team,away_team,home_score,away_score,match_status,...,last_updated_360,match_week,competition_stage,stadium,referee,home_managers,away_managers,data_version,shot_fidelity_version,xy_fidelity_version
0,3795108,2021-07-02,18:00:00.000,Europe - UEFA Euro,2020,Switzerland,Spain,1,1,available,...,2023-04-27T21:55:08.695971,5,Quarter-finals,Saint-Petersburg Stadium,Michael Oliver,Vladimir Petković,Luis Enrique Martínez García,1.1.0,2,2
1,3788769,2021-06-21,21:00:00.000,Europe - UEFA Euro,2020,Russia,Denmark,1,4,available,...,2023-04-27T22:20:21.686564,3,Group Stage,Parken,Clément Turpin,Stanislav Cherchesov,Kasper Hjulmand,1.1.0,2,2
2,3788766,2021-06-20,18:00:00.000,Europe - UEFA Euro,2020,Italy,Wales,1,0,available,...,2023-04-27T22:22:42.769375,3,Group Stage,Estadio Olímpico,Ovidiu Alin Hațegan,Roberto Mancini,Robert Page,1.1.0,2,2
3,3795220,2021-07-06,21:00:00.000,Europe - UEFA Euro,2020,Italy,Spain,1,1,available,...,2022-08-04T12:00,6,Semi-finals,Wembley Stadium,Felix Brych,Roberto Mancini,Luis Enrique Martínez García,1.1.0,2,2
4,3788761,2021-06-18,15:00:00.000,Europe - UEFA Euro,2020,Sweden,Slovakia,1,0,available,...,2023-04-27T22:28:52.986485,2,Group Stage,Saint-Petersburg Stadium,Daniel Siebert,Jan Olof Andersson,Štefan Tarkovič,1.1.0,2,2


In [113]:
# Retrieve the event data for every match in df_Euro (UEFA Euro 2020) and
# report how long the cell took.
# Note: this rebinds match_ids, all_events and combined_events_df.
start_time = time.time()


# Match ids to fetch, taken from the 'match_id' column of df_Euro
match_ids = df_Euro['match_id'].tolist()

# Collects one events DataFrame per match
all_events = []

# One sb.events call per match id
for match_id in match_ids:
    events_df = sb.events(match_id=match_id)
    all_events.append(events_df)

# Stack the per-match frames into a single frame with a fresh 0..n-1 index
combined_events_df = pd.concat(all_events, ignore_index=True)


end_time = time.time()

# Calculate the elapsed time in seconds
elapsed_time = end_time - start_time

print(f"The cell took {elapsed_time:.6f} seconds to run.")

The cell took 69.855205 seconds to run.


In [114]:
# Keep only the rows whose 'type' is 'Shot'.
# .copy() makes the result an independent frame, so later in-place edits of it
# are not made on a frame pandas tracks as derived from the full events table
# (pandas reports that case with SettingWithCopyWarning, and this notebook
# silences all warnings).
# Note: 'type' is itself listed in columns_to_drop, so once the drop cell has run
# this cell can no longer be re-run (KeyError on 'type') until the events are
# fetched again.
combined_events_df = combined_events_df[combined_events_df['type'] == 'Shot'].copy()
combined_events_df

Unnamed: 0,50_50,ball_receipt_outcome,ball_recovery_offensive,ball_recovery_recovery_failure,block_deflection,block_offensive,carry_end_location,clearance_aerial_won,clearance_body_part,clearance_head,...,goalkeeper_shot_saved_off_target,goalkeeper_shot_saved_to_post,shot_saved_off_target,shot_saved_to_post,goalkeeper_lost_in_play,goalkeeper_success_in_play,dribble_no_touch,goalkeeper_penalty_saved_to_post,shot_follows_dribble,player_off_permanent
4670,,,,,,,,,,,...,,,,,,,,,,
4671,,,,,,,,,,,...,,,,,,,,,,
4672,,,,,,,,,,,...,,,,,,,,,,
4673,,,,,,,,,,,...,,,,,,,,,,
4674,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192609,,,,,,,,,,,...,,,,,,,,,,
192610,,,,,,,,,,,...,,,,,,,,,,
192611,,,,,,,,,,,...,,,,,,,,,,
192612,,,,,,,,,,,...,,,,,,,,,,


In [115]:
# Drop the unwanted columns from the shots frame.
# The list is narrowed to columns that exist in this frame, so drop() needs no
# errors='ignore'. The result is rebound instead of using inplace=True, which
# avoids mutating a frame that was produced by filtering another one.
columns_to_drop_existing = [col for col in columns_to_drop if col in combined_events_df.columns]
combined_events_df = combined_events_df.drop(columns=columns_to_drop_existing)

In [116]:
# Label the current shots frame as the Euro data.
# This binds a second name to the same DataFrame object; no copy is made.
Euro_events = combined_events_df

In [120]:
# Stack the World Cup, La Liga and Euro shot tables into one frame with a fresh
# 0..n-1 index, then print a summary of its columns
shots_df = pd.concat([WCevents, LaLiga_events, Euro_events], ignore_index=True)
shots_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3622 entries, 0 to 3621
Data columns (total 23 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                3622 non-null   object 
 1   index                             3622 non-null   int64  
 2   location                          3622 non-null   object 
 3   match_id                          3622 non-null   int64  
 4   minute                            3622 non-null   int64  
 5   pass_cut_back                     0 non-null      object 
 6   play_pattern                      3622 non-null   object 
 7   player_id                         3622 non-null   float64
 8   shot_body_part                    3622 non-null   object 
 9   shot_deflected                    51 non-null     object 
 10  shot_end_location                 3622 non-null   object 
 11  shot_first_time                   1126 non-null   object 
 12  shot_f

In [121]:
# Further columns to remove from the combined shots table
# ('pass_cut_back', for one, has no non-null values in the summary above).
columns_to_drop2 = ['player_off_permanent', 'goalkeeper_penalty_saved_to_post', 'shot_saved_off_target',
                    'shot_follows_dribble', 'pass_cut_back']

# Drop the unwanted columns.
# The list is narrowed to columns that exist in shots_df, so drop() needs no
# errors='ignore'; the result is rebound instead of mutating with inplace=True.
columns_to_drop_existing2 = [col for col in columns_to_drop2 if col in shots_df.columns]
shots_df = shots_df.drop(columns=columns_to_drop_existing2)

In [122]:
# Print the column summary of shots_df again to check the result of the drop
shots_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3622 entries, 0 to 3621
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 3622 non-null   object 
 1   index              3622 non-null   int64  
 2   location           3622 non-null   object 
 3   match_id           3622 non-null   int64  
 4   minute             3622 non-null   int64  
 5   play_pattern       3622 non-null   object 
 6   player_id          3622 non-null   float64
 7   shot_body_part     3622 non-null   object 
 8   shot_deflected     51 non-null     object 
 9   shot_end_location  3622 non-null   object 
 10  shot_first_time    1126 non-null   object 
 11  shot_freeze_frame  3512 non-null   object 
 12  shot_one_on_one    200 non-null    object 
 13  shot_open_goal     43 non-null     object 
 14  shot_outcome       3622 non-null   object 
 15  shot_statsbomb_xg  3622 non-null   float64
 16  shot_technique     3622 