In [2]:
from statsbombpy import sb
import pandas as pd
import warnings 
warnings.filterwarnings("ignore")

# Fetch the table of competitions/seasons that statsbombpy exposes.
comps = sb.competitions()

# Competitions of interest, matched exactly against `competition_name`.
# NOTE(review): `isin` is an exact string match -- confirm these labels against
# comps["competition_name"].unique(), since a label that differs from the
# provider's spelling is silently dropped.
priority_names = [
    "Premier League",
    "La Liga",
    "Bundesliga",
    "Serie A",
    "Ligue 1",
    "World Cup",
    "European Championship",
]

# Keep only men's seasons of the listed competitions; `.copy()` detaches the
# result from `comps` so later edits do not touch the original frame.
is_male = comps["competition_gender"] == "male"
is_priority = comps["competition_name"].isin(priority_names)
priority_comps = comps[is_male & is_priority].copy()

summary_columns = ["competition_id", "competition_name", "season_id", "season_name"]
print(priority_comps[summary_columns])

    competition_id competition_name  season_id season_name
38              11          La Liga         90   2020/2021
39              11          La Liga         42   2019/2020
40              11          La Liga          4   2018/2019
41              11          La Liga          1   2017/2018
42              11          La Liga          2   2016/2017
43              11          La Liga         27   2015/2016
44              11          La Liga         26   2014/2015
45              11          La Liga         25   2013/2014
46              11          La Liga         24   2012/2013
47              11          La Liga         23   2011/2012
48              11          La Liga         22   2010/2011
49              11          La Liga         21   2009/2010
50              11          La Liga         41   2008/2009
51              11          La Liga         40   2007/2008
52              11          La Liga         39   2006/2007
53              11          La Liga         38   2005/20

In [11]:
# Verify that every match of every selected season has event data.
#
# `missing` collects (competition name, season name, match id) for each match
# whose events are empty or could not be loaded; `all_good` is derived from it
# once the scan finishes.
missing = []

for _, row in priority_comps.iterrows():
    comp_id = row["competition_id"]
    season_id = row["season_id"]
    comp_name = row["competition_name"]
    season_name = row["season_name"]

    # Announce the season before fetching it, so a slow or failing request can
    # be attributed to the right competition/season.
    print(f"Checking for Competition: {comp_id}, Season: {season_id}")
    print(comp_name, season_name)

    matches = sb.matches(competition_id=comp_id, season_id=season_id)

    for match_id in matches["match_id"]:
        try:
            events = sb.events(match_id=match_id)
        except Exception as exc:
            # Best effort: keep scanning, but surface why this match failed
            # instead of silently treating it like an empty match.
            print(f"  ⚠️ Match {match_id}: events could not be loaded ({exc!r})")
            missing.append((comp_name, season_name, match_id))
            continue

        if events.empty:
            print(f"  ⚠️ Match {match_id}: event table is empty")
            missing.append((comp_name, season_name, match_id))

all_good = not missing

if all_good:
    print("✅ All matches have event data")
else:
    print("⚠️ Some matches missing events:")
    print(pd.DataFrame(missing, columns=["Competition","Season","Match ID"]))

Checking for Competition: 11, Season: 90
La Liga 2020/2021
Checking for Competition: 11, Season: 42
La Liga 2019/2020
Checking for Competition: 11, Season: 4
La Liga 2018/2019
Checking for Competition: 11, Season: 1
La Liga 2017/2018
Checking for Competition: 11, Season: 2
La Liga 2016/2017
Checking for Competition: 11, Season: 27
La Liga 2015/2016
Checking for Competition: 11, Season: 26
La Liga 2014/2015
Checking for Competition: 11, Season: 25
La Liga 2013/2014
Checking for Competition: 11, Season: 24
La Liga 2012/2013
Checking for Competition: 11, Season: 23
La Liga 2011/2012
Checking for Competition: 11, Season: 22
La Liga 2010/2011
Checking for Competition: 11, Season: 21
La Liga 2009/2010
Checking for Competition: 11, Season: 41
La Liga 2008/2009
Checking for Competition: 11, Season: 40
La Liga 2007/2008
Checking for Competition: 11, Season: 39
La Liga 2006/2007
Checking for Competition: 11, Season: 38
La Liga 2005/2006
Checking for Competition: 11, Season: 37
La Liga 2004/2005


In [2]:
import pandas as pd

# Load the event table of a single match and wrap it in a DataFrame.
events = sb.events(match_id=3773386)
df = pd.DataFrame(events)

# For every period present in the match, show the first five pass events
# with their start and end coordinates.
pass_columns = ['team', 'location', 'pass_end_location']
is_pass = df['type'] == 'Pass'

for period in df['period'].unique():
    in_period = df['period'] == period
    sample = df[is_pass & in_period].head(5)
    print(f"\n--- Period {period} ---")
    print(sample[pass_columns])


--- Period 1 ---
         team      location pass_end_location
6   Barcelona  [61.0, 40.1]      [50.0, 45.4]
7   Barcelona  [49.4, 44.9]      [47.1, 29.6]
8   Barcelona  [48.1, 29.6]      [35.5, 50.6]
9   Barcelona  [44.3, 50.2]      [41.0, 29.7]
10  Barcelona  [49.0, 28.7]      [57.8, 33.5]

--- Period 2 ---
                 team      location pass_end_location
529  Deportivo Alavés  [61.0, 40.1]      [35.4, 40.8]
530  Deportivo Alavés  [36.9, 40.4]       [78.2, 5.7]
531  Deportivo Alavés   [77.3, 5.7]      [90.6, 38.3]
532         Barcelona  [29.3, 42.6]      [27.7, 52.6]
533         Barcelona  [27.7, 53.0]       [5.6, 39.8]


In [5]:
# Dump `df` to events_sample.csv (path relative to the current working
# directory) using pandas' default CSV settings.
df.to_csv("events_sample.csv")

In [3]:
# test_statsbomb.py
from statsbombpy import sb

# Smoke test: load one season's matches, then the events of its first match,
# and print the shape of a pass event. Any exception is reported, not raised.
try:
    # Test loading matches.
    # Competition 11 / season 90 is La Liga 2020/2021 (not the Premier League).
    matches = sb.matches(competition_id=11, season_id=90)
    print(f"✅ Loaded {len(matches)} matches")

    # Test loading events for first match
    if len(matches) > 0:
        first_match_id = matches.iloc[0]['match_id']
        print(f"Testing events for match {first_match_id}")

        events = sb.events(match_id=first_match_id)
        print(f"✅ Loaded {len(events)} events")
        print(f"Event columns: {list(events.columns)}")

        # Check pass events
        pass_events = events[events['type'] == 'Pass']
        print(f"✅ Found {len(pass_events)} pass events")

        # Check first pass event structure
        if len(pass_events) > 0:
            first_pass = pass_events.iloc[0]
            print(f"First pass event keys: {list(first_pass.index)}")
            print(f"Location: {first_pass.get('location')}")
            print(f"Pass end location: {first_pass.get('pass_end_location')}")

except Exception as e:
    # Catch-all on purpose: this cell is a diagnostic and should never abort
    # the notebook run.
    print(f"❌ StatsBomb API Error: {e}")
    print("Try using free data or check your credentials")


✅ Loaded 35 matches
Testing events for match 3773386
✅ Loaded 3891 events
Event columns: ['50_50', 'bad_behaviour_card', 'ball_receipt_outcome', 'ball_recovery_offensive', 'ball_recovery_recovery_failure', 'carry_end_location', 'clearance_aerial_won', 'clearance_body_part', 'clearance_head', 'clearance_left_foot', 'clearance_right_foot', 'counterpress', 'dribble_nutmeg', 'dribble_outcome', 'dribble_overrun', 'duel_outcome', 'duel_type', 'duration', 'foul_committed_advantage', 'foul_committed_card', 'foul_committed_offensive', 'foul_committed_type', 'foul_won_advantage', 'foul_won_defensive', 'goalkeeper_body_part', 'goalkeeper_end_location', 'goalkeeper_outcome', 'goalkeeper_position', 'goalkeeper_technique', 'goalkeeper_type', 'id', 'index', 'interception_outcome', 'location', 'match_id', 'minute', 'miscontrol_aerial_won', 'off_camera', 'out', 'pass_aerial_won', 'pass_angle', 'pass_assisted_shot_id', 'pass_body_part', 'pass_cross', 'pass_cut_back', 'pass_deflected', 'pass_end_location

In [4]:
import json
import zipfile
import os
from collections import Counter

print(os.getcwd())

# ================================
# LOAD DATA
# ================================

print("Loading data...")

# `data` maps a dataset name to its parsed JSON content.
data = {}

# Reference tables, read from the current working directory. Each one is
# stored under its file name without the ".json" suffix.
files_to_load = ['coaches.json', 'competitions.json', 'players.json', 'referees.json', 'teams.json']

for file in files_to_load:
    dataset_name = file.replace('.json', '')
    try:
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default text encoding.
        with open(file, 'r', encoding='utf-8') as f:
            data[dataset_name] = json.load(f)
        print(f"✅ {file}: {len(data[dataset_name])} records")
    except Exception as e:
        # Best effort: a broken file is reported and skipped, the rest still load.
        print(f"❌ Error loading {file}: {e}")

# Load matches from the matches folder (one file per competition), merged
# into a single flat list.
matches = []
match_files = [
    'matches_England.json',
    'matches_European_Championship.json', 
    'matches_France.json',
    'matches_Germany.json',
    'matches_Italy.json',
    'matches_Spain.json',
    'matches_World_Cup.json'
]

for file in match_files:
    try:
        with open(os.path.join(os.getcwd(), 'matches', file), 'r', encoding='utf-8') as f:
            match_data = json.load(f)
            matches.extend(match_data)
        print(f"✅ {file}: {len(match_data)} matches")
    except Exception as e:
        print(f"❌ Error loading {file}: {e}")

data['matches'] = matches
print(f"Total matches: {len(matches)}")

# Load events from zip (just a sample for now)
try:
    with zipfile.ZipFile(os.path.join(os.getcwd(), 'events.zip'), 'r') as zip_file:
        json_members = [member for member in zip_file.namelist() if member.endswith('.json')]
        if not json_members:
            # Fail with a descriptive message rather than an opaque IndexError.
            raise FileNotFoundError("events.zip contains no .json file")

        # Just load one events file as sample
        first_events_file = json_members[0]
        with zip_file.open(first_events_file) as f:
            sample_events = json.load(f)
        print(f"✅ Sample events from {first_events_file}: {len(sample_events)} events")
        data['events'] = sample_events[:1000]  # Just first 1000 for exploration
except Exception as e:
    print(f"❌ Error loading events: {e}")
    data['events'] = []

print(f"\nLoaded datasets: {list(data.keys())}")

/workspaces/masters-thesis/data/dataset_1
Loading data...
✅ coaches.json: 208 records
✅ competitions.json: 7 records
✅ players.json: 3603 records
❌ Error loading referees.json: Expecting ':' delimiter: line 1 column 196710 (char 196709)
✅ teams.json: 142 records
✅ matches_England.json: 380 matches
✅ matches_European_Championship.json: 51 matches
✅ matches_France.json: 380 matches
✅ matches_Germany.json: 306 matches
✅ matches_Italy.json: 380 matches
✅ matches_Spain.json: 380 matches
✅ matches_World_Cup.json: 64 matches
Total matches: 1941
✅ Sample events from events_England.json: 643150 events

Loaded datasets: ['coaches', 'competitions', 'players', 'teams', 'matches', 'events']


In [5]:
# ================================
# BASIC DATA EXPLORATION
# ================================

banner = "=" * 50
print("\n" + banner)
print("BASIC DATA OVERVIEW")
print(banner)

# For every loaded dataset print its size and, when it is non-empty, a
# field-by-field summary of its first record. Nested dicts are summarised by
# their keys and nested lists by their length to keep the output short.
for name, dataset in data.items():
    print(f"\n{name.upper()}: {len(dataset)} records")

    if len(dataset) == 0:
        continue

    print("Sample record:")
    sample = dataset[0]
    for key in sample:
        value = sample[key]
        if isinstance(value, dict):
            print(f"  {key}: {type(value)} with keys {list(value.keys())}")
        elif isinstance(value, list):
            print(f"  {key}: list with {len(value)} items")
        else:
            print(f"  {key}: {value}")


BASIC DATA OVERVIEW

COACHES: 208 records
Sample record:
  wyId: 275580
  shortName: Cesar Domingo
  firstName: César Domingo
  middleName: 
  lastName: Mendiondo López
  birthDate: None
  birthArea: <class 'dict'> with keys ['id', 'alpha2code', 'alpha3code', 'name']
  passportArea: <class 'dict'> with keys ['id', 'alpha2code', 'alpha3code', 'name']
  currentTeamId: 0

COMPETITIONS: 7 records
Sample record:
  name: Italian first division
  wyId: 524
  format: Domestic league
  area: <class 'dict'> with keys ['name', 'id', 'alpha3code', 'alpha2code']
  type: club

PLAYERS: 3603 records
Sample record:
  passportArea: <class 'dict'> with keys ['name', 'id', 'alpha3code', 'alpha2code']
  weight: 78
  firstName: Harun
  middleName: 
  lastName: Tekin
  currentTeamId: 4502
  birthDate: 1989-06-17
  height: 187
  role: <class 'dict'> with keys ['code2', 'code3', 'name']
  birthArea: <class 'dict'> with keys ['name', 'id', 'alpha3code', 'alpha2code']
  wyId: 32777
  foot: right
  shortName: H

In [6]:
# ================================
# EXPLORE RELATIONSHIPS
# ================================

print("\n" + "="*50)
print("EXPLORING RELATIONSHIPS")
print("="*50)

# Collect the `wyId` of every record in each reference table; a table that
# was not loaded contributes an empty set.
team_ids = {team['wyId'] for team in data.get('teams', [])}
player_ids = {player['wyId'] for player in data.get('players', [])}
coach_ids = {coach['wyId'] for coach in data.get('coaches', [])}
referee_ids = {referee['wyId'] for referee in data.get('referees', [])}
competition_ids = {comp['wyId'] for comp in data.get('competitions', [])}

for label, id_set in (
    ("Teams", team_ids),
    ("Players", player_ids),
    ("Coaches", coach_ids),
    ("Referees", referee_ids),
    ("Competitions", competition_ids),
):
    print(f"{label}: {len(id_set)}")

# Check matches data
if data.get('matches'):
    print(f"\nMatches: {len(data['matches'])}")

    # Dump every field of the first match to show the record layout.
    first_match = data['matches'][0]
    print("\nFirst match structure:")
    for key, value in first_match.items():
        print(f"  {key}: {value}")

    # Ids referenced by the matches, to be compared with the reference tables.
    teams_in_matches = set()
    coaches_in_matches = set()
    competitions_in_matches = set()

    for match in data['matches'][:100]:  # Just first 100 for speed
        if 'teamsData' in match:
            # `teamsData` keys are team ids stored as strings.
            for team_id, team_data in match['teamsData'].items():
                teams_in_matches.add(int(team_id))
                if 'coachId' in team_data:
                    coaches_in_matches.add(team_data['coachId'])
        if 'competitionId' in match:
            competitions_in_matches.add(match['competitionId'])

    known_teams = teams_in_matches & team_ids
    known_coaches = coaches_in_matches & coach_ids

    print(f"\nIn first 100 matches:")
    print(f"  Unique teams referenced: {len(teams_in_matches)}")
    print(f"  Teams found in teams.json: {len(known_teams)}")
    print(f"  Unique coaches referenced: {len(coaches_in_matches)}")
    print(f"  Coaches found in coaches.json: {len(known_coaches)}")
    print(f"  Competitions referenced: {len(competitions_in_matches)}")


EXPLORING RELATIONSHIPS
Teams: 142
Players: 3603
Coaches: 208
Referees: 0
Competitions: 7

Matches: 1941

First match structure:
  status: Played
  roundId: 4405654
  gameweek: 38
  teamsData: {'1646': {'scoreET': 0, 'coachId': 8880, 'side': 'home', 'teamId': 1646, 'score': 1, 'scoreP': 0, 'hasFormation': 1, 'formation': {'bench': [{'playerId': 77502, 'ownGoals': '0', 'redCards': '0', 'goals': 'null', 'yellowCards': '0'}, {'playerId': 270828, 'ownGoals': '0', 'redCards': '0', 'goals': 'null', 'yellowCards': '0'}, {'playerId': 9164, 'ownGoals': '0', 'redCards': '0', 'goals': 'null', 'yellowCards': '0'}, {'playerId': 8516, 'ownGoals': '0', 'redCards': '0', 'goals': 'null', 'yellowCards': '0'}, {'playerId': 9179, 'ownGoals': '0', 'redCards': '0', 'goals': 'null', 'yellowCards': '0'}, {'playerId': 532949, 'ownGoals': '0', 'redCards': '0', 'goals': 'null', 'yellowCards': '0'}, {'playerId': 9127, 'ownGoals': '0', 'redCards': '0', 'goals': 'null', 'yellowCards': '0'}], 'lineup': [{'playerId'

In [7]:
# ================================
# SPECIFIC DATA INSIGHTS
# ================================

def _nested_name(record, field):
    """Return ``record[field]['name']``, or ``'Unknown'`` when it is unavailable.

    The fallback also applies when the nested object is missing or null and
    when the name is present but empty, so blank names are never displayed
    or counted as a category of their own.
    """
    return (record.get(field) or {}).get('name') or 'Unknown'


print("\n" + "="*50)
print("DATA INSIGHTS")
print("="*50)

# Team info
if data.get('teams'):
    print(f"\nTEAMS:")
    countries = Counter(_nested_name(team, 'area') for team in data['teams'])
    print(f"  Top countries: {dict(countries.most_common(5))}")
    
    team_types = Counter(team.get('type', 'Unknown') for team in data['teams'])
    print(f"  Team types: {dict(team_types)}")

# Player info  
if data.get('players'):
    print(f"\nPLAYERS:")
    positions = Counter(_nested_name(player, 'role') for player in data['players'])
    print(f"  Positions: {dict(positions)}")
    
    nationalities = Counter(_nested_name(player, 'passportArea') for player in data['players'])
    print(f"  Top nationalities: {dict(nationalities.most_common(5))}")

# Competition info
if data.get('competitions'):
    print(f"\nCOMPETITIONS:")
    for comp in data['competitions']:
        # Competitions whose area has an empty name are shown as "Unknown"
        # rather than with empty parentheses.
        print(f"  {comp['name']} ({_nested_name(comp, 'area')})")

# Match info
if data.get('matches'):
    print(f"\nMATCHES:")
    match_competitions = Counter(match.get('competitionId', 'Unknown') for match in data['matches'])
    print(f"  Matches per competition: {dict(match_competitions.most_common())}")
    
    match_statuses = Counter(match.get('status', 'Unknown') for match in data['matches'])
    print(f"  Match statuses: {dict(match_statuses)}")

# Events info (only the sample kept in data['events'], not the full event set)
if data.get('events'):
    print(f"\nEVENTS (sample):")
    event_types = Counter(event.get('eventName', 'Unknown') for event in data['events'])
    print(f"  Event types: {dict(event_types.most_common(10))}")
    
    periods = Counter(event.get('matchPeriod', 'Unknown') for event in data['events'])
    print(f"  Match periods: {dict(periods)}")


DATA INSIGHTS

TEAMS:
  Top countries: {'Spain': 21, 'Italy': 21, 'England': 20, 'France': 20, 'Germany': 19}
  Team types: {'club': 98, 'national': 44}

PLAYERS:
  Positions: {'Goalkeeper': 426, 'Defender': 1200, 'Midfielder': 1257, 'Forward': 720}
  Top nationalities: {'Spain': 473, 'Italy': 450, 'France': 307, 'Germany': 245, 'England': 171}

COMPETITIONS:
  Italian first division (Italy)
  English first division (England)
  Spanish first division (Spain)
  French first division (France)
  German first division (Germany)
  European Championship ()
  World Cup ()

MATCHES:
  Matches per competition: {364: 380, 412: 380, 524: 380, 795: 380, 426: 306, 28: 64, 102: 51}
  Match statuses: {'Played': 1941}

EVENTS (sample):
  Event types: {'Pass': 513, 'Duel': 270, 'Others on the ball': 71, 'Free Kick': 61, 'Interruption': 44, 'Shot': 16, 'Foul': 13, 'Save attempt': 6, 'Offside': 5, 'Goalkeeper leaving line': 1}
  Match periods: {'1H': 901, '2H': 99}


In [8]:
# ================================
# SIMPLE QUERIES
# ================================

print("\n" + "="*50)
print("EXAMPLE QUERIES")
print("="*50)

# Query 1: teams whose name contains "manchester" (case-insensitive).
print("\nFinding teams with 'Manchester' in name:")
if data.get('teams'):
    manchester_teams = []
    for team in data['teams']:
        if 'manchester' in team['name'].lower():
            manchester_teams.append(team)
    for team in manchester_teams:
        print(f"  {team['name']} (ID: {team['wyId']})")

# Query 2: first five players whose passport area is Brazil.
print("\nPlayers from Brazil (first 5):")
if data.get('players'):
    all_brazilians = [
        player for player in data['players']
        if player.get('passportArea', {}).get('name') == 'Brazil'
    ]
    brazilian_players = all_brazilians[:5]
    for player in brazilian_players:
        print(f"  {player['shortName']} - {player.get('role', {}).get('name', 'Unknown')}")

# Query 3: headline fields of the first match plus each side's score,
# resolving team ids to names through the teams table when it is loaded.
print("\nExample match with team details:")
if data.get('matches'):
    sample_match = data['matches'][0]
    for caption, field in (("Match", 'label'), ("Date", 'date'), ("Venue", 'venue')):
        print(f"{caption}: {sample_match.get(field, 'Unknown')}")

    if 'teamsData' in sample_match:
        for team_id, team_data in sample_match['teamsData'].items():
            team_name = "Unknown"
            if data.get('teams'):
                # `teamsData` keys are strings; the teams table uses integer ids.
                team_info = next((t for t in data['teams'] if t['wyId'] == int(team_id)), None)
                if team_info:
                    team_name = team_info['name']

            side = team_data.get('side', 'unknown')
            score = team_data.get('score', 0)
            print(f"  {team_name} ({side}): {score}")

print("\n✅ Data exploration complete!")
print("\nYou can now access the data using:")
for hint in (
    "  data['teams'] - team information",
    "  data['players'] - player information",
    "  data['matches'] - match data",
    "  data['events'] - sample events",
    "  etc.",
):
    print(hint)


EXAMPLE QUERIES

Finding teams with 'Manchester' in name:
  Manchester City (ID: 1625)
  Manchester United (ID: 1611)

Players from Brazil (first 5):
  Marcelo - Defender
  Andr\u00e9 Ramalho - Defender
  Ot\u00e1vio - Midfielder
  Bernardo - Defender
  Jemerson - Defender

Example match with team details:
Match: Burnley - AFC Bournemouth, 1 - 2
Date: May 13, 2018 at 4:00:00 PM GMT+2
Venue: Turf Moor
  Burnley (home): 1
  AFC Bournemouth (away): 2

✅ Data exploration complete!

You can now access the data using:
  data['teams'] - team information
  data['players'] - player information
  data['matches'] - match data
  data['events'] - sample events
  etc.
