In [3]:
from statsbombpy import sb
import pandas as pd
import warnings
from statsbombpy.api_client import NoAuthWarning
import plotly.io as pio
# Suppress statsbombpy's NoAuthWarning for the rest of the session.
warnings.filterwarnings("ignore", category=NoAuthWarning)

# Default Plotly renderer; 'browser' or 'jupyterlab' are alternatives,
# depending on where the notebook is being run.
pio.renderers.default = 'notebook'

In [4]:
# Competition and season to analyse.
# Change SEASON_NAME to '2018' to look at the previous tournament instead.
COMPETITION_NAME = 'FIFA World Cup'
SEASON_NAME = '2022'

# Load every competition/season combination returned by statsbombpy
comps = sb.competitions()

# Keep only the row(s) matching the requested competition and season
world_cup = comps[
    (comps['competition_name'] == COMPETITION_NAME)
    & (comps['season_name'] == SEASON_NAME)
]

# Fail early with a readable message; otherwise .iloc[0] below would raise
# a bare IndexError when the filter matches nothing (e.g. a mistyped season).
if world_cup.empty:
    raise ValueError(
        f"No competition found for {COMPETITION_NAME!r}, season {SEASON_NAME!r}"
    )

# IDs of the first matching row; used to request the match list
selected = world_cup.iloc[0]
comp_id = selected['competition_id']
season_id = selected['season_id']

world_cup


Unnamed: 0,competition_id,season_id,country_name,competition_name,competition_gender,competition_youth,competition_international,season_name,match_updated,match_updated_360,match_available_360,match_available
29,43,106,International,FIFA World Cup,male,False,True,2022,2024-12-16T10:15:11.055845,2024-12-16T10:21:13.710934,2024-12-16T10:21:13.710934,2024-12-16T10:15:11.055845


In [5]:
# Fetch the match list for the selected competition and season
matches = sb.matches(
    competition_id=comp_id,
    season_id=season_id,
)

# To restrict the analysis to one team or match, pick a single match_id, e.g.:
# match_id = matches[matches['home_team'] == 'Brazil'].iloc[0]['match_id']

# Every match ID as a plain Python list
match_ids = matches['match_id'].to_list()

In [6]:
# Event columns kept for every shot
SHOT_COLUMNS = ['player', 'team', 'shot_statsbomb_xg', 'shot_outcome', 'minute']


def load_match_shots(match_id):
    """Return the shot events of one match, tagged with its match_id.

    Downloads the match's events via ``sb.events``, keeps the rows whose
    ``type`` is ``'Shot'`` and the columns listed in ``SHOT_COLUMNS``, and
    adds a ``match_id`` column.
    """
    events = sb.events(match_id=match_id)
    # A single .loc selection followed by .assign builds a new frame, so no
    # value is written into a slice of `events` (the original assignment on a
    # chained-indexing result triggers SettingWithCopyWarning on pandas < 3).
    # NOTE(review): if penalty-shootout kicks are recorded as 'Shot' events,
    # they are included here and will count towards shots/goals downstream —
    # confirm whether they should be filtered out.
    return (
        events
        .loc[events['type'] == 'Shot', SHOT_COLUMNS]
        .assign(match_id=match_id)
    )


# One DataFrame of shots per match (one sb.events call per match)
all_shots = [load_match_shots(match_id) for match_id in match_ids]

# Combine all shots into one DataFrame
shot_df = pd.concat(all_shots, ignore_index=True)

shot_df.head()

Unnamed: 0,player,team,shot_statsbomb_xg,shot_outcome,minute,match_id
0,Granit Xhaka,Switzerland,0.036566,Blocked,0,3857256
1,Breel-Donald Embolo,Switzerland,0.353289,Saved,0,3857256
2,Granit Xhaka,Switzerland,0.069527,Saved,0,3857256
3,Nikola Milenković,Serbia,0.081609,Off T,4,3857256
4,Andrija Živković,Serbia,0.030002,Post,10,3857256


In [7]:
# Shorter column names for the analysis.
# (Re-running this cell is safe: rename ignores keys that no longer exist.)
shot_df = shot_df.rename(columns={
    'shot_statsbomb_xg': 'xG',
    'shot_outcome': 'outcome'
})

# Goal indicator: 1 when the shot outcome is 'Goal', 0 otherwise.
# Vectorised comparison instead of a per-row Python lambda.
shot_df['is_goal'] = shot_df['outcome'].eq('Goal').astype('int64')

# Aggregate per player.
# NOTE(review): grouping is by player name only, so two different players
# sharing the same name would be merged — confirm names are unique.
player_summary = shot_df.groupby(['player'], as_index=False).agg(
    avg_xG = ('xG','mean'),              # mean xG per shot
    goals = ('is_goal','sum'),           # number of shots with outcome 'Goal'
    matches_played = ('match_id','nunique'),  # matches in which the player took a shot
    num_shots = ('xG','count')           # shots with a non-null xG value
)

# Total expected goals: mean xG per shot times number of shots
# (i.e. the sum of xG over the player's shots)
player_summary['expected_goals'] = player_summary['avg_xG'] * player_summary['num_shots']

# Finishing differential (goals minus xG) and finishing ratio (goals over xG).
# The ratio is inf/NaN if a player's expected_goals is 0.
player_summary['finishing_diff'] = player_summary['goals'] - player_summary['expected_goals']
player_summary['finishing_ratio'] = player_summary['goals'] / player_summary['expected_goals']

player_summary.head()


Unnamed: 0,player,avg_xG,goals,matches_played,num_shots,expected_goals,finishing_diff,finishing_ratio
0,Aaron Mooy,0.039191,0,1,1,0.039191,-0.039191,0.0
1,Aaron Ramsey,0.025361,0,1,1,0.025361,-0.025361,0.0
2,Abdelhamid Sabiri,0.226721,1,2,4,0.906884,0.093116,1.102677
3,Abdelkarim Hassan Al Haj Fadlalla,0.045819,0,3,6,0.274915,-0.274915,0.0
4,Abderrazak Hamdallah,0.271498,0,2,2,0.542997,-0.542997,0.0


In [37]:
import plotly.express as px

# Ten players with the largest finishing differential (goals - expected goals)
top = player_summary.sort_values('finishing_diff', ascending=False).head(10)

# Interactive horizontal bar chart, one bar (and colour) per player
fig = px.bar(
    top,
    x='finishing_diff',
    y='player',
    orientation='h',
    color='player',
    hover_data={
        'goals': True,
        # Single entry for avg_xG: the original dict listed this key twice,
        # so the first value was silently overwritten by the format string.
        'avg_xG': ':.2f',
    },
    labels={
        'finishing_diff': 'Finishing Differential',
        # Key must match the actual column name ('player'); the previous
        # 'player_name' key matched no column and therefore had no effect.
        'player': 'Player'
    },
    title='Top Overperformers in 2022 World Cup'
)

# Largest differential at the top; centre the title and name the legend
fig.update_layout(yaxis={'categoryorder':'total ascending','title':'Player'},title_x=0.5,legend_title_text='Player')

# Set custom x-axis ticks: one tick every 0.5, anchored at 0
fig.update_xaxes(
    tick0=0,
    dtick=0.5
)

fig.show()


In [38]:
# Save the current `fig` as an HTML file in the working directory
fig.write_html(file='Top Overperformers in World Cup.html')

In [35]:
import plotly.graph_objects as go

# Ten players with the highest average xG per shot
top_xg = player_summary.sort_values(by='avg_xG', ascending=False).iloc[:10]

# Options for the horizontal bar chart (one coloured bar per player)
bar_options = {
    'x': 'avg_xG',
    'y': 'player',
    'orientation': 'h',
    'color': 'player',
    'hover_data': ['goals', 'expected_goals', 'num_shots'],
    'labels': {'avg_xG': 'Average xG', 'player': 'Player'},
    'title': 'Top Players by Shot Quality Opportunity',
}
fig = px.bar(top_xg, **bar_options)

# Order the bars so the highest average xG is on top, and centre the title
fig.update_layout(
    yaxis={'categoryorder': 'total ascending'},
    title_x=0.5,
)
fig.show()

In [19]:
# Keep only players with at least 5 shots (independent copy of the rows)
filtered = player_summary.loc[player_summary['num_shots'] >= 5].copy()

# Columns shown in the hover tooltip
goals_hover_columns = ('expected_goals', 'finishing_diff', 'goals', 'num_shots', 'avg_xG')

# Goals against expected goals, marker size proportional to shot count
fig1 = px.scatter(
    filtered,
    x='expected_goals',
    y='goals',
    size='num_shots',
    hover_name='player',
    hover_data={column: True for column in goals_hover_columns},
    labels={'expected_goals': 'Expected Goals', 'goals': 'Actual Goals Scored'},
    title='Goals vs Expected Goals',
    template='simple_white',
)

# Upper end of the diagonal: the larger of the two axis maxima, plus a margin
largest_xg = filtered['expected_goals'].max()
most_goals = filtered['goals'].max()
max_val = max(largest_xg, most_goals) + 0.5

# Dashed y = x reference line from the origin to (max_val, max_val)
fig1.add_shape(
    type='line',
    xref='x',
    yref='y',
    x0=0,
    y0=0,
    x1=max_val,
    y1=max_val,
    line={'color': 'black', 'dash': 'dash', 'width': 5},
)

# Green markers with a thin black outline
fig1.update_traces(
    marker={'color': 'mediumseagreen', 'line': {'width': 1, 'color': 'black'}}
)

# Figure size, title position and light-gray grid lines on both axes
fig1.update_layout(
    height=650,
    width=900,
    title_x=0.7,
    xaxis={'showgrid': True, 'gridcolor': 'lightgray', 'gridwidth': 1},
    yaxis={'showgrid': True, 'gridcolor': 'lightgray', 'gridwidth': 1},
)
fig1.show()

In [20]:
# Columns shown in the hover tooltip
ratio_hover_columns = ('expected_goals', 'finishing_ratio', 'goals', 'num_shots')

# Finishing ratio against expected goals, marker size proportional to shot count
fig2 = px.scatter(
    filtered,
    x='expected_goals',
    y='finishing_ratio',
    size='num_shots',
    hover_name='player',
    hover_data={column: True for column in ratio_hover_columns},
    labels={
        'expected_goals': 'Expected Goals',
        'finishing_ratio': 'Finishing Ratio (Goals / xG)',
    },
    title='Finishing Ratio vs Expected Goals',
    template='simple_white',
)

# Dashed reference line at finishing_ratio = 1.0 (goals equal expected goals)
fig2.add_hline(y=1.0, line_dash='dash', line_color='black', line_width=3)

# Centred title, figure size, axis titles and light-gray grid lines
fig2.update_layout(
    title={'text': 'Finishing Ratio vs Expected Goals', 'x': 0.5},
    width=800,
    height=650,
    xaxis_title='Expected Goals',
    yaxis_title='Finishing Ratio (Goals / Expected Goals)',
    xaxis={'showgrid': True, 'gridcolor': 'lightgray'},
    yaxis={'showgrid': True, 'gridcolor': 'lightgray'},
)

# Red markers with a thin black outline
fig2.update_traces(
    marker={'color': 'red', 'line': {'width': 1, 'color': 'black'}}
)
fig2.show()