In [None]:
# Fetch the StatsBomb open-data repository into ./open-data; the later cells
# read their match and event JSON files from that directory.
!git clone https://github.com/statsbomb/open-data.git

In [None]:
import json
import os
import altair as alt
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize

In [None]:
# Load the match list into a DataFrame with one row per match.
# (Presumably the path encodes competition 72 / season 30 -- confirm against
# the open-data repository layout.)
# The file is opened explicitly as UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open('open-data/data/matches/72/30.json', 'r', encoding='utf-8') as fo:
  matches = json_normalize(json.load(fo), meta='match_id')
# There should be 52 World Cup matches
assert len(matches) == 52, 'Did not load all matches! Only loaded {} matches.'.format(len(matches))

In [None]:
# Read the event file of every loaded match and stack them into one frame.
match_ids = matches['match_id'].tolist()

event_frames = []
for match_id in match_ids:
  events_path = 'open-data/data/events/{}.json'.format(match_id)
  # `with` closes the file even if reading or parsing raises; UTF-8 is given
  # explicitly so decoding does not depend on the platform locale.
  with open(events_path, 'r', encoding='utf-8') as fo:
    events = json.load(fo)
  # sep='_' joins nested keys into flat column names (e.g. 'duel_outcome_name').
  event_frames.append(json_normalize(events, meta='type', sep='_'))

# Concatenate once at the end: growing `df` with pd.concat inside the loop
# re-copies every previously loaded row on each iteration.
# Each file keeps its own row index, so index labels repeat across matches.
df = pd.concat(event_frames) if event_frames else pd.DataFrame()

In [None]:
# team_id value used to select the USA's rows from the event frame.
USA = 1214

# Duel outcomes that are treated as a duel won.
WON_DUEL_OUTCOMES = ['Success In Play', 'Won', 'Success Out']

is_won_duel = df['duel_outcome_name'].isin(WON_DUEL_OUTCOMES)
is_usa = df['team_id'] == USA

# .copy() makes usa_df an independent frame, so the column added below is
# written to usa_df itself and cannot trigger SettingWithCopyWarning.
usa_df = df.loc[is_won_duel & is_usa, ['player_name', 'location']].copy()

# Last whitespace-separated token of the full name. The .str accessor leaves
# missing names as NaN instead of raising.
usa_df['player_last_name'] = usa_df['player_name'].str.split().str[-1]

print(usa_df.shape)
print(usa_df.head())
print()

# Number of won duels per last name, used as the data behind the clickable legend.
# NOTE(review): players sharing a last name would be merged into one row --
# confirm this cannot happen for the selected team.
usa_legend_df = (
    usa_df.groupby('player_last_name')['player_name']
        .count()
        .reset_index(name='count')
)

print(usa_legend_df.shape)
print(usa_legend_df.head())

(87, 3)
              player_name      location player_last_name
171          Rose Lavelle  [81.6, 22.5]          Lavelle
269  Crystal Alyssia Dunn    [5.5, 2.3]             Dunn
628        Samantha Mewis   [58.0, 4.5]            Mewis
896           Tobin Heath  [77.6, 62.6]            Heath
948  Crystal Alyssia Dunn   [79.3, 3.4]             Dunn

(17, 2)
  player_last_name  count
0            Brian      1
1       Dahlkemper      3
2         Davidson      2
3             Dunn     14
4             Ertz      8


In [None]:
# Quantitative axes with fixed domains: x spans 0-120 and y spans 0-80. The
# coordinates come from the first and second element of each row's `location`.
# NOTE(review): StatsBomb's data spec appears to place the pitch origin at the
# top-left (y growing downwards); confirm whether the y axis should be reversed.
x_vals = alt.X('location[0]:Q', title=None, scale=alt.Scale(domain=[0, 120]))
y_vals = alt.Y('location[1]:Q', title=None, scale=alt.Scale(domain=[0, 80]))

# Multi-selection keyed on last name: selected marks are coloured per player,
# everything else gets '#FFFFFF00' (white with a zero alpha channel).
selector = alt.selection_multi(fields=['player_last_name'])
selected_color = alt.Color('player_last_name:N', legend=None)
unselected_color = alt.value('#FFFFFF00')
color = alt.condition(selector, selected_color, unselected_color)

# One point per won duel, positioned by its location.
pitch = (
    alt.Chart(usa_df)
        .mark_point()
        .encode(x=x_vals, y=y_vals, color=color)
        .properties(width=600, height=400, description='USA duels won')
)

# Same data and encodings as `pitch`, drawn as last-name text offset 7px to
# the right of each point.
lbls = (
    pitch
        .mark_text(align='left', baseline='middle', dx=7)
        .encode(text='player_last_name')
)

# Hand-built legend: one point per player, ordered by descending duel count.
# It carries the selection, so clicking a point drives the colour condition
# shared with the pitch layers.
legend_y = alt.Y(
    'player_last_name:N',
    title='Click circle next to player name to highlight on chart',
    axis=alt.Axis(orient='right'),
    sort=alt.EncodingSortField(field='count', order='descending'),
)
legend = (
    alt.Chart(usa_legend_df)
        .mark_point()
        .encode(y=legend_y, color=color)
        .add_selection(selector)
)

# `+` binds tighter than `|`: layer points and labels, then place the legend
# to the right of the layered chart.
(pitch + lbls) | legend