# Importing libraries

In [1]:
import pandas as pd
import numpy as np
from common_fun import *

# Read parquet file

In [2]:
file_path = "../data/eng_1516_events.parquet"

In [3]:
df = pd.read_parquet(file_path)

### Saving column names

In [4]:
column_list = df.columns.tolist()

### Utility functions

In [5]:
players_df = df[["player_id", "player"]].drop_duplicates().dropna()

In [6]:
def head_col(col_name: str, df: pd.DataFrame) -> pd.DataFrame:
    return df[~df[col_name].isna()].dropna(axis=1)

# Task 1

## Which player attempted the most shots?

In [10]:
player_shot_df = (
    df[df["type"] == "Shot"]
    .groupby(by=["player_id", "player"])
    .size()
    .reset_index(name="shot_count")
    .sort_values(by="shot_count", ascending=False, ignore_index=True)
)

In [11]:
player_shot_df.head()

Unnamed: 0,player_id,player,shot_count
0,10955.0,Harry Kane,158
1,3237.0,Sergio Leonel Agüero del Castillo,118
2,10960.0,Jamie Vardy,118
3,3289.0,Romelu Lukaku Menama,116
4,5458.0,Odion Jude Ighalo,111


## Which team created the most expected goals?

In [12]:
team_xg_df = (
    df.groupby(by=["team_id", "team"])
    .agg({"shot_statsbomb_xg": "sum"})
    .reset_index()
    .sort_values(by="shot_statsbomb_xg", ascending=False)
)

In [13]:
team_xg_df.head()

Unnamed: 0,team_id,team,shot_statsbomb_xg
1,22,Leicester City,66.282171
0,1,Arsenal,65.136126
12,36,Manchester City,64.555055
14,38,Tottenham Hotspur,62.356203
3,24,Liverpool,58.373657


In [14]:
most_xg_team = int(team_xg_df.iloc[0].team_id)

## How many different players were awarded a yellow card?

In [15]:
yellow_count = df[
    (df["foul_committed_card"] == "Yellow Card") | 
    (df["bad_behaviour_card"] == "Yellow Card")]["player_id"].nunique()
yellow_count

365

## Which Liverpool player assisted the most shots with their right foot?

In [16]:
chosen_team = df[df["team"] == "Liverpool"]["team_id"].unique()[0]
chosen_team

np.int64(24)

I took into consideration both the goals and the shots that did not convert to goals.

In [17]:
cb_assist_p_df = (
    df[
        ((df["pass_shot_assist"]) | (df["pass_goal_assist"]))
        & (df["pass_body_part"] == "Right Foot")
        & (df["team_id"] == chosen_team)                        # Chosen team id
    ]
    .groupby(by=["player_id", "player"])
    .size()
    .reset_index(name="shot_assist_count")
    .sort_values(by="shot_assist_count", ascending=False)
)

In [18]:
cb_assist_p_df.head()

Unnamed: 0,player_id,player,shot_assist_count
2,3473.0,James Philip Milner,42
6,3501.0,Philippe Coutinho Correia,41
10,3535.0,Roberto Firmino Barbosa de Oliveira,31
13,4090.0,Adam David Lallana,27
15,4590.0,Nathaniel Edwin Clyne,25


## Which Liverpool player applied the most counterpressures?

In [19]:
counterpress_player_df = (
    df[(df["counterpress"]) & (df["team_id"] == chosen_team)]
    .groupby(by=["player_id", "player"])
    .size()
    .reset_index(name="counterpressure_count")
    .sort_values(by="counterpressure_count", ascending=False)
)

In [20]:
counterpress_player_df.head()

Unnamed: 0,player_id,player,counterpressure_count
18,7780.0,Lucas Pezzini Leiva,257
2,3473.0,James Philip Milner,252
4,3493.0,Emre Can,248
10,3535.0,Roberto Firmino Barbosa de Oliveira,240
13,4090.0,Adam David Lallana,211


# Task 2

## Read players positions

In [7]:
player_positions = pd.read_csv('../data/player_positions.csv')

In [13]:
def get_position_metric(
        position: str, player_positions: pd.DataFrame, metric_df: pd.DataFrame
) -> pd.DataFrame:
    chosen_pos_players = player_positions[player_positions['position_group_name'] == position]
    metric_df_copy = metric_df.copy()
    metric_df_copy = metric_df_copy.drop(columns='player')
    metric_df_copy = pd.merge(
        metric_df_copy,
        chosen_pos_players,
        on='player_id'
    )
    return metric_df_copy

## Aerial Wins

In [9]:
def get_aerial_wins(df: pd.DataFrame) -> pd.DataFrame:
    """Extract the number of aerial wins for player.

    Parameters
    ----------
    df : pd.DataFrame
        Event data

    Returns
    -------
    pd.DataFrame
        DataFrame with player_id, aerial_win_count as columns
    """
    return (
        df[
            (df["clearance_aerial_won"])
            | (df["miscontrol_aerial_won"])
            | (df["pass_aerial_won"])
            | (df["shot_aerial_won"])
        ]
        .groupby(by=["player_id","player"])
        .size()
        .reset_index(name="aerial_win_count")
        .sort_values(by="aerial_win_count", ascending=False, ignore_index=True)
    )

In [10]:
aerial_win_players = get_aerial_wins(df=df)

### Best 5 midfielders according to Aerial Wins

In [14]:
mid_aerial_win = get_position_metric(
    position='Midfielder',
    player_positions=player_positions,
    metric_df=aerial_win_players
)
mid_aerial_win.head(5)

Unnamed: 0,player_id,aerial_win_count,player,position_group_name
0,5489.0,121,Mile Jedinak,Midfielder
1,3684.0,111,Cheikhou Kouyaté,Midfielder
2,10956.0,106,Eric Dier,Midfielder
3,3098.0,87,Victor Wanyama,Midfielder
4,3462.0,85,Jack Frank Porteous Cork,Midfielder


## Aerial Win Percentage

In [16]:
def get_aerial_wins_perc(df: pd.DataFrame) -> pd.DataFrame:
    """First, extract the Aerial Win count with the method 'get_aerial_wins'.
    Then, extract the number of lost aerial duels using the feature 'duel_type'
    with value "Aerial Lost" (id = 10); I did not use the feature 'is_aerial_won' = False
    because it is set to false also for all the events, even those which are not duels.
    Eventually, compute the percentage.

    Parameters
    ----------
    df : pd.DataFrame
        Event data

    Returns
    -------
    pd.DataFrame
        DataFrame with player_id, aerial_win_percentage
    """
    # Get the Aerial wins for each player with the get_aerial_wins method
    aerial_wins_df = get_aerial_wins(df=df)
    # Get only the aerial lost events
    aerial_lost_df = (
        df[
            (df["duel_type"] == "Aerial Lost")  # Duel of type "Aerial Lost"
        ]
        .groupby(by="player_id")
        .size()
        .reset_index(name="aerial_lost_count")
    )
    # Join the two dataframes with aerial wins and lost counts
    aerial_w_l_df = pd.merge(
        left=aerial_wins_df, right=aerial_lost_df, how="inner", on="player_id"
    )
    # Compute the percentage
    aerial_w_l_df["aerial_total_count"] = (
        aerial_w_l_df["aerial_win_count"] + aerial_w_l_df["aerial_lost_count"]
    )
    aerial_w_l_df["aerial_win_percentage"] = (
        aerial_w_l_df["aerial_win_count"] / aerial_w_l_df["aerial_total_count"]
    )

    aerial_w_l_df = aerial_w_l_df.drop(
        columns=["aerial_win_count", "aerial_lost_count", "aerial_total_count"]
    )

    return aerial_w_l_df.sort_values(
        by="aerial_win_percentage", ascending=False, ignore_index=True
    )

In [17]:
aerial_win_perc_players = get_aerial_wins_perc(df=df)

In [19]:
mid_aerial_win_perc = get_position_metric(
    position='Midfielder',
    player_positions=player_positions,
    metric_df=aerial_win_perc_players
)
mid_aerial_win_perc.head()

Unnamed: 0,player_id,aerial_win_percentage,player,position_group_name
0,20067.0,0.777778,Darron Gibson,Midfielder
1,20951.0,0.75,Jordan Lyden,Midfielder
2,42703.0,0.714286,Mikel Arteta Amatriain,Midfielder
3,3636.0,0.7,Charlie Adam,Midfielder
4,4804.0,0.7,Bradley Johnson,Midfielder


## Long Balls

Make sure that the completed passes have the outcome feature equal to null.

In [20]:
def get_long_balls(df: pd.DataFrame) -> pd.DataFrame:
    """Extract the completed passes with length more than 35 yards.
    The completed passes are those with the outcome equal to null, as specified on
    https://statsbomb.com/wp-content/uploads/2022/08/Working-with-R.pdf.

    Parameters
    ----------
    df : pd.DataFrame
        Event data

    Returns
    -------
    pd.DataFrame
        DataFrame with player_id, long_balls
    """
    return (
        df[
            (df["type"] == "Pass")  # Pass id
            & (df["pass_length"] >= 35)
            & (df["pass_outcome"].isna())
        ]
        .groupby(by=["player_id","player"])
        .size()
        .reset_index(name="long_balls")
        .sort_values(by="long_balls", ascending=False, ignore_index=True)
    )

In [21]:
long_balls_players = get_long_balls(df=df)

In [22]:
mid_long_balls_players = get_position_metric(
    position='Midfielder',
    player_positions=player_positions,
    metric_df=long_balls_players
)
mid_long_balls_players.head()

Unnamed: 0,player_id,long_balls,player,position_group_name
0,3478.0,250,Francesc Fàbregas i Soler,Midfielder
1,3344.0,243,Andrew Surman,Midfielder
2,3057.0,203,Jonjo Shelvey,Midfielder
3,3633.0,181,Gareth Barry,Midfielder
4,3583.0,178,Ashley Westwood,Midfielder


## Final Pass

*Final Pass* is defined as the ability to create goal-scoring opportunities for teammates by passing the ball.

I used the following metrics to assess the players' Final Pass capabilities:
- Assists.
- xG Assisted.
- Key passes.
- Key passes under pressure.
- Progressive passes.



### Assist

In [23]:
def get_assist(df: pd.DataFrame) -> pd.DataFrame:
    """Get the number of goals assisted.

    Parameters
    ----------
    df : pd.DataFrame
        Event data

    Returns
    -------
    pd.DataFrame
        DataFrame with player_id, assists
    """
    return (
        df[df["pass_goal_assist"]==True]
        .groupby(by=["player_id", "player"])
        .size()
        .reset_index(name="assists")
        .sort_values(by="assists", ascending=False, ignore_index=False)
    )

In [24]:
assist_players = get_assist(df=df)

In [25]:
mid_assist_players = get_position_metric(
    position='Midfielder',
    player_positions=player_positions,
    metric_df=assist_players
)
mid_assist_players.head(10)

Unnamed: 0,player_id,assists,player,position_group_name
0,3496.0,19,Mesut Özil,Midfielder
1,3473.0,11,James Philip Milner,Midfielder
2,3064.0,10,David Josué Jiménez Silva,Midfielder
3,3814.0,10,Riyad Mahrez,Midfielder
4,3089.0,9,Kevin De Bruyne,Midfielder
5,4275.0,8,Ross Barkley,Midfielder
6,3094.0,8,Bamidele Alli,Midfielder
7,3478.0,7,Francesc Fàbregas i Soler,Midfielder
8,3307.0,7,Marc Albrighton,Midfielder
9,3535.0,7,Roberto Firmino Barbosa de Oliveira,Midfielder


### xG assisted

In [26]:
def get_xga(df: pd.DataFrame) -> pd.DataFrame:
    """Compute the xG of the shots assisted by each player.
    Use the 'pass_assisted_shot_id' feature to have the corresponding shot for each assist.

    Parameters
    ----------
    df : pd.DataFrame
        Event data

    Returns
    -------
    pd.DataFrame
        DataFrame with player_id, xg_assist
    """
    support_df = pd.merge(
        left=df,
        right=df,
        left_on="pass_assisted_shot_id",
        right_on="id",
        how="inner",
        suffixes=["_assist", "_shot"],
    )

    return (
        support_df.groupby(by=["player_id_assist", "player_assist"])
        .agg({"shot_statsbomb_xg_shot": "sum"})
        .reset_index()
        .rename(
            columns={
                "player_id_assist": "player_id",
                "player_assist": "player",
                "shot_statsbomb_xg_shot": "xg_assist",
            }
        )
        .sort_values(by="xg_assist", ascending=False, ignore_index=True)
    )

In [27]:
xga_players = get_xga(df=df)

In [28]:
mid_xga_players = get_position_metric(
    position='Midfielder',
    player_positions=player_positions,
    metric_df=xga_players
)
mid_xga_players.head(10)

Unnamed: 0,player_id,xg_assist,player,position_group_name
0,3496.0,13.824563,Mesut Özil,Midfielder
1,3814.0,9.525558,Riyad Mahrez,Midfielder
2,3089.0,9.401009,Kevin De Bruyne,Midfielder
3,3478.0,8.067358,Francesc Fàbregas i Soler,Midfielder
4,3307.0,7.513881,Marc Albrighton,Midfielder
5,3535.0,6.706736,Roberto Firmino Barbosa de Oliveira,Midfielder
6,4275.0,6.521117,Ross Barkley,Midfielder
7,3094.0,5.992854,Bamidele Alli,Midfielder
8,4090.0,5.795495,Adam David Lallana,Midfielder
9,3091.0,5.639088,Moussa Sissoko,Midfielder


### Key Passes

In [29]:
def get_key_passes(df: pd.DataFrame) -> pd.DataFrame:
    """Extract the passes that lead to a shot or a goal, called key passes.

    Parameters
    ----------
    df : pd.DataFrame
        Event data dataframe

    Returns
    -------
    pd.DataFrame
        Dataframe with player_id and the number of key passes
    """
    return (
        df[
            (df["pass_goal_assist"]) | (df["pass_shot_assist"])
        ]
        .groupby(by=["player_id", "player"])
        .size()
        .reset_index(name="key_passes")
        .sort_values(by="key_passes", ascending=False, ignore_index=True)
    )

In [30]:
key_passes_players = get_key_passes(df=df)

In [31]:
mid_key_passes_players = get_position_metric(
    position='Midfielder',
    player_positions=player_positions,
    metric_df=key_passes_players
)
mid_key_passes_players.head(10)

Unnamed: 0,player_id,key_passes,player,position_group_name
0,3496.0,19,Mesut Özil,Midfielder
1,3473.0,11,James Philip Milner,Midfielder
2,3064.0,10,David Josué Jiménez Silva,Midfielder
3,3814.0,10,Riyad Mahrez,Midfielder
4,3089.0,9,Kevin De Bruyne,Midfielder
5,4275.0,8,Ross Barkley,Midfielder
6,3094.0,8,Bamidele Alli,Midfielder
7,3478.0,7,Francesc Fàbregas i Soler,Midfielder
8,3307.0,7,Marc Albrighton,Midfielder
9,3535.0,7,Roberto Firmino Barbosa de Oliveira,Midfielder


### Under pressure key passes

In [33]:
def get_under_pressure_key_passes(df: pd.DataFrame) -> pd.DataFrame:
    """Extract the passes that lead to a shot or a goal, called key passes,
    considering only those played under pressure.

    Parameters
    ----------
    df : pd.DataFrame
        Event data dataframe

    Returns
    -------
    pd.DataFrame
        Dataframe with player_id and the number of key passes under pressure
        
    """
    return (
        df[
            ((df["pass_goal_assist"]) | (df["pass_shot_assist"]))
            & (df["under_pressure"] == True)
        ]
        .groupby(by=["player_id", "player"])
        .size()
        .reset_index(name="under_pressure_key_passes")
        .sort_values(by="under_pressure_key_passes", ascending=False, ignore_index=True)
    )

In [34]:
under_pressure_key_passes_players = get_under_pressure_key_passes(df=df)

In [35]:
mid_under_pressure_key_passes_players = get_position_metric(
    position='Midfielder',
    player_positions=player_positions,
    metric_df=under_pressure_key_passes_players
)
mid_under_pressure_key_passes_players.head(10)

Unnamed: 0,player_id,under_pressure_key_passes,player,position_group_name
0,3094.0,4,Bamidele Alli,Midfielder
1,4090.0,3,Adam David Lallana,Midfielder
2,4275.0,2,Ross Barkley,Midfielder
3,3567.0,2,Georginio Wijnaldum,Midfielder
4,3473.0,2,James Philip Milner,Midfielder
5,40122.0,2,Oscar dos Santos Emboaba Júnior,Midfielder
6,10966.0,1,Wes Hoolahan,Midfielder
7,4416.0,1,Max-Alain Gradel,Midfielder
8,4429.0,1,Yann Gérard M''Vila,Midfielder
9,16457.0,1,Jeremain Lens,Midfielder


### Progressive passes

In [None]:
def get_progressive_passes(df: pd.DataFrame) -> pd.DataFrame:
    """Completed open-play passes that move at least 25% closer to the goal from its origin.
    Exclude passes from the defending 40% of the pitch.

    Parameters
    ----------
    df : pd.DataFrame
        Statsbomb Event data

    Returns
    -------
    pd.DataFrame
    """
    df_copy = df.copy()
    df_copy = df_copy.dropna(subset='location')
    passes = df_copy[
        (df_copy["type"] == "Pass")
        & (df_copy["pass_outcome"].isna())  # Only completed passes
        & (df_copy["pass_type"].isna())     # Only open-play passes
    ]
    # Excludes passes from the defending 40% of the pitch
    x_not_def = 120 * 0.4
    passes_not_def = passes[passes["location"].apply(lambda x: x[0] > x_not_def)]
    # Compute distance from the opposition goal for both origin and destination of the pass
    passes_not_def["dist_origin"] = passes_not_def["location"].apply(
        lambda loc: np.sqrt(np.square(120 - loc[0]) + np.square(40 - loc[1]))
    )
    passes_not_def["dist_dest"] = passes_not_def["pass_end_location"].apply(
        lambda loc: np.sqrt(np.square(120 - loc[0]) + np.square(40 - loc[1]))
    )
    # Compute the ratio between dist_dest and dist_origin
    passes_not_def["ratio_dist"] = (
        passes_not_def["dist_dest"] / passes_not_def["dist_origin"]
    )
    passes_not_def["is_progressive"] = passes_not_def["ratio_dist"] < 0.75
    # Progressive passes only those passes that approach the opposition goal line of the 25%
    prog_passes = passes_not_def[passes_not_def["is_progressive"]]

    return (
        prog_passes.groupby(by=["player_id","player"])
        .size()
        .reset_index(name="prog_pass")
        .sort_values(by="prog_pass", ascending=False, ignore_index=True)
    )

In [37]:
prog_pass_players = get_progressive_passes(df=df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  passes_not_def["dist_origin"] = passes_not_def["location"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  passes_not_def["dist_dest"] = passes_not_def["pass_end_location"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  passes_not_def["ratio_dist"] = (
A value is trying to be set on a 

In [38]:
mid_prog_pass_players = get_position_metric(
    position='Midfielder',
    player_positions=player_positions,
    metric_df=prog_pass_players
)
mid_prog_pass_players.head(10)

Unnamed: 0,player_id,prog_pass,player,position_group_name
0,3478.0,265,Francesc Fàbregas i Soler,Midfielder
1,3496.0,259,Mesut Özil,Midfielder
2,3633.0,181,Gareth Barry,Midfielder
3,3473.0,156,James Philip Milner,Midfielder
4,3517.0,154,Aaron Ramsey,Midfielder
5,4751.0,154,Michael Carrick,Midfielder
6,3874.0,144,Mark Noble,Midfielder
7,3828.0,136,Steven Davis,Midfielder
8,4429.0,134,Yann Gérard M''Vila,Midfielder
9,3049.0,134,Matt Ritchie,Midfielder


### Progressive key passes

In [60]:
def get_progressive_key_passes(df: pd.DataFrame) -> pd.DataFrame:
    """Completed open-play passes that move at least 25% closer to the goal from its origin
    and that lead to a shot or a goal. Exclude passes from the defending 40% of the pitch.

    Parameters
    ----------
    df : pd.DataFrame
        Statsbomb Event data

    Returns
    -------
    pd.DataFrame
    """
    df_copy = df.copy()
    df_copy = df_copy.dropna(subset='location')
    passes = df_copy[
        (df_copy["type"] == "Pass")
        & (df_copy["pass_outcome"].isna())                      # Only completed passes
        & (df_copy["pass_type"].isna())                         # Only open-play passes
        & ((df_copy["pass_goal_assist"]) | (df_copy["pass_shot_assist"])) # Key pass
    ]
    # Excludes passes from the defending 40% of the pitch
    x_not_def = 120 * 0.4
    passes_not_def = passes[passes["location"].apply(lambda x: x[0] > x_not_def)].copy()
    # Compute distance from the opposition goal for both origin and destination of the pass
    passes_not_def["dist_origin"] = passes_not_def["location"].apply(
        lambda loc: np.sqrt(np.square(120 - loc[0]) + np.square(40 - loc[1]))
    )
    passes_not_def["dist_dest"] = passes_not_def["pass_end_location"].apply(
        lambda loc: np.sqrt(np.square(120 - loc[0]) + np.square(40 - loc[1]))
    )
    # Compute the ratio between dist_dest and dist_origin
    passes_not_def["ratio_dist"] = (
        passes_not_def["dist_dest"] / passes_not_def["dist_origin"]
    )
    passes_not_def["is_progressive"] = passes_not_def["ratio_dist"] < 0.75
    # Progressive passes only those passes that approach the opposition goal line of the 25%
    prog_passes = passes_not_def[passes_not_def["is_progressive"]]

    return (
        prog_passes.groupby(by=["player_id","player"])
        .size()
        .reset_index(name="prog_pass")
        .sort_values(by="prog_pass", ascending=False, ignore_index=True)
    )

In [61]:
prog_key_pass_players = get_progressive_key_passes(df=df)

mid_prog_pass_key_players = get_position_metric(
    position='Midfielder',
    player_positions=player_positions,
    metric_df=prog_key_pass_players
)
mid_prog_pass_key_players.head(10)

Unnamed: 0,player_id,prog_pass,player,position_group_name
0,3496.0,9,Mesut Özil,Midfielder
1,3473.0,7,James Philip Milner,Midfielder
2,3089.0,6,Kevin De Bruyne,Midfielder
3,3091.0,6,Moussa Sissoko,Midfielder
4,3629.0,5,Sadio Mané,Midfielder
5,3814.0,5,Riyad Mahrez,Midfielder
6,10966.0,5,Wes Hoolahan,Midfielder
7,4275.0,5,Ross Barkley,Midfielder
8,42902.0,4,Adam Johnson,Midfielder
9,3307.0,4,Marc Albrighton,Midfielder


## Box Positioning

*Box Positioning* is defined as the ability to move into goal-scoring
positions in valuable pitch locations by clever positioning in and around the opponent
penalty area at the right time.

I considered the number of ball receipts in the penalty box and in the Zone 14, i.e., the zone located in the middle of the pitch immediately outside the penalty area appears crucial for goal scoring (Taylor et al., 2002), that indicates how well players are positioned near the opposition goal to receive the passes. Then, the number of non-penalty expected goals indicates how dangerous players are positioned when they shot the ball.

### Penalty box receipts

In [41]:
def get_penalty_box_receipts(df: pd.DataFrame) -> pd.DataFrame:
    succ_ball_rec = df[
        (df["ball_receipt_outcome"].isna())  # Successful receipt
        & (df["type"] == "Ball Receipt*")
    ]
    
    pen_box_receipts = succ_ball_rec[
        succ_ball_rec["location"].apply(
            lambda loc: loc[0] >= 102 and loc[1] >= 18 and loc[1] >= 62
    )]

    return (
        pen_box_receipts.groupby(by=["player_id", "player"])
        .size()
        .reset_index(name="penalty_receipts")
        .sort_values(by="penalty_receipts", ascending=False, ignore_index=True)
    )

In [42]:
penalty_box_receipts_players = get_penalty_box_receipts(df=df)

In [43]:
mid_penalty_box_receipts_players = get_position_metric(
    position='Midfielder',
    player_positions=player_positions,
    metric_df=penalty_box_receipts_players
)
mid_penalty_box_receipts_players.head(10)

Unnamed: 0,player_id,penalty_receipts,player,position_group_name
0,3091.0,105,Moussa Sissoko,Midfielder
1,3049.0,92,Matt Ritchie,Midfielder
2,3286.0,79,Nathan Redmond,Midfielder
3,3814.0,77,Riyad Mahrez,Midfielder
4,2966.0,75,Stéphane Sessègnon,Midfielder
5,3473.0,73,James Philip Milner,Midfielder
6,3089.0,69,Kevin De Bruyne,Midfielder
7,3629.0,63,Sadio Mané,Midfielder
8,3293.0,51,Jesse Lingard,Midfielder
9,3496.0,50,Mesut Özil,Midfielder


### Zone 14 receipts

In [44]:
def get_zone14_receipts(df: pd.DataFrame) -> pd.DataFrame:
    zone_14_x1 = (120 / 6) * 4
    zone_14_x2 = (120 / 6) * 5
    zone_14_y1 = 80 / 3
    zone_14_y2 = (80 / 3) * 2
    # Consider only the events in the Zone 14
    succ_ball_rec = df[
        (df["ball_receipt_outcome"].isna())  # Successful receipt
        & (df["type"] == "Ball Receipt*")
    ]
    zone_14_receipts_df = succ_ball_rec[
        succ_ball_rec["location"].apply(
            lambda loc: loc[0] >= zone_14_x1 and
                        loc[0] <= zone_14_x2 and
                        loc[1] >= zone_14_y1 and
                        loc[1] <= zone_14_y2
        )
    ]

    return (
        zone_14_receipts_df.groupby(by=["player_id", "player"])
        .size()
        .reset_index(name="zone14_receipts")
        .sort_values(by="zone14_receipts", ascending=False, ignore_index=True)
    )

In [45]:
zone14_receipts_players = get_zone14_receipts(df=df)

In [46]:
mid_zone14_receipts_players = get_position_metric(
    position='Midfielder',
    player_positions=player_positions,
    metric_df=zone14_receipts_players
)
mid_zone14_receipts_players.head(10)

Unnamed: 0,player_id,zone14_receipts,player,position_group_name
0,3496.0,163,Mesut Özil,Midfielder
1,4275.0,154,Ross Barkley,Midfielder
2,3094.0,153,Bamidele Alli,Midfielder
3,3478.0,149,Francesc Fàbregas i Soler,Midfielder
4,3087.0,139,Gnégnéri Yaya Touré,Midfielder
5,3944.0,138,Gylfi Þór Sigurðsson,Midfielder
6,3517.0,135,Aaron Ramsey,Midfielder
7,11386.0,93,Santiago Cazorla González,Midfielder
8,3629.0,92,Sadio Mané,Midfielder
9,3567.0,91,Georginio Wijnaldum,Midfielder


### Non-penalty xG

In [47]:
def get_tot_players_statsbomb_openplayxg(df: pd.DataFrame) -> pd.DataFrame:
    """Compute the total number of open-play expected goals of each player.

    Parameters
    ----------
    df : pd.DataFrame
        The Statsbomb event data

    Returns
    -------
    pd.DataFrame
        player_id, tot_npxg
    """
    return (
        df[df['shot_type'] == "Open Play"]
        .groupby(by=["player_id", "player"])
        .agg({"shot_statsbomb_xg": "sum"})
        .reset_index()
        .rename(columns={"shot_statsbomb_xg": "tot_npxg"})
        .sort_values(by="tot_npxg", ascending=False, ignore_index=True)
    )

In [48]:
openplay_xg_players = get_tot_players_statsbomb_openplayxg(df=df)

In [50]:
mid_openplay_xg_players = get_position_metric(
    position='Midfielder',
    player_positions=player_positions,
    metric_df=openplay_xg_players
)
mid_openplay_xg_players.head(10)

Unnamed: 0,player_id,tot_npxg,player,position_group_name
0,3629.0,11.268578,Sadio Mané,Midfielder
1,3094.0,8.398463,Bamidele Alli,Midfielder
2,3517.0,7.420695,Aaron Ramsey,Midfielder
3,3535.0,7.302322,Roberto Firmino Barbosa de Oliveira,Midfielder
4,3814.0,7.162665,Riyad Mahrez,Midfielder
5,3567.0,6.79159,Georginio Wijnaldum,Midfielder
6,4275.0,5.740484,Ross Barkley,Midfielder
7,3496.0,5.661124,Mesut Özil,Midfielder
8,3684.0,5.440102,Cheikhou Kouyaté,Midfielder
9,3087.0,4.631859,Gnégnéri Yaya Touré,Midfielder


### Final Pass and Box Positioning evaluation

All the metrics to assess Final Pass and Box Positioning capabilities are rescaled between 0 and 1 and averaged to get an unique metric, called *fin_pass_box_pos*.

In [51]:
def join_metrics(metric_df_list: list[pd.DataFrame]) -> pd.DataFrame:
    """Method to join together all the Final Pass and Box Positioning metrics.

    Parameters
    ----------
    metric_df_list : list[pd.DataFrame]
        List of the Final Pass and Box Positioning metrics

    Returns
    -------
    Optional[pd.DataFrame]
        A DataFrame with the player id and all the corresponding Final Pass and Box Positioning metrics
    """
    if len(metric_df_list) < 2:
        print("Add more metrics.")
        return pd.DataFrame()

    df_to_ret = metric_df_list[0]
    df_to_ret = df_to_ret.drop(columns=['player', 'position_group_name'])

    for i in range(1, len(metric_df_list)):
        df_to_ret = pd.merge(df_to_ret, metric_df_list[i], on="player_id", how="inner", suffixes=[f"_{i-1}", f"_{i}"])
        df_to_ret = df_to_ret.drop(columns=['player', 'position_group_name'])

    return df_to_ret

In [53]:
# List of the Final Pass and Box Positioning metrics
metric_df_list = [
    mid_assist_players,
    mid_xga_players,
    mid_key_passes_players,
    mid_under_pressure_key_passes_players,
    mid_prog_pass_players,
    mid_prog_pass_key_players,
    mid_penalty_box_receipts_players,
    mid_zone14_receipts_players,
    mid_openplay_xg_players,
]

In [55]:
# Join the dataframe with the single metrics
finalpass_boxpos_metrics = join_metrics(metric_df_list=metric_df_list)
# Save the player ids to later concatenate them with the rescaled metrics
player_id = finalpass_boxpos_metrics["player_id"]

finalpass_boxpos_metrics = finalpass_boxpos_metrics.drop(columns="player_id")
finalpass_boxpos_metrics = finalpass_boxpos_metrics.select_dtypes(exclude="object")
# Metrics rescaling
scaled_finalpass_boxpos_metrics = (
    finalpass_boxpos_metrics - finalpass_boxpos_metrics.min()
) / (finalpass_boxpos_metrics.max() - finalpass_boxpos_metrics.min())
# Get a unique metric that is the average of the other ones
fin_pass_box_pos = (
    scaled_finalpass_boxpos_metrics.mean(axis=1)
    .reset_index(name="fin_pass_box_pos")
    .drop(columns="index")
)
fin_pass_box_pos = pd.concat((fin_pass_box_pos, player_id), axis=1)
fin_pass_box_pos = fin_pass_box_pos.rename(columns={"player_0": "player"})
fin_pass_box_pos = fin_pass_box_pos.sort_values(
    by="fin_pass_box_pos", ascending=False, ignore_index=True
)

### The top 5 Premier League 2015/2016 midfielders according to Final Pass and Box Positioning capabilities

In [57]:
add_name_players(fin_pass_box_pos, players_df).head(5)

Unnamed: 0,fin_pass_box_pos,player_id,player
0,0.68049,3473.0,James Philip Milner
1,0.657006,3814.0,Riyad Mahrez
2,0.630234,3094.0,Bamidele Alli
3,0.607944,4275.0,Ross Barkley
4,0.529032,3091.0,Moussa Sissoko
