# Importing libraries

In [7]:
import pandas as pd
from common_fun import *

# Read parquet file

In [2]:
# Relative path of the parquet file holding the event data loaded below
file_path = "../data/eng_1516_events.parquet"

In [3]:
# Load the event data from the parquet file into a DataFrame
df = pd.read_parquet(file_path)

In [6]:
# Load the position lookup table; it is joined onto the events through its
# 'Position Name' column and supplies the 'Position Category' of each position
group_pos = pd.read_csv('../data/expanded_positions.csv')

# Saving column names

In [4]:
# Keep the names of the event DataFrame's columns as a plain Python list
column_list = df.columns.tolist()

In [13]:
# Attach the position category to every event by matching the event's
# 'position' against the lookup's 'Position Name'. The merge uses pandas'
# default inner join, so events without a matching position are not kept.
# The remaining lookup columns are discarded and the category column is
# renamed to 'position_group_name'.
group_pos_df = (
    df.merge(group_pos, left_on='position', right_on='Position Name')
    .drop(columns=['Position Number', 'Position Abbreviation', 'Position Name'])
    .rename(columns={'Position Category': 'position_group_name'})
)

In [16]:
def get_player_pos_match(event_data: pd.DataFrame) -> pd.DataFrame:
    """Count in how many matches each player occupied each position group.

    A player/position-group pair is counted at most once per match, no
    matter how many events of that match carry it. Rows with a missing
    match id, player id, player name or position group are ignored, and
    the 'Substitute' position group is excluded from the counts.

    Parameters
    ----------
    event_data : pd.DataFrame
        Statsbomb event data of all the matches. Must contain the columns
        match_id, player_id, player and position_group_name.

    Returns
    -------
    pd.DataFrame
        Dataframe with player_id, player, position_group_name, pos_count
    """
    key_cols = ['match_id', 'player_id', 'player', 'position_group_name']
    # One row per (match, player, position group). Deduplicating the whole
    # frame in a single pass replaces a per-match loop that concatenated
    # onto an initially empty DataFrame: that pattern copies the accumulated
    # data on every iteration and triggers pandas' FutureWarning about
    # concatenating empty entries.
    player_pos = (
        event_data[key_cols]
        .dropna()
        .drop_duplicates(ignore_index=True)
    )
    # Exclude the Substitute position
    player_pos = player_pos[player_pos['position_group_name'] != 'Substitute']
    # For each player count in how many matches he assumed each position
    player_pos = (
        player_pos.groupby(by=['player_id', 'player'])['position_group_name']
        .value_counts()
        .reset_index(name='pos_count')
    )
    return player_pos

In [17]:
def get_player_position(event_data: pd.DataFrame) -> pd.DataFrame:
    """Return the main position group(s) of every player.

    The per-player position counts are obtained from
    ``get_player_pos_match``; for each player only the position group(s)
    with the highest count are kept. When several position groups tie for
    the highest count, all of them are returned for that player.

    Parameters
    ----------
    event_data : pd.DataFrame
        Statsbomb event data of all the matches

    Returns
    -------
    pd.DataFrame
        Dataframe with player_id, player, position_group_name
    """
    counts = get_player_pos_match(event_data=event_data)
    # Broadcast each player's highest count onto all of that player's rows
    best_count = counts.groupby(['player_id', 'player'])['pos_count'].transform('max')
    is_main = counts['pos_count'].eq(best_count)
    # Keep the top-count rows only; the count itself is not part of the result
    return counts[is_main].drop(columns='pos_count')

In [18]:
# Determine each player's most frequent position group(s) from the events
# enriched with position_group_name
player_positions = get_player_position(event_data=group_pos_df)

  player_pos = pd.concat([player_pos, curr_player_pos], ignore_index=True)


In [19]:
# Write the players' main positions to CSV, leaving out the DataFrame index
player_positions.to_csv('../data/player_positions.csv', index=False)