In [89]:
# Install into the running kernel's environment (%pip rather than a bare/shell pip),
# pinned to the scipy release this notebook was run with.
%pip install -q scipy==1.14.1

Collecting scipy
  Downloading scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Downloading scipy-1.14.1-cp312-cp312-win_amd64.whl (44.5 MB)
   ---------------------------------------- 0.0/44.5 MB ? eta -:--:--
   ---------------------------------------- 0.5/44.5 MB 4.2 MB/s eta 0:00:11
   - -------------------------------------- 1.3/44.5 MB 3.4 MB/s eta 0:00:13
   - -------------------------------------- 1.8/44.5 MB 3.2 MB/s eta 0:00:14
   -- ------------------------------------- 2.6/44.5 MB 3.2 MB/s eta 0:00:14
   -- ------------------------------------- 3.1/44.5 MB 3.2 MB/s eta 0:00:13
   --- ------------------------------------ 3.9/44.5 MB 3.2 MB/s eta 0:00:13
   ---- ----------------------------------- 4.5/44.5 MB 3.2 MB/s eta 0:00:13
   ---- ----------------------------------- 5.2/44.5 MB 3.2 MB/s eta 0:00:13
   ----- ---------------------------------- 6.0/44.5 MB 3.2 MB/s eta 0:00:13
   ----- ---------------------------------- 6.6/44.5 MB 3.2 MB/s eta 0:00:12
   ------ --

In [90]:
import soccerdata as sd
import pandas as pd
import numpy as np
import scipy.stats as stats

In [2]:
# Analysis configuration: which league/season to load and which club to analyse
league: str = "GER-Bundesliga"
seasons: int = 2022
team: str = "Dortmund"
player: str = "Jude Bellingham"

In [3]:
# Creating object
# soccerdata FBref instance restricted to the configured league and season;
# the read_* calls in the following cells all go through it.
fbref = sd.FBref(leagues=league, seasons=seasons)

In [6]:
# Data retrieval through the FBref object
# League schedule and the team's match stats, with index levels turned back into columns
schedule = fbref.read_schedule().reset_index()
team_match_stats = fbref.read_team_match_stats(stat_type="schedule", team=team).reset_index()

# Ids of every fixture in which the team appears, either at home or away
plays_at_home = schedule['home_team'] == team
plays_away = schedule['away_team'] == team
home_games = schedule.loc[plays_at_home, 'game_id']
away_games = schedule.loc[plays_away, 'game_id']
games = list(set(home_games) | set(away_games))

# Match events and lineups, limited to those fixtures
events = fbref.read_events(match_id=games)
lineups = fbref.read_lineup(match_id=games)

In [12]:
# Team-level tables derived from the loaded lineups and events
# NOTE: `lineups` is rebound to the filtered frame, which no longer has a 'team' column,
# so this cell cannot be re-run without first re-running the cell that loads `lineups`.
lineup_columns = ['game', 'player', 'is_starter', 'minutes_played']
lineups = lineups.loc[lineups['team'] == team].reset_index()[lineup_columns]
players = list(lineups['player'].unique())

# Goals, split into those credited to the team and those credited to anyone else
goal_columns = ['game', 'team', 'minute']
goals = events.loc[events['event_type'] == 'goal'].reset_index()[goal_columns]
goals_for = goals.loc[goals['team'] == team].reset_index(drop=True)
goals_against = goals.loc[goals['team'] != team].reset_index(drop=True)

# Substitution events, narrowed to the team's own changes
sub_columns = ['game', 'team', 'minute', 'player1', 'player2']
subs = events.loc[events['event_type'] == 'substitute_in'].reset_index()[sub_columns]
club_subs = subs.loc[subs['team'] == team].reset_index(drop=True)

In [135]:
# Every game is counted as this many minutes; stoppage time is folded into the base minute.
_MATCH_MINUTES = 90


def _base_minute(minutes):
    """Turn minute strings into integers, dropping any stoppage-time suffix ('45+2' -> 45)."""
    return minutes.str.split('+').str[0].astype(int)


def metrics_calculator(player):
    """Compare the team's goals-per-minute with and without ``player`` on the pitch.

    Reads the notebook-level objects ``club_subs``, ``schedule``, ``games``,
    ``lineups`` and ``goals_for``; none of them is modified.

    Parameters
    ----------
    player : str
        Player name as it appears in ``lineups['player']`` and in the
        ``player1`` / ``player2`` columns of ``club_subs``.

    Returns
    -------
    dict
        ``player``  - the name passed in
        ``mp``      - season minutes on the pitch
        ``mnp``     - season minutes not on the pitch (90 per team game minus ``mp``)
        ``gop``     - team goals scored while the player was on the pitch
        ``gnop``    - team goals scored while the player was not on the pitch
        ``gpm_op``  - ``gop / mp`` (0 when ``mp`` is 0)
        ``gpm_nop`` - ``gnop / mnp`` (0 when ``mnp`` is 0)
        ``pvalue``  - Welch t-test p-value for the two rates, formatted with
                      ``'{:f}'``; ``'0.000000'`` when either side has no minutes.

    Notes
    -----
    ``player1`` of a substitution is treated as the player coming on and
    ``player2`` as the player going off. Sendings-off are not considered: a
    player who is never substituted off is credited up to minute 90.
    """
    # --- Playing time per team game -------------------------------------------------
    came_on = (
        club_subs.loc[club_subs['player1'] == player, ['game', 'minute']]
        .rename(columns={'minute': 'minute_in'})
        .reset_index(drop=True)
    )
    went_off = (
        club_subs.loc[club_subs['player2'] == player, ['game', 'minute']]
        .rename(columns={'minute': 'minute_out'})
        .reset_index(drop=True)
    )
    player_subs = pd.merge(came_on, went_off, on='game', how='outer')

    # One row per team game; lineup / substitution columns stay missing where the
    # player was not involved.
    team_games = schedule.loc[schedule['game_id'].isin(games), ['game', 'game_id']].reset_index(drop=True)
    player_lineups = lineups.loc[lineups['player'] == player, ['game', 'player', 'is_starter']]
    player_games = (
        team_games
        .merge(player_lineups, on='game', how='left')
        .merge(player_subs, on='game', how='left')
    )

    def _minute_out(row):
        # Not listed in the lineup for this game: never on the pitch.
        if pd.isnull(row['player']):
            return '0'
        # Listed, but neither started nor came on: unused substitute.
        if pd.isnull(row['minute_in']) and not row['is_starter']:
            return '0'
        # On the pitch and never substituted off: credited with the full game.
        if pd.isnull(row['minute_out']):
            return str(_MATCH_MINUTES)
        return row['minute_out']

    player_games['minute_out'] = player_games.apply(_minute_out, axis=1)
    # No substitution-in event means the player was there from minute 0 (or not at all,
    # in which case minute_out is 0 as well and the game contributes no minutes).
    player_games['minute_in'] = player_games['minute_in'].fillna('0')
    player_games['player'] = player_games['player'].fillna(player)
    # Games without a lineup row count as "did not start".
    player_games['is_starter'] = player_games['is_starter'].eq(True)
    player_games['minute_in'] = _base_minute(player_games['minute_in'])
    player_games['minute_out'] = _base_minute(player_games['minute_out'])
    player_games['minutes_played'] = player_games['minute_out'] - player_games['minute_in']
    player_games['minutes_not_played'] = _MATCH_MINUTES - player_games['minutes_played']

    season_minutes_played = int(player_games['minutes_played'].sum())
    season_minutes_not_played = int(player_games['minutes_not_played'].sum())

    # --- Team goals with the player on / off the pitch -------------------------------
    player_goals = goals_for.merge(player_games, on='game')
    goal_minute = _base_minute(player_goals['minute'])
    # Both ends inclusive: a goal in the minute of a substitution counts as "on pitch".
    on_pitch = (player_goals['minute_in'] <= goal_minute) & (goal_minute <= player_goals['minute_out'])

    season_goals_on_pitch = int(on_pitch.sum())
    season_goals_not_on_pitch = int((~on_pitch).sum())

    # --- Goals per minute --------------------------------------------------------------
    gpm_on_pitch = season_goals_on_pitch / season_minutes_played if season_minutes_played > 0 else 0
    # A player who was on the pitch for every minute has no off-pitch sample; report 0.
    gpm_not_on_pitch = (
        season_goals_not_on_pitch / season_minutes_not_played if season_minutes_not_played > 0 else 0
    )

    # --- Significance of the difference ----------------------------------------------
    if season_minutes_played > 0 and season_minutes_not_played > 0:
        # Each minute is modelled as a Bernoulli trial (goal / no goal), whose standard
        # deviation is sqrt(p * (1 - p)). ttest_ind_from_stats expects the standard
        # deviation and applies the 1/nobs scaling itself, so the standard error must
        # not be passed here.
        std_on_pitch = np.sqrt(gpm_on_pitch * (1 - gpm_on_pitch))
        std_not_on_pitch = np.sqrt(gpm_not_on_pitch * (1 - gpm_not_on_pitch))

        _, p_value = stats.ttest_ind_from_stats(
            mean1=gpm_on_pitch, std1=std_on_pitch, nobs1=season_minutes_played,
            mean2=gpm_not_on_pitch, std2=std_not_on_pitch, nobs2=season_minutes_not_played,
            equal_var=False,  # Welch's test: the two variances are not assumed equal
        )
    else:
        # NOTE(review): 0 is a placeholder for "no comparison possible", not a real
        # p-value; confirm whether downstream readers would rather see NaN.
        p_value = 0

    return {'player': player,
            'mp': season_minutes_played,
            'mnp': season_minutes_not_played,
            'gop': season_goals_on_pitch,
            'gnop': season_goals_not_on_pitch,
            'gpm_op': gpm_on_pitch,
            'gpm_nop': gpm_not_on_pitch,
            'pvalue': '{:f}'.format(float(p_value))}

In [136]:
# Evaluate every squad player and rank them by the difference between the team's
# scoring rate with the player on the pitch and without.
metric_columns = ['player', 'mp', 'mnp', 'gop', 'gnop', 'gpm_op', 'gpm_nop', 'pvalue']

# Comprehensions keep their loop variables local, so the configured `player`
# variable is not overwritten by iterating over the squad.
player_metrics = [metrics_calculator(squad_player) for squad_player in players]
full_dict = {
    column: [metrics[column] for metrics in player_metrics]
    for column in metric_columns
}

df = pd.DataFrame(full_dict)
df['score'] = df['gpm_op'] - df['gpm_nop']
df.sort_values(by='score', ascending=False).reset_index(drop=True)