<a href="https://colab.research.google.com/github/HimalKarkal/netball-analysis/blob/master/TrueSkill_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction

This notebook attempts to rate SSN Netball players using the TrueSkill Rating System.

The logic used is as follows:

1. Enumerate filepaths for the season of interest.
2. For each unique filepath, read in the *substitutions* and *playerStats* files as dataframes.
3. Using the substitutions file, classify players to positions.
4. Add the player positions to the playerStats dataframe.
5. For each position, group by position and calculate the means of each statistic.
6. Each player's individual metrics are rated against the league average rating (always 1000+/-333.333) and stored in a list in a dictionary.
7. The following storage items are required: Dict(League average previous week and rating) & Dict(Player's ratings weekly)

# 1. Filepaths

This section creates a list of filepaths for the *substitutions* and *playerStats* files for the range of seasons in question.

In [1]:
# Cloning the data from GitHub

! git clone 'https://github.com/HimalKarkal/netball-analysis.git'
!pip install trueskill

fatal: destination path 'netball-analysis' already exists and is not an empty directory.


In [2]:
# Importing necessary modules

import glob
import numpy as np
import pandas as pd
from trueskill import Rating, rate_1vs1

In [3]:
# Creating dictionaries of filepaths for substitutions and playerStats for each round in a season and tournament.

season = "2020" # Use this to change the year
tournament = "SSN" # Use this to change the tournament

list_subs = glob.glob("/content/netball-analysis/data/matchCentre/processed/*/" + "*substitutions*" + season + "*" + tournament + "*.csv")
list_playerStats = glob.glob("/content/netball-analysis/data/matchCentre/processed/*/" + "*playerStats*" + season + "*" + tournament + "*.csv")

list_subs.sort()
list_playerStats.sort()

dict_subs = {}
dict_playerStats = {}

i = 1
while i <= 14:
  dict_subs[i] = list_subs[4 * (i-1): 4 * i]
  dict_playerStats[i] = list_playerStats[4 * (i-1): 4 * i]

  i += 1

In [4]:
dict_player_ratings = {'attempt1':{},
                        'attempt2':{},
                        'attempt_from_zone1':{},
                        'attempt_from_zone2':{},
                        'badHands':{},
                        'badPasses':{},
                        'blocked':{},
                        'blocks':{},
                        'breaks':{},
                        'centrePassReceives':{},
                        'centrePassToGoalPerc':{},
                        'centrePassToGoalPerc':{},
                        'contactPenalties':{},
                        'deflectionPossessionGain':{},
                        'deflectionWithGain':{},
                        'deflectionWithNoGain':{},
                        'deflections':{},
                        'disposals':{},
                        'feedWithAttempt':{},
                        'feeds':{},
                        'gain':{},
                        'gainToGoalPerc':{},
                        'generalPlayTurnovers':{},
                        'goal1':{},
                        'goal2':{},
                        'goalAssists':{},
                        'goalAttempts':{},
                        'goalMisses':{},
                        'goal_from_zone1':{},
                        'goal_from_zone2':{},
                        'goals':{},
                        'interceptPassThrown':{},
                        'intercepts':{},
                        'obstructionPenalties':{},
                        'offsides':{},
                        'passes':{},
                        'penalties':{},
                        'pickups':{}}

In [5]:
# Final Code
'''
Accepts: Dictionaries containing filepaths (dict_subs and dict_playerStats)
Returns: Concatenated dataframes for each round in the season. (df_subs and df_playerStats)
'''
for week in range(1,15):

  df_subs = pd.DataFrame()
  df_playerStats = pd.DataFrame()

  for game in range(0,4):
    subs_temp = pd.read_csv(dict_subs[week][game])
    playerStats_temp = pd.read_csv(dict_playerStats[week][game])

    df_subs = pd.concat([df_subs,subs_temp])
    df_playerStats = pd.concat([df_playerStats,playerStats_temp])

  del subs_temp, playerStats_temp

  '''
  Accepts: Substitutions dataframe for each week (df_subs)
  Returns: Classification of each player to a position (df_temp['Position'])
  Comments: Players who spent the entire time as a substitutes are excluded
  '''
  dict_temp = {'GS':{},
  'GA':{},
  'WA': {},
  'C': {},
  'WD': {},
  'GD': {},
  'GK': {},
  'S': {}}

  for i, row in df_subs.iterrows():
    player = row['playerId']
    position = row['startingPos']
    duration = row['duration']

    if player in dict_temp[position]:
      dict_temp[position][player] += duration
    else:
      dict_temp[position][player] = duration

  df_timeInPosition = pd.DataFrame(dict_temp).fillna(0).reset_index()
  df_timeInPosition = df_timeInPosition.rename(columns={'index': 'playerId'})
  df_timeInPosition = df_timeInPosition.loc[df_timeInPosition['S'] != 3600]
  df_timeInPosition['Position'] = df_timeInPosition.drop(columns = ['playerId', 'S']).idxmax(axis = 1)
  df_timeInPosition.head()

  #Selecting appropriate columns only from df_playerStats

  df_playerStats = df_playerStats[['matchId', 'squadId', 'oppSquadId', 'playerId', 'attempt1', 'attempt2',
        'attempt_from_zone1', 'attempt_from_zone2', 'badHands', 'badPasses',
        'blocked', 'blocks', 'breaks', 'centrePassReceives',
        'centrePassToGoalPerc', 'contactPenalties', 'deflectionPossessionGain',
        'deflectionWithGain', 'deflectionWithNoGain', 'deflections',
        'disposals', 'feedWithAttempt', 'feeds', 'gain', 'gainToGoalPerc',
        'generalPlayTurnovers', 'goal1', 'goal2', 'goalAssists', 'goalAttempts',
        'goalMisses', 'goal_from_zone1', 'goal_from_zone2', 'goals',
        'interceptPassThrown', 'intercepts', 'minutesPlayed',
        'missedGoalTurnover', 'netPoints', 'obstructionPenalties', 'offsides',
        'passes', 'penalties', 'pickups', 'points', 'possessionChanges',
        'possessions', 'quartersPlayed', 'rebounds', 'tossUpWin']]

  df_playerStats = df_playerStats.merge(df_timeInPosition[['playerId', 'Position']], on = 'playerId', how = 'left')
  df_playerStats = df_playerStats.dropna()

  # Calculating Means

  df_means = pd.DataFrame()

  list_positions = ['GS', 'GA', 'WA', 'C', 'WD', 'GD', 'GK']

  for position in list_positions:
    df_temp = df_playerStats.loc[df_playerStats['Position'] == position]
    df_temp = df_temp.drop(columns = ['playerId', 'Position', 'matchId', 'squadId', 'oppSquadId'])
    mean = np.round((df_temp.mean()), 2)
    mean['Position'] = position
    df_means = pd.concat([df_means, mean.to_frame().T]).reset_index(drop = True)

  # Rating players

  league_average_rating = Rating(mu = 1000, sigma = 333.333)

  for i, player in df_playerStats.iterrows():
    position = df_playerStats.loc[i, 'Position']
    row_means = df_means.loc[df_means['Position'] == position]

    # Positive statistics


    for statistic in ['attempt1', 'attempt2', 'attempt_from_zone1', 'attempt_from_zone2',
                      'gain', 'gainToGoalPerc','generalPlayTurnovers', 'goal1', 'goal2',
                      'goalAssists', 'goalAttempts', 'goal_from_zone1', 'goal_from_zone2',
                      'goals', 'interceptPassThrown', 'intercepts']: # Check and add other positive statistics

      # Accessing player rating for statistic from dictionary or creating if non-existent

      if player['playerId'] in dict_player_ratings[statistic]:
        player_rating = dict_player_ratings[statistic][player['playerId']]
      else:
        player_rating = Rating(mu = 1000, sigma = 333.333)

      # Rating players

      if player[statistic] > row_means.iloc[0][statistic]:
        player_rating,_ = rate_1vs1(player_rating, league_average_rating)

      elif player[statistic] < row_means.iloc[0][statistic]:
        player_rating,_ = rate_1vs1(player_rating, league_average_rating)

      else:
        player_rating,_ =  rate_1vs1(player_rating, league_average_rating, drawn = True)

      dict_player_ratings[statistic][player['playerId']] = player_rating


      # Negative statistics


    for statistic in ['blocked', 'badHands', 'badPasses', 'goalMisses']: #Add other negative statistics

      # Accessing player rating for statistic from dictionary or creating if non-existent

      if player['playerId'] in dict_player_ratings[statistic]:
        player_rating = dict_player_ratings[statistic][player['playerId']]
      else:
        player_rating = Rating(mu = 1000, sigma = 333.333)

      # Rating players

      if player[statistic] < row_means.iloc[0][statistic]:
        player_rating,_ = rate_1vs1(player_rating, league_average_rating)

      elif player[statistic] > row_means.iloc[0][statistic]:
        player_rating,_ = rate_1vs1(player_rating, league_average_rating)

      else:
        player_rating,_ =  rate_1vs1(player_rating, league_average_rating, drawn = True)

      dict_player_ratings[statistic][player['playerId']] = player_rating

In [6]:
dict_player_ratings

{'attempt1': {80078: trueskill.Rating(mu=1555.913, sigma=149.652),
  991905: trueskill.Rating(mu=1000.000, sigma=86.078),
  1000831: trueskill.Rating(mu=1000.000, sigma=86.078),
  1004472: trueskill.Rating(mu=1069.856, sigma=97.712),
  1005495: trueskill.Rating(mu=1067.958, sigma=117.631),
  1006231: trueskill.Rating(mu=1000.000, sigma=86.078),
  1007219: trueskill.Rating(mu=1000.000, sigma=86.078),
  1007306: trueskill.Rating(mu=1555.913, sigma=149.652),
  1009721: trueskill.Rating(mu=1484.699, sigma=174.304),
  1015274: trueskill.Rating(mu=1076.681, sigma=102.344),
  1019380: trueskill.Rating(mu=1000.000, sigma=235.718),
  1021092: trueskill.Rating(mu=1445.142, sigma=188.111),
  80105: trueskill.Rating(mu=1000.000, sigma=86.078),
  80296: trueskill.Rating(mu=1106.125, sigma=93.738),
  80297: trueskill.Rating(mu=1000.000, sigma=92.463),
  80475: trueskill.Rating(mu=1555.913, sigma=149.652),
  80559: trueskill.Rating(mu=1000.000, sigma=86.078),
  997815: trueskill.Rating(mu=1077.496, s

# The following sections were created independently and then stiched together in the block above

#3. Classifying to Positions

NOTE: This will have to be included under the previous for-loop in the final implementation.

In [None]:
'''
Accepts: Substitutions dataframe for each week (df_subs)
Returns: Classification of each player to a position (df_temp['Position'])
Comments: Players who spent the entire time as a substitutes are excluded
'''
dict_temp = {'GS':{},
'GA':{},
'WA': {},
'C': {},
'WD': {},
'GD': {},
'GK': {},
'S': {}}

for i, row in df_subs.iterrows():
  player = row['playerId']
  position = row['startingPos']
  duration = row['duration']

  if player in dict_temp[position]:
    dict_temp[position][player] += duration
  else:
    dict_temp[position][player] = duration

df_timeInPosition = pd.DataFrame(dict_temp).fillna(0).reset_index()
df_timeInPosition = df_timeInPosition.rename(columns={'index': 'playerId'})
df_timeInPosition = df_timeInPosition.loc[df_timeInPosition['S'] != 3600]
df_timeInPosition['Position'] = df_timeInPosition.drop(columns = ['playerId', 'S']).idxmax(axis = 1)
df_timeInPosition.head()

Unnamed: 0,playerId,GS,GA,WA,C,WD,GD,GK,S,Position
0,80078,3600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,GS
1,1010545,3600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,GS
2,1001944,3600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,GS
3,80150,3600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,GS
4,1001865,3600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,GS


#4. Adding Positions to playerStats Dataframe
NOTE: This will have to be included under the previous for-loop in the final implementation.

In [None]:
#Selecting appropriate columns only

df_playerStats = df_playerStats[['matchId', 'squadId', 'oppSquadId', 'playerId', 'attempt1', 'attempt2',
       'attempt_from_zone1', 'attempt_from_zone2', 'badHands', 'badPasses',
       'blocked', 'blocks', 'breaks', 'centrePassReceives',
       'centrePassToGoalPerc', 'contactPenalties', 'deflectionPossessionGain',
       'deflectionWithGain', 'deflectionWithNoGain', 'deflections',
       'disposals', 'feedWithAttempt', 'feeds', 'gain', 'gainToGoalPerc',
       'generalPlayTurnovers', 'goal1', 'goal2', 'goalAssists', 'goalAttempts',
       'goalMisses', 'goal_from_zone1', 'goal_from_zone2', 'goals',
       'interceptPassThrown', 'intercepts', 'minutesPlayed',
       'missedGoalTurnover', 'netPoints', 'obstructionPenalties', 'offsides',
       'passes', 'penalties', 'pickups', 'points', 'possessionChanges',
       'possessions', 'quartersPlayed', 'rebounds', 'tossUpWin']]

df_playerStats = df_playerStats.merge(df_timeInPosition[['playerId', 'Position']], on = 'playerId', how = 'left')
df_playerStats = df_playerStats.dropna()

#5. Calculating Means
NOTE: This goes into for loop in final implementation

In [None]:
import numpy as np

In [None]:
df_means = pd.DataFrame()

list_positions = ['GS', 'GA', 'WA', 'C', 'WD', 'GD', 'GK']

for position in list_positions:
  df_temp = df_playerStats.loc[df_playerStats['Position'] == position]
  df_temp = df_temp.drop(columns = ['playerId', 'Position', 'matchId', 'squadId', 'oppSquadId'])
  mean = np.round((df_temp.mean()), 2)
  mean['Position'] = position
  df_means = pd.concat([df_means, mean.to_frame().T]).reset_index(drop = True)

#6. Rating
NOTE: This will have to be included under the previous for-loop in the final implementation.

In [36]:
!pip install trueskill

from trueskill import Rating, rate_1vs1



In [44]:
# Rating players
dict_player_ratings = {'attempt1':{},
                       'attempt2':{},
                       'attempt_from_zone1':{},
                       'attempt_from_zone2':{},
                       'badHands':{},
                       'badPasses':{},
                       'blocked':{},
                       'blocks':{},
                       'breaks':{},
                       'centrePassReceives':{},
                       'centrePassToGoalPerc':{},
                       'centrePassToGoalPerc':{},
                       'contactPenalties':{},
                       'deflectionPossessionGain':{},
                       'deflectionWithGain':{},
                       'deflectionWithNoGain':{},
                       'deflections':{},
                       'disposals':{},
                       'feedWithAttempt':{},
                       'feeds':{},
                       'gain':{},
                       'gainToGoalPerc':{},
                       'generalPlayTurnovers':{},
                       'goal1':{},
                       'goal2':{},
                       'goalAssists':{},
                       'goalAttempts':{},
                       'goalMisses':{},
                       'goal_from_zone1':{},
                       'goal_from_zone2':{},
                       'goals':{},
                       'interceptPassThrown':{},
                       'intercepts':{},
                       'obstructionPenalties':{},
                       'offsides':{},
                       'passes':{},
                       'penalties':{},
                       'pickups':{}}

league_average_rating = Rating(mu = 1000, sigma = 333.333)

for i, player in df_playerStats.iterrows():
  position = df_playerStats.loc[i, 'Position']
  row_means = df_means.loc[df_means['Position'] == position]

  # Positive statistics


  for statistic in ['attempt1', 'attempt2', 'attempt_from_zone1', 'attempt_from_zone2',
                    'gain', 'gainToGoalPerc','generalPlayTurnovers', 'goal1', 'goal2',
                    'goalAssists', 'goalAttempts', 'goal_from_zone1', 'goal_from_zone2',
                    'goals', 'interceptPassThrown', 'intercepts']: # Check and add other positive statistics

    # Accessing player rating for statistic from dictionary or creating if non-existent

    if player['playerId'] in dict_player_ratings[statistic]:
      player_rating = dict_player_ratings[statistic][player['playerId']]
    else:
      player_rating = Rating(mu = 1000, sigma = 333.333)

    # Rating players

    if player[statistic] > row_means.iloc[0][statistic]:
      player_rating,_ = rate_1vs1(player_rating, league_average_rating)

    elif player[statistic] < row_means.iloc[0][statistic]:
      player_rating,_ = rate_1vs1(player_rating, league_average_rating)

    else:
      player_rating,_ =  rate_1vs1(player_rating, league_average_rating, drawn = True)

    dict_player_ratings[statistic][player['playerId']] = player_rating


    # Negative statistics


  for statistic in ['blocked', 'badHands', 'badPasses', 'goalMisses']: #Add other negative statistics

    # Accessing player rating for statistic from dictionary or creating if non-existent

    if player['playerId'] in dict_player_ratings[statistic]:
      player_rating = dict_player_ratings[statistic][player['playerId']]
    else:
      player_rating = Rating(mu = 1000, sigma = 333.333)

    # Rating players

    if player[statistic] < row_means.iloc[0][statistic]:
      player_rating,_ = rate_1vs1(player_rating, league_average_rating)

    elif player[statistic] > row_means.iloc[0][statistic]:
      player_rating,_ = rate_1vs1(player_rating, league_average_rating)

    else:
      player_rating,_ =  rate_1vs1(player_rating, league_average_rating, drawn = True)

    dict_player_ratings[statistic][player['playerId']] = player_rating





In [45]:
dict_player_ratings['goals']

{80078: trueskill.Rating(mu=1188.284, sigma=275.190),
 991905: trueskill.Rating(mu=1000.000, sigma=235.718),
 1000831: trueskill.Rating(mu=1000.000, sigma=235.718),
 1004472: trueskill.Rating(mu=1000.000, sigma=235.718),
 1005495: trueskill.Rating(mu=1000.000, sigma=235.718),
 1006231: trueskill.Rating(mu=1000.000, sigma=235.718),
 1007219: trueskill.Rating(mu=1000.000, sigma=235.718),
 1007220: trueskill.Rating(mu=1000.000, sigma=235.718),
 1007306: trueskill.Rating(mu=1188.284, sigma=275.190),
 80010: trueskill.Rating(mu=1000.000, sigma=235.718),
 1004473: trueskill.Rating(mu=1000.000, sigma=235.718),
 1007301: trueskill.Rating(mu=1000.000, sigma=235.718),
 1010545: trueskill.Rating(mu=1188.284, sigma=275.190),
 1011747: trueskill.Rating(mu=1000.000, sigma=235.718),
 1013050: trueskill.Rating(mu=1000.000, sigma=235.718),
 1014126: trueskill.Rating(mu=1188.284, sigma=275.190),
 1021262: trueskill.Rating(mu=1000.000, sigma=235.718),
 80439: trueskill.Rating(mu=1000.000, sigma=235.718),