In [1]:
from os import path
import pandas as pd

# Folder holding the CSV files, relative to this notebook
DATA_DIR = '../resources/code-soccer-files-main/data'

# Load the shots table into a DataFrame
shots_csv = path.join(DATA_DIR, 'shots.csv')
shots = pd.read_csv(shots_csv)


In [2]:
# Per-match totals of the shot indicator columns.
# A constant 1 on every row lets a plain sum act as a row count per match.
shots['attempt'] = 1
sum_cols = ['goal', 'attempt', 'accurate', 'counter', 'opportunity']

# Pick the columns *before* summing. Summing the whole grouped frame first
# also aggregates columns we never look at, including string columns such as
# 'name' and 'foot'; depending on the pandas version that is wasted work,
# a numeric_only deprecation warning, or an outright error.
shots.groupby('match_id')[sum_cols].sum().head()

  shots.groupby('match_id').sum()[sum_cols].head()


Unnamed: 0_level_0,goal,attempt,accurate,counter,opportunity
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2057954,4,18,6,0,10
2057955,1,18,7,1,14
2057956,2,24,4,0,17
2057957,1,20,6,3,14
2057958,1,15,6,1,11


In [3]:
# The same kind of per-match summary, this time through .agg with a
# {column: function name} mapping
per_match_spec = {
    'goal': 'sum',
    'attempt': 'count',
    'dist_m': 'mean',
    'dist_ft': 'mean',
}
by_match = shots.groupby('match_id')
by_match.agg(per_match_spec).head()

Unnamed: 0_level_0,goal,attempt,dist_m,dist_ft
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2057954,4,18,16.935885,55.549704
2057955,1,18,19.086773,62.604615
2057956,2,24,18.302238,60.03134
2057957,1,20,16.187288,53.094305
2057958,1,15,20.81436,68.271101


In [4]:
# Named aggregation: each key is the output column name and each value is a
# (source column, function) tuple
named_specs = {
    'goal': ('goal', 'sum'),
    'attempt': ('attempt', 'count'),
    'ave_dist_m': ('dist_m', 'mean'),
    'ave_dist_ft': ('dist_ft', 'mean'),
}
shots.groupby('match_id').agg(**named_specs).head()

Unnamed: 0_level_0,goal,attempt,ave_dist_m,ave_dist_ft
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2057954,4,18,16.935885,55.549704
2057955,1,18,19.086773,62.604615
2057956,2,24,18.302238,60.03134
2057957,1,20,16.187288,53.094305
2057958,1,15,20.81436,68.271101


In [5]:
# Grouping on two keys gives one row per (match_id, team_id) pair, and the
# result is indexed by a two-level MultiIndex
team_keys = ['match_id', 'team_id']
team_specs = {
    'goal': ('goal', 'sum'),
    'attempt': ('attempt', 'count'),
    'ave_dist_m': ('dist_m', 'mean'),
    'min_dist_m': ('dist_m', 'min'),
    'max_dist_ft': ('dist_ft', 'max'),
}
shots_team = shots.groupby(team_keys).agg(**team_specs)

shots_team.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,goal,attempt,ave_dist_m,min_dist_m,max_dist_ft
match_id,team_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2057954,14358,4,11,15.320402,6.623716,108.628943
2057954,16521,0,7,19.474503,9.509131,113.86797
2057955,15670,1,11,17.965852,8.356085,110.011571
2057955,16129,0,7,20.848221,11.26675,96.462566
2057956,14358,2,12,20.53672,6.206842,107.08812


In [6]:
# With a MultiIndex, .loc takes one tuple per wanted row; here each tuple
# is (match_id, team_id), matching the groupby keys of shots_team
wanted_rows = [
    (2057954, 14358),
    (2058017, 4418),
]
shots_team.loc[wanted_rows]

Unnamed: 0_level_0,Unnamed: 1_level_0,goal,attempt,ave_dist_m,min_dist_m,max_dist_ft
match_id,team_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2057954,14358,4,11,15.320402,6.623716,108.628943
2058017,4418,2,7,19.859032,10.681969,90.257872


In [8]:
# Stacking and unstacking
# Mean shot distance for each (player, foot) pair, left/right foot only
footed_shots = shots.query("foot in ('left', 'right')")
fd = (
    footed_shots
    .groupby(['name', 'foot'])['dist_m']
    .mean()
    .reset_index()
)
print(fd.head())
print()

# Long layout (one row per player and foot) -> wide layout (one row per
# player, one column per foot); pairs with no data become NaN
fd_indexed = fd.set_index(['name', 'foot'])
fd_reshaped = fd_indexed.unstack()
print(fd_reshaped.head())

          name   foot     dist_m
0     A. Badri   left  30.932028
1  A. Carrillo   left  16.712171
2  A. Carrillo  right  27.723482
3    A. Cooper  right  26.696061
4    A. Dzyuba   left  16.191779

                dist_m           
foot              left      right
name                             
A. Badri     30.932028        NaN
A. Carrillo  16.712171  27.723482
A. Cooper          NaN  26.696061
A. Dzyuba    16.191779  11.133510
A. Ekdal           NaN  19.647886


In [13]:
# Replace the two-level column header produced by unstack() with flat labels
fd_reshaped.columns = ['left', 'right']

# Mean of the per-player (right - left) gap in average shot distance.
# With pandas' default NaN skipping, players missing either foot do not
# contribute to this mean.
print(f"average difference: {(fd_reshaped['right'] - fd_reshaped['left']).mean()}")
print()

# For each player, the foot with the larger average distance, then a tally.
# NOTE: NaN is skipped by default, so a player with shots from only one foot
# is counted toward that foot.
further_foot = fd_reshaped.idxmax(axis='columns')
print(further_foot.value_counts())
print()

# stack() moves the column labels back into an inner index level,
# reversing the unstack
fd_reshaped_undo = fd_reshaped.stack()
print(fd_reshaped_undo.head())

average difference: 0.09521678124840466

right    197
left     143
dtype: int64

name              
A. Badri     left     30.932028
A. Carrillo  left     16.712171
             right    27.723482
A. Cooper    right    26.696061
A. Dzyuba    left     16.191779
dtype: float64
