In [2]:
from os import path
import pandas as pd

# Folder holding the CSV files used by this notebook
# (a relative path, so it is resolved against the working directory).
DATA_DIR = '../resources/code-soccer-files-main/data'

# Load the player-match table into `pg` and list the columns it provides.
pg = pd.read_csv(
    filepath_or_buffer=path.join(DATA_DIR, 'player_match.csv'),
)
print(f'columns: {pg.columns}')

columns: Index(['name', 'team', 'min', 'shot', 'goal', 'goal_allowed', 'assist',
       'player_id', 'match_id', 'date', 'pass', 'pass_accurate', 'tackle',
       'accel', 'counter', 'opportunity', 'keypass', 'own_goal',
       'interception', 'smart', 'clearance', 'cross', 'air_duel',
       'air_duel_won', 'gk_leave_line', 'gk_save_attempt', 'throw', 'corner',
       'pos', 'side', 'player_rank', 'started'],
      dtype='object')


In [3]:
# creating and modifying columns
# Assigning a scalar to a new column name creates the column and gives every
# row that value.
pg['yellow_cards'] = 1
# Only the last expression of a cell is displayed automatically, so the
# intermediate state must be printed explicitly to be visible.
print(pg[['name', 'min', 'yellow_cards']].head())

# Assigning to an existing column name overwrites its values.
pg['yellow_cards'] = 2
pg[['name', 'min', 'yellow_cards']].head()

Unnamed: 0,name,min,yellow_cards
0,D. Cheryshev,66.0,2
1,Mário Fernandes,90.0,2
2,I. Akinfeev,,2
3,S. Ignashevich,90.0,2
4,A. Dzagoev,24.0,2


In [4]:
import numpy as np

# Derived numeric columns: arithmetic between columns, element-wise
# functions, and a constant assigned to every row.
pg['shot_pct'] = (100 * pg['goal']) / pg['shot']  # NaN where goal and shot are both 0
pg['biggest_impact'] = pg['player_rank'].abs()
pg['ln_pass'] = np.log(pg['pass'])  # -inf (with a RuntimeWarning) where pass is 0
pg['goal_width_ft'] = 24

# Spot-check the new columns on five randomly chosen rows.
pg[[
    'name', 'shot', 'goal', 'shot_pct',
    'biggest_impact', 'ln_pass', 'goal_width_ft',
]].sample(5)

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,name,shot,goal,shot_pct,biggest_impact,ln_pass,goal_width_ft
1552,P. Jansson,0,0,,0.0008,0.0,24
713,S. Milinković-Savić,2,0,0.0,0.0006,3.988984,24
1606,M. Badelj,0,0,,,-inf,24
967,Y. Benalouane,0,0,,0.0037,2.70805,24
1455,H. Lloris,0,0,,,2.772589,24


In [5]:
# Boolean columns: a comparison yields a True/False value for every row.
pg['is_defender'] = pg['pos'].eq('DEF')
print(pg[['name', 'team', 'is_defender']].sample(5))

# Conditions combine with | (or) and & (and), and are negated with ~ (not).
pg['is_a_mid_or_fwd'] = pg['pos'].eq('MID') | pg['pos'].eq('FWD')
pg['balanced_off'] = pg['goal'].gt(0) & pg['assist'].gt(0)
pg['not_fr_or_eng'] = ~pg['team'].isin(['England', 'France'])

# Comparing several columns at once gives a boolean DataFrame.
print()
print(pg[['goal', 'assist']].gt(0).sample(5))

              name       team  is_defender
989      K. Walker    England         True
475      S. Agüero  Argentina        False
195   K. El Ahmadi    Morocco        False
1147       T. Inui      Japan        False
170   Sergio Ramos      Spain         True

       goal  assist
1345  False   False
654   False   False
1322  False    True
999   False   False
1181  False   False


In [6]:
# Applying functions to columns
def is_south_america(team):
  """Return True if `team` is on this notebook's list of South American sides.

  The comparison is an exact, case-sensitive match against the listed names.

  NOTE(review): 'Costa Rica' is on the list although it is usually classed as
  a Central American country -- confirm whether that is intended.
  """
  south_american_teams = (
      'Brazil', 'Uruguay', 'Colombia', 'Argentina', 'Costa Rica', 'Peru',
  )
  return team in south_american_teams

# Apply the named function to every value of the 'team' column.
pg['is_sa'] = pg['team'].apply(is_south_america)
print(pg[['name', 'team', 'is_sa']].sample(5))

# lambda function: the same membership check written inline.
# The names must form one flat list; wrapping them in a second pair of
# brackets would compare each team name against the whole list and always
# give False.
pg['is_sa_alternate'] = pg['team'].apply(lambda x: x in [
    'Brazil', 'Uruguay', 'Colombia', 'Argentina', 'Costa Rica', 'Peru'])

                    name          team  is_sa
1402          X. Shaqiri   Switzerland  False
469        J. Mascherano     Argentina   True
107   Mohammed Al Burayk  Saudi Arabia  False
590             Paulinho        Brazil   True
98      Salem Al Dawsari  Saudi Arabia  False


In [7]:
# Dropping Columns
pg.drop(columns=['is_sa_alternate'], inplace=True)

# Renaming Columns
# Assigning to .columns replaces every label at once (upper-case, then back
# to lower-case); rename() changes only the labels named in the mapping.
pg.columns = list(map(str.upper, pg.columns))
pg.columns = list(map(str.lower, pg.columns))
pg.rename({'min': 'minutes'}, axis='columns', inplace=True)

pg.head()

Unnamed: 0,name,team,minutes,shot,goal,goal_allowed,assist,player_id,match_id,date,...,yellow_cards,shot_pct,biggest_impact,ln_pass,goal_width_ft,is_defender,is_a_mid_or_fwd,balanced_off,not_fr_or_eng,is_sa
0,D. Cheryshev,Russia,66.0,3,2,0,0,4513,2057954,20180614,...,2,66.666667,0.0405,3.091042,24,False,True,False,True,False
1,Mário Fernandes,Russia,90.0,0,0,0,0,41123,2057954,20180614,...,2,,0.0001,3.258097,24,True,False,False,True,False
2,I. Akinfeev,Russia,,0,0,0,0,101576,2057954,20180614,...,2,,,2.833213,24,False,False,False,True,False
3,S. Ignashevich,Russia,90.0,0,0,0,0,101583,2057954,20180614,...,2,,0.0166,3.258097,24,True,False,False,True,False
4,A. Dzagoev,Russia,24.0,0,0,0,0,101590,2057954,20180614,...,2,,0.0039,2.079442,24,False,True,False,True,False


In [8]:
# missing data
# Recompute the ratio (here as a plain fraction, without the factor of 100
# used when the column was first created); 0 goals / 0 shots gives NaN.
pg['shot_pct'] = pg['goal']/pg['shot']
# A bare expression is only displayed when it is the last line of a cell,
# so this preview is printed explicitly.
print(pg[['name', 'team', 'goal', 'shot', 'shot_pct']].head(10))
print()

# True where the value is missing
print(pg['shot_pct'].isnull().head(10))
print()
# the complement: True where a value is present
print(pg['shot_pct'].notnull().head(10))
print()
# replace missing values with -99 (the result is only printed here, it is
# not assigned back to the column)
print(pg['shot_pct'].fillna(-99).head(10))

0    False
1     True
2     True
3     True
4     True
5    False
6    False
7     True
8    False
9    False
Name: shot_pct, dtype: bool

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7    False
8     True
9     True
Name: shot_pct, dtype: bool

0     0.666667
1   -99.000000
2   -99.000000
3   -99.000000
4   -99.000000
5     1.000000
6     0.000000
7   -99.000000
8     0.000000
9     1.000000
Name: shot_pct, dtype: float64


In [9]:
# Changing column types
# Peek at the raw values first; printed explicitly because only the last
# expression of a cell is displayed automatically.
print(pg['date'].sample(5))
print()

# Once a date is text, its parts can be sliced out by position:
# date = '20180618'
# year = date[0:4]
# month = date[4:6]
# day = date[6:8]

# Convert to text, then take characters 4-5 (the month) of every value.
pg['month'] = pg['date'].astype(str).str[4:6]
print(pg[['name', 'team', 'month', 'date']].sample(5))
print()
# The integer conversion is only printed, not assigned back to the column.
print(pg['month'].astype(int).sample(5))
print()
print(pg.dtypes.sample(5))

              name     team month      date
691   João Miranda   Brazil    06  20180627
1352        Neymar   Brazil    07  20180702
110   R. Bentancur  Uruguay    06  20180620
1114      K. Honda    Japan    06  20180619
771        M. Berg   Sweden    06  20180618

1563    7
1373    7
358     6
373     6
994     6
Name: month, dtype: int32

shot_pct        float64
is_sa              bool
goal_allowed      int64
month            object
balanced_off       bool
dtype: object


In [11]:
# Start again from the raw file and store the identifier-like columns as text.
pg = pd.read_csv(path.join(DATA_DIR, 'player_match.csv'))
id_like_cols = ['match_id', 'player_id', 'date']
pg[id_like_cols] = pg[id_like_cols].astype(str)

# axis
# axis='index' (0) aggregates down the rows    -> one mean per column
# axis='columns' (1) aggregates across columns -> one mean per row
stat_cols = ['shot', 'goal', 'assist', 'pass']
print(pg[stat_cols].mean(axis='index'))
print(pg[stat_cols].mean(axis='columns').head())



shot       0.817475
goal       0.104728
assist     0.050269
pass      31.599641
dtype: float64
0    6.75
1    6.50
2    4.25
3    6.50
4    2.00
dtype: float64


In [18]:
# summary functions

# Flag the rows in which a defender scored at least one goal.
pg['defender_scored'] = (pg['goal'] > 0) & (pg['pos'] == 'DEF')
# Averaging a boolean column counts True as 1 and False as 0.

# share of all rows (not only defenders' rows) in which a defender scored
defender_scored_pct = np.round(pg['defender_scored'].mean() * 100, 2)
print(f'{defender_scored_pct}%')

# did any row record more than 100 passes?
print(pg['pass'].gt(100).any())

# did every row record at least one pass?
print(pg['pass'].gt(0).all())

# row by row: more than 5 air duels won, or more than 5 interceptions
many_duels_or_interceptions = (
    pg[['air_duel_won', 'interception']].gt(5).any(axis=1)
)
print(many_duels_or_interceptions)

# how many rows satisfy that?
print(many_duels_or_interceptions.sum())

2.09%
True
False
0       False
1        True
2       False
3        True
4       False
        ...  
1666    False
1667    False
1668     True
1669    False
1670    False
Length: 1671, dtype: bool
332


In [20]:
# How often each individual value occurs in the 'team' column.
team_counts = pg['team'].value_counts()
print(team_counts.head(), end='\n\n')

# normalize=True reports proportions instead of raw counts.
print(pg['team'].value_counts(normalize=True).head(), end='\n\n')

# Frequency of every team/position combination:
# rows = unique team values, columns = unique pos values.
print(pd.crosstab(index=pg['team'], columns=pg['pos']).head())

Croatia    100
England     99
Belgium     94
France      83
Russia      72
Name: team, dtype: int64

Croatia    0.059844
England    0.059246
Belgium    0.056254
France     0.049671
Russia     0.043088
Name: team, dtype: float64

pos         DEF  FWD  GKP  MID
team                          
Argentina    16   16    4   17
Belgium      28   26    7   33
Brazil       22   19    5   24
Colombia     17   10    4   25
Costa Rica   17    6    3   16
