In [1]:
from os import path
import pandas as pd

# Folder that holds the CSV files read by this notebook
DATA_DIR = '../resources/code-soccer-files-main/data'


def _load_table(filename):
    """Read one CSV file located in DATA_DIR into a DataFrame (default parser options)."""
    return pd.read_csv(path.join(DATA_DIR, filename))


# player-match level table
pg = _load_table('player_match.csv')
# match info
games = _load_table('matches.csv')
# player info
player = _load_table('players.csv')

In [3]:
# Merge two DataFrames on one shared column: bring birth_date from the
# player table onto the player-match rows, matching on player_id
birth_dates = player[['player_id', 'birth_date']]
pg.merge(birth_dates, on='player_id').head(5)

Unnamed: 0,name,team,min,shot,goal,goal_allowed,assist,player_id,match_id,date,...,air_duel_won,gk_leave_line,gk_save_attempt,throw,corner,pos,side,player_rank,started,birth_date
0,D. Cheryshev,Russia,66.0,3,2,0,0,4513,2057954,20180614,...,2,0,0,0,0,MID,left,0.0405,False,19901226
1,D. Cheryshev,Russia,74.0,4,1,0,0,4513,2057956,20180619,...,2,0,0,3,1,MID,left,0.0162,True,19901226
2,D. Cheryshev,Russia,38.0,1,0,0,0,4513,2057958,20180625,...,0,0,0,1,1,MID,left,0.0064,True,19901226
3,D. Cheryshev,Russia,59.0,0,1,0,0,4513,2058004,20180701,...,0,0,0,0,0,MID,central,-0.0026,False,19901226
4,D. Cheryshev,Russia,66.0,2,1,0,0,4513,2058012,20180707,...,2,0,0,1,0,MID,left,-0.0063,True,19901226


In [4]:
# Merge on several columns at once: here the pair (match_id, player_id)
key_cols = ['match_id', 'player_id']
pass_df = pg[key_cols + ['pass', 'assist']]
shot_df = pg[key_cols + ['shot', 'goal']]

combined = pass_df.merge(shot_df, on=key_cols)
combined.head()

Unnamed: 0,match_id,player_id,pass,assist,shot,goal
0,2057954,4513,22,0,3,2
1,2057954,41123,26,0,0,0
2,2057954,101576,17,0,0,0
3,2057954,101583,26,0,0,0
4,2057954,101590,8,0,0,0


In [6]:
# 1:1, 1:many (or many:1), or many:many

# Check whether the key repeats on each side to learn the relationship before merging
print(player['player_id'].duplicated().any()) # player level
print(combined['player_id'].duplicated().any()) # player-game level

# many:1 -- several player-game rows (left) map onto a single player row (right).
# The key is named explicitly instead of relying on the implicit
# "every shared column" default, and validate makes pandas raise a MergeError
# if player_id ever stops being unique in the right-hand table.
pd.merge(combined, player[['player_id', 'player_name', 'pos', 'team']],
         on='player_id', validate='many_to_one').head()

False
True


Unnamed: 0,match_id,player_id,pass,assist,shot,goal,player_name,pos,team
0,2057954,4513,22,0,3,2,D. Cheryshev,MID,Russia
1,2057956,4513,28,0,4,1,D. Cheryshev,MID,Russia
2,2057958,4513,10,0,1,0,D. Cheryshev,MID,Russia
3,2058004,4513,6,0,0,1,D. Cheryshev,MID,Russia
4,2058012,4513,18,0,2,1,D. Cheryshev,MID,Russia


In [8]:
# unmatched observations
# goal_df: player-match rows with at least one goal
# assist_df: player-match rows with at least one assist
goal_df = pg.loc[pg['goal'] > 0, ['match_id', 'player_id', 'goal']]
assist_df = pg.loc[pg['assist'] > 0, ['match_id', 'player_id', 'assist']]

print(goal_df.shape)
print(assist_df.shape)

# Inner merge (the default): keeps only player-match rows that have BOTH
# at least one goal and at least one assist in that same match; rows found
# in only one of the two tables are dropped.
# The key is spelled out so the join cannot silently change if the two
# frames ever share a different set of columns.
comb_inner = pd.merge(goal_df, assist_df, on=['match_id', 'player_id'])
print(comb_inner.shape)
print(comb_inner.head())


(159, 3)
(82, 3)
(10, 4)
   match_id  player_id  goal  assist
0   2057954     257800     1       2
1   2057977      69400     1       1
2   2057992      25776     1       1
3   2057995      25776     1       1
4   2057999      14836     1       1


In [10]:
# Left merge: every row of goal_df is kept; assist is NaN wherever that
# (match_id, player_id) pair has no row in assist_df.
# The key is explicit so the merge fails loudly (KeyError) rather than
# joining on some other set of shared columns if goal_df/assist_df were
# redefined with different column names by another cell.
comb_left = pd.merge(goal_df, assist_df, on=['match_id', 'player_id'], how='left')
print(comb_left.shape)
comb_left.head()

(159, 4)


Unnamed: 0,match_id,player_id,goal,assist
0,2057954,4513,2,
1,2057954,101669,1,
2,2057954,102157,1,
3,2057954,257800,1,2.0
4,2057955,227894,1,


In [11]:
# Outer merge: keeps every row from both tables, matched or not.
# The key is explicit so the join does not depend on which columns the two
# frames happen to share at the time the cell is run.
comb_outer = pd.merge(goal_df, assist_df, on=['match_id', 'player_id'],
                      how='outer', indicator=True)
print(comb_outer.shape)
print()

# indicator=True adds a column named _merge that tells where each row was
# found: left_only, right_only or both
print(comb_outer['_merge'].value_counts())

(231, 5)

left_only     149
right_only     72
both           10
Name: _merge, dtype: int64


In [13]:
# DataFrame.query filters rows using a string expression

# rows found in both tables, counted per player
matched_rows = comb_outer.query("_merge == 'both'")
print(matched_rows['player_id'].value_counts())
print()

# look up a single player by id
player_25776 = player.query("player_id == 25776")
print(player_25776[['player_name', 'pos', 'team']])

25776     2
257800    1
69400     1
14836     1
20751     1
3682      1
8287      1
101590    1
14812     1
Name: player_id, dtype: int64

    player_name  pos     team
454   W. Khazri  MID  Tunisia


In [20]:
# left_on and right_on
# Give the player key a different name on each side so the two tables
# no longer share it. rename() maps by column name, which is safer than
# overwriting .columns positionally.
goal_df = (pg.loc[pg['goal'] > 0, ['match_id', 'player_id', 'goal']]
           .rename(columns={'player_id': 'scorer_id'}))

assist_df = (pg.loc[pg['assist'] > 0, ['match_id', 'player_id', 'assist']]
             .rename(columns={'player_id': 'passer_id'}))

# different column names: pair the key columns up by position
ga_df = pd.merge(goal_df, assist_df, left_on=['match_id', 'scorer_id'],
                 right_on=['match_id', 'passer_id'])
print(ga_df.head())
print()

# scorer_id and passer_id hold the same values after the merge:
# drop one and give the other its common name back.
# Reassigning the result (instead of inplace=True) keeps the steps chainable
# and avoids mutating a frame that was already displayed.
ga_df = (ga_df
         .drop(columns='passer_id')
         .rename(columns={'scorer_id': 'player_id'}))

print(ga_df.head())

   match_id  scorer_id  goal  passer_id  assist
0   2057954     257800     1     257800       2
1   2057977      69400     1      69400       1
2   2057992      25776     1      25776       1
3   2057995      25776     1      25776       1
4   2057999      14836     1      14836       1

   match_id  player_id  goal  assist
0   2057954     257800     1       2
1   2057977      69400     1       1
2   2057992      25776     1       1
3   2057995      25776     1       1
4   2057999      14836     1       1


In [21]:
# merging on index

# each player's maximum number of goals in a single game;
# the groupby key (scorer_id) becomes the index of the result
max_goals = (goal_df
               .groupby('scorer_id')
               .agg(max_goals=('goal', 'max')))

print(max_goals.head())
print()

# share of players for each value of max_goals
print(max_goals.value_counts(normalize=True))
print()
# merging back into original goal_df
print(pd.merge(goal_df, max_goals, left_on='scorer_id', right_index=True).head()) # one regular column, one index
# NOTE: when a column is merged against an index the result is NOT renumbered
# 0..n-1 -- index labels are carried over from the inputs (see the printed
# index). Chain .reset_index(drop=True) if a fresh index is wanted.

           max_goals
scorer_id           
48                 1
122                1
123                1
261                1
3304               1

max_goals
1            0.905983
2            0.076923
3            0.017094
dtype: float64

      match_id  scorer_id  goal  max_goals
0      2057954       4513     2          2
56     2057956       4513     1          2
1290   2058004       4513     1          2
1503   2058012       4513     1          2
5      2057954     101669     1          1


In [23]:
# pd.concat combines DataFrames that are indexed the same way

def _positive_stat(stat):
    """Rows of pg where `stat` > 0, reduced to that stat and indexed by (match_id, player_id)."""
    keys = ['match_id', 'player_id']
    return pg.loc[pg[stat] > 0, keys + [stat]].set_index(keys)


goal_df = _positive_stat('goal')
assist_df = _positive_stat('assist')

# concatenating horizontally (axis=1): rows are lined up on the index
c_df = pd.concat([goal_df, assist_df], axis=1)
print(c_df.head())
print()

tackle_df = _positive_stat('tackle')

# any number of frames can be concatenated in one call
three_way = pd.concat([goal_df, assist_df, tackle_df], axis=1)
print(three_way.head())


                    goal  assist
match_id player_id              
2057954  4513        2.0     NaN
         101669      1.0     NaN
         102157      1.0     NaN
         257800      1.0     2.0
2057955  227894      1.0     NaN

                    goal  assist  tackle
match_id player_id                      
2057954  4513        2.0     NaN     NaN
         101669      1.0     NaN     NaN
         102157      1.0     NaN     1.0
         257800      1.0     2.0     1.0
2057955  227894      1.0     NaN     NaN


In [24]:
# Concatenating vertically - the default for pd.concat

is_mid = pg['pos'] == 'MID'
is_fwd = pg['pos'] == 'FWD'
mids = pg.loc[is_mid]
fwds = pg.loc[is_fwd]

# shapes of each piece and of the stacked result
for frame in (mids, fwds, pd.concat([mids, fwds])):
    print(frame.shape)
print()

# resetting the index of both pieces for demonstration (0,1,2...)
mids_reset, fwds_reset = (piece.reset_index(drop=True) for piece in (mids, fwds))

# stacked as-is, the same index labels show up once per piece (duplicates)
stacked = pd.concat([mids_reset, fwds_reset])
print(stacked.sort_index().head())
print()

# ignore_index=True ignores the incoming indices and renumbers the result
renumbered = pd.concat([mids_reset, fwds_reset], ignore_index=True)
print(renumbered.sort_index().head())

(637, 32)
(384, 32)
(1021, 32)

           name    team   min  shot  goal  goal_allowed  assist  player_id  \
0  D. Cheryshev  Russia  66.0     3     2             0       0       4513   
0     A. Dzyuba  Russia  20.0     1     1             0       0     101669   
1     F. Smolov  Russia  70.0     0     0             0       0     101707   
1    A. Dzagoev  Russia  24.0     0     0             0       0     101590   
2    A. Samedov  Russia  64.0     2     0             0       0     101699   

   match_id      date  ...  air_duel  air_duel_won  gk_leave_line  \
0   2057954  20180614  ...         2             2              0   
0   2057954  20180614  ...         2             1              0   
1   2057954  20180614  ...         3             1              0   
1   2057954  20180614  ...         2             2              0   
2   2057954  20180614  ...         5             2              0   

   gk_save_attempt  throw  corner  pos     side  player_rank  started  
0           