In [1]:
from os import path
import pandas as pd

# Directory where data is stored
DATA_DIR = '../resources/code-soccer-files-main/data'

# Load matches.csv; the match_id column is used as the DataFrame index
dfm = pd.read_csv(path.join(DATA_DIR, 'matches.csv'), index_col='match_id')

In [4]:
# Label-based selection with .loc (the index holds match_id values)

# One index value -> the matching row
championship_id = 2058017
print(dfm.loc[championship_id], end='\n\n')

# A list of index values -> one row per requested match_id
group_a_ids = [
    2057959, 2057958, 2057957,
    2057956, 2057955, 2057954,
]
print(dfm.loc[group_a_ids], end='\n\n')

# Same rows, restricted to a chosen set of columns
print(
    dfm.loc[group_a_ids, ['label', 'group', 'venue']],
    end='\n\n',
)


label              France - Croatia, 4 - 2
group                                  NaN
date                   2018-07-15 15:00:00
venue         Olimpiyskiy stadion Luzhniki
dur                                Regular
gameweek                                 0
round_id                           4165368
home                                  4418
away                                  9598
winner                                4418
loser                                 9598
ref                                 378051
ref2                                378038
ref3                                378060
ref4                                377215
home_score                               4
away_score                               2
home_team                           France
away_team                          Croatia
day                                     31
Name: 2058017, dtype: object

                                  label    group                 date  \
match_id                             

In [7]:
# Boolean indexing

# Step 1: build a True/False mask, True where the match's group is 'Group B'
is_group_b = dfm['group'].eq('Group B')
# Step 2: pass the mask to .loc to keep only the rows marked True
dfm_b = dfm.loc[is_group_b]
print(dfm_b[['label', 'group', 'venue']].head(), end='\n\n')

# Same pattern for Group G, with the mask written inline
dfm_g = dfm.loc[dfm['group'].eq('Group G')]
print(dfm_g[['label', 'group', 'venue']].head())

                              label    group                         venue
match_id                                                                  
2057964      Iran - Portugal, 1 - 1  Group B                Mordovia Arena
2057965      Spain - Morocco, 2 - 2  Group B           Kaliningrad Stadium
2057962   Portugal - Morocco, 1 - 0  Group B  Olimpiyskiy stadion Luzhniki
2057963         Iran - Spain, 0 - 1  Group B                  Kazan' Arena
2057960     Portugal - Spain, 3 - 3  Group B     Olimpiyskiy Stadion Fisht

                             label    group                      venue
match_id                                                              
2057994   England - Belgium, 0 - 1  Group G        Kaliningrad Stadium
2057991   Tunisia - England, 1 - 2  Group G            Volgograd Arena
2057992   Belgium - Tunisia, 5 - 2  Group G            Otkrytiye Arena
2057995    Panama - Tunisia, 1 - 2  Group G             Mordovia Arena
2057990    Belgium - Panama, 3 - 0  Group G  Oli

In [11]:
# Working with duplicates

# Remove fully duplicated rows; inplace=True modifies dfm itself
dfm.drop_duplicates(inplace=True)

# De-duplicate on the 'venue' column only (dfm is not modified here),
# then show three descriptive columns of the result
print(
    dfm.drop_duplicates(subset='venue')[['label', 'group', 'venue']],
    end='\n\n',
)

# Per-row flags: is this whole row a repeat of an earlier one?
print(dfm.duplicated().head(), end='\n\n')

# Same check restricted to the values of the 'group' column
print(dfm['group'].duplicated().head())

                                  label    group                         venue
match_id                                                                      
2058017         France - Croatia, 4 - 2      NaN  Olimpiyskiy stadion Luzhniki
2058012     Russia - Croatia, 2 - 2 (P)      NaN     Olimpiyskiy Stadion Fisht
2057977        Iceland - Croatia, 1 - 2  Group D                  Rostov Arena
2057974      Argentina - Croatia, 0 - 3  Group D       Stadion Nizhny Novgorod
2058014         France - Belgium, 1 - 0      NaN           Stadion Krestovskyi
2058011         Brazil - Belgium, 1 - 2      NaN                  Kazan' Arena
2057994        England - Belgium, 0 - 1  Group G           Kaliningrad Stadium
2057968            France - Peru, 1 - 0  Group C           Stadion Central'nyj
2058013         Sweden - England, 0 - 2      NaN                  Samara Arena
2058009   Colombia - England, 1 - 1 (P)      NaN               Otkrytiye Arena
2057991        Tunisia - England, 1 - 2  Group G    

In [12]:
import numpy as np

# Combining filtering with changing columns

# Create the column as object dtype filled with NaN. The labels written below
# are strings; starting from a plain float NaN column would rely on pandas
# silently upcasting the column on assignment, which is deprecated
# (FutureWarning since pandas 2.1, slated to become an error).
dfm['home_away_desc'] = np.full(len(dfm), np.nan, dtype=object)

# Each .loc assignment only touches the rows selected by its mask. They run in
# sequence, so where masks overlap the later assignment wins.
dfm.loc[dfm['home'] == dfm['winner'], 'home_away_desc'] = 'home team won!'
dfm.loc[dfm['away'] == dfm['winner'], 'home_away_desc'] = 'away team won!'
# A winner value of 0 is labelled as a tie
dfm.loc[dfm['winner'] == 0, 'home_away_desc'] = 'tied!'

# Number of matches per label (rows still NaN are not counted by default)
dfm['home_away_desc'].value_counts()

away team won!    27
home team won!    26
tied!              8
Name: home_away_desc, dtype: int64

In [13]:
# Filtering rows with DataFrame.query (conditions written as strings)

# Compare a column against a string literal
print(dfm.query("group == 'Group A'").head(), end='\n\n')

# Store a boolean column on dfm, then use its name as the whole condition
dfm['is_group_b'] = dfm['group'].eq('Group B')
print(dfm.query("is_group_b").head(), end='\n\n')

# Call a method on a column inside the query: rows where 'group' is missing
print(dfm.query("group.isnull()")[['label', 'group', 'venue']].head())

                                  label    group                 date  \
match_id                                                                
2057956           Russia - Egypt, 3 - 1  Group A  2018-06-19 18:00:00   
2057959     Saudi Arabia - Egypt, 2 - 1  Group A  2018-06-25 14:00:00   
2057954    Russia - Saudi Arabia, 5 - 0  Group A  2018-06-14 15:00:00   
2057957   Uruguay - Saudi Arabia, 1 - 0  Group A  2018-06-20 15:00:00   
2057958         Uruguay - Russia, 3 - 0  Group A  2018-06-25 14:00:00   

                                 venue      dur  gameweek  round_id   home  \
match_id                                                                     
2057956            Stadion Krestovskyi  Regular         2   4165363  14358   
2057959                Volgograd Arena  Regular         3   4165363  16521   
2057954   Olimpiyskiy stadion Luzhniki  Regular         1   4165363  14358   
2057957                   Rostov Arena  Regular         2   4165363  15670   
2057958             