### Data Cleaning & Preprocessing

Import packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Load data

In [2]:
# Read the raw match statistics from the dataset folder (path is relative to
# the notebook's working directory) and preview the first five rows.
df_data = pd.read_csv(filepath_or_buffer="../dataset/data.csv")
df_data.head(5)

Unnamed: 0,date,home,away,home_goal,away_goal,home_shot,away_shot,home_shot_on_goal,away_shot_on_goal,home_possesion_pct,away_possesion_pct,home_corner,away_corner
0,2013-01-01,West Ham United,Norwich City,2,1,16,8,5,2,0.5,0.5,11,3
1,2013-01-01,Tottenham Hotspur,Reading,3,1,34,7,11,3,0.68,0.32,12,3
2,2013-01-01,Southampton,Arsenal,1,1,9,6,5,1,0.44,0.56,3,3
3,2013-01-01,Swansea City,Aston Villa,2,2,25,9,6,5,0.69,0.31,3,2
4,2013-01-01,Manchester City,Stoke City,3,0,20,5,6,1,0.62,0.38,7,2


In [3]:
# Print the frame summary (row count, column names, non-null counts, dtypes)
# to check the schema right after loading.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3639 entries, 0 to 3638
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   date                3639 non-null   object 
 1   home                3639 non-null   object 
 2   away                3639 non-null   object 
 3   home_goal           3639 non-null   int64  
 4   away_goal           3639 non-null   int64  
 5   home_shot           3639 non-null   int64  
 6   away_shot           3639 non-null   int64  
 7   home_shot_on_goal   3639 non-null   int64  
 8   away_shot_on_goal   3639 non-null   int64  
 9   home_possesion_pct  3639 non-null   float64
 10  away_possesion_pct  3639 non-null   float64
 11  home_corner         3639 non-null   int64  
 12  away_corner         3639 non-null   int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 369.7+ KB


Count missing values per column

In [4]:
# Per-column count of missing entries: isna() flags each null cell and the
# column-wise sum() totals the flags.
df_data.isna().sum()

date                  0
home                  0
away                  0
home_goal             0
away_goal             0
home_shot             0
away_shot             0
home_shot_on_goal     0
away_shot_on_goal     0
home_possesion_pct    0
away_possesion_pct    0
home_corner           0
away_corner           0
dtype: int64

Remove duplicate rows

In [5]:
# drop_duplicates() returns a NEW DataFrame and leaves the original untouched,
# so the result must be assigned back; calling it without assignment silently
# keeps every duplicate row. The index is reset so it stays a contiguous
# 0..n-1 range after rows are removed.
df_data = df_data.drop_duplicates().reset_index(drop=True)
df_data.head()

Unnamed: 0,date,home,away,home_goal,away_goal,home_shot,away_shot,home_shot_on_goal,away_shot_on_goal,home_possesion_pct,away_possesion_pct,home_corner,away_corner
0,2013-01-01,West Ham United,Norwich City,2,1,16,8,5,2,0.5,0.5,11,3
1,2013-01-01,Tottenham Hotspur,Reading,3,1,34,7,11,3,0.68,0.32,12,3
2,2013-01-01,Southampton,Arsenal,1,1,9,6,5,1,0.44,0.56,3,3
3,2013-01-01,Swansea City,Aston Villa,2,2,25,9,6,5,0.69,0.31,3,2
4,2013-01-01,Manchester City,Stoke City,3,0,20,5,6,1,0.62,0.38,7,2


In [6]:
# Print the frame summary again (row count, non-null counts, dtypes) following
# the de-duplication cell.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3639 entries, 0 to 3638
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   date                3639 non-null   object 
 1   home                3639 non-null   object 
 2   away                3639 non-null   object 
 3   home_goal           3639 non-null   int64  
 4   away_goal           3639 non-null   int64  
 5   home_shot           3639 non-null   int64  
 6   away_shot           3639 non-null   int64  
 7   home_shot_on_goal   3639 non-null   int64  
 8   away_shot_on_goal   3639 non-null   int64  
 9   home_possesion_pct  3639 non-null   float64
 10  away_possesion_pct  3639 non-null   float64
 11  home_corner         3639 non-null   int64  
 12  away_corner         3639 non-null   int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 369.7+ KB


Add a `result` column to the dataframe (3: home win, 1: draw, 0: away win)

In [7]:
# Encode the match outcome from the home side's perspective:
#   3 -> home scored more, 1 -> equal goals, 0 -> away scored more.
# Start every row as a draw, then overwrite the rows where one side scored more.
df_data['result'] = 1
df_data.loc[df_data['home_goal'].gt(df_data['away_goal']), 'result'] = 3
df_data.loc[df_data['home_goal'].lt(df_data['away_goal']), 'result'] = 0
df_data.head(5)

Unnamed: 0,date,home,away,home_goal,away_goal,home_shot,away_shot,home_shot_on_goal,away_shot_on_goal,home_possesion_pct,away_possesion_pct,home_corner,away_corner,result
0,2013-01-01,West Ham United,Norwich City,2,1,16,8,5,2,0.5,0.5,11,3,3
1,2013-01-01,Tottenham Hotspur,Reading,3,1,34,7,11,3,0.68,0.32,12,3,3
2,2013-01-01,Southampton,Arsenal,1,1,9,6,5,1,0.44,0.56,3,3,1
3,2013-01-01,Swansea City,Aston Villa,2,2,25,9,6,5,0.69,0.31,3,2,1
4,2013-01-01,Manchester City,Stoke City,3,0,20,5,6,1,0.62,0.38,7,2,3


In [8]:
# Print the frame summary once more to confirm the new `result` column is
# present and to see its dtype and non-null count.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3639 entries, 0 to 3638
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   date                3639 non-null   object 
 1   home                3639 non-null   object 
 2   away                3639 non-null   object 
 3   home_goal           3639 non-null   int64  
 4   away_goal           3639 non-null   int64  
 5   home_shot           3639 non-null   int64  
 6   away_shot           3639 non-null   int64  
 7   home_shot_on_goal   3639 non-null   int64  
 8   away_shot_on_goal   3639 non-null   int64  
 9   home_possesion_pct  3639 non-null   float64
 10  away_possesion_pct  3639 non-null   float64
 11  home_corner         3639 non-null   int64  
 12  away_corner         3639 non-null   int64  
 13  result              3639 non-null   int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 398.1+ KB


Save the cleaned data to a CSV file

In [9]:
# Write the processed frame to disk; index=False leaves the row index out of
# the file so only the data columns are written.
df_data.to_csv(
    path_or_buf="../dataset/cleaned_data.csv",
    index=False,
)