## Merge all dataframes

In [4]:
import pandas as pd
import numpy as np

import os

In [5]:
def merge_df(directory='.', exclude=('raw_data.csv',)):
    '''
    Description: Merge every CSV file found in a directory into one DataFrame.

    Input:
        - directory: folder to scan for '.csv' files. Defaults to the current
          working directory, which is what a no-argument call has always used.
        - exclude: file name(s) to skip. Defaults to this notebook's own output
          file so that re-running the notebook does not merge the previous
          result back into the scraped data.

    Output:
        - DataFrame with the rows of all the CSV files stacked in file-name
          order. Each file keeps its own row index. An empty DataFrame is
          returned when no CSV file is found.

    '''
    # Accept a single file name as well as a collection of names.
    if isinstance(exclude, str):
        exclude = (exclude,)
    skipped = set(exclude or ())

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the merged frame the same from run to run.
    csv_names = sorted(
        name for name in os.listdir(directory)
        if name.endswith('.csv') and name not in skipped
    )

    # Read everything first and concatenate once. DataFrame.append was removed
    # in pandas 2.0 and re-copied the accumulated frame on every iteration.
    # on_bad_lines='warn' replaces the removed error_bad_lines=False: malformed
    # lines are dropped and reported instead of aborting the read.
    frames = [
        pd.read_csv(os.path.join(directory, name), on_bad_lines='warn')
        for name in csv_names
    ]

    # pd.concat raises on an empty list, so handle "no files" explicitly.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [6]:
# Load every scraped CSV from the working directory into a single frame
df = merge_df()

In [7]:
# Summarize the columns of the merged frame (names, non-null counts, dtypes)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14000 entries, 0 to 675
Data columns (total 41 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    14000 non-null  int64  
 1   Player        14000 non-null  object 
 2   #             0 non-null      float64
 3   Nation        0 non-null      float64
 4   Pos           0 non-null      float64
 5   Age           0 non-null      float64
 6   Min           14000 non-null  float64
 7   Gls           14000 non-null  int64  
 8   Ast           14000 non-null  int64  
 9   PK            14000 non-null  int64  
 10  PKatt         14000 non-null  int64  
 11  Sh            14000 non-null  int64  
 12  SoT           14000 non-null  int64  
 13  CrdY          14000 non-null  int64  
 14  CrdR          14000 non-null  int64  
 15  Touches       14000 non-null  float64
 16  Press         14000 non-null  float64
 17  Tkl           14000 non-null  float64
 18  Int           14000 non-null

## Preprocessing

In [8]:
# Discard the columns that are entirely null in the merged frame
# ('#', 'Nation', 'Pos', 'Age') together with the ones that are not needed
# ('Unnamed: 0', 'Min').
cols = ['Unnamed: 0', '#', 'Nation', 'Pos', 'Age', 'Min']
df = df.drop(labels=cols, axis='columns')

In [9]:
# Arrange the columns: match-identifying fields first, then the statistics.
# reindex keeps only the names listed here; a listed name that is missing from
# the frame would come back as an all-NaN column rather than raise.
col_list = [
    'League', 'Season', 'Date', 'Match', 'Team', 'Home or Away', 'Player',
    'Possession', 'Save%',
    'Gls', 'Ast', 'PK', 'PKatt', 'Sh', 'SoT', 'CrdY', 'CrdR',
    'Touches', 'Press', 'Tkl', 'Int', 'Blocks',
    'xG', 'npxG', 'xA', 'SCA', 'GCA',
    'Cmp', 'Att', 'Cmp%', 'Prog', 'Carries', 'Prog.1', 'Succ', 'Att.1',
]
df = df.reindex(col_list, axis='columns')

In [10]:
# Replace the per-file row labels carried over from the merge (they repeat
# across files) with one continuous 0..n-1 index. Rebinding the name instead of
# using inplace=True follows the recommended pandas style and keeps the
# statement chainable.
df = df.reset_index(drop=True)

In [11]:
# Keep only the leading number of the 'Player' column and store it under a
# clearer name. The values are presumably of the form "<count> <text>" (e.g.
# "14 Players") -- confirm against the scraped files.
df = df.rename(columns={'Player': '# Players'})
# Vectorized equivalent of int(value.split(' ')[0]) applied to every row:
# take the text before the first space and convert it to an integer.
df['# Players'] = df['# Players'].str.split(' ').str[0].astype('int64')

In [12]:
# Display the processed dataset (pandas abbreviates large frames when rendering)
df

Unnamed: 0,League,Season,Date,Match,Team,Home or Away,# Players,Possession,Save%,Gls,...,SCA,GCA,Cmp,Att,Cmp%,Prog,Carries,Prog.1,Succ,Att.1
0,Bundesliga,2017-2018,2017-08-18,Bayern_Munich_vs_Bayer_Leverkusen,Bayern Munich,Home,14,51,75.0,3,...,22.0,6.0,429.0,517.0,83.0,38.0,459.0,38.0,10.0,12.0
1,Bundesliga,2017-2018,2017-08-18,Bayern_Munich_vs_Bayer_Leverkusen,Bayer Leverkusen,Away,14,49,71.4,1,...,29.0,2.0,413.0,498.0,82.9,46.0,466.0,55.0,7.0,12.0
2,Bundesliga,2017-2018,2017-08-19,Hertha_BSC_vs_Stuttgart,Hertha BSC,Home,14,53,100.0,2,...,13.0,2.0,442.0,547.0,80.8,36.0,476.0,40.0,4.0,9.0
3,Bundesliga,2017-2018,2017-08-19,Hertha_BSC_vs_Stuttgart,Stuttgart,Away,14,47,50.0,0,...,11.0,0.0,383.0,481.0,79.6,27.0,405.0,36.0,6.0,9.0
4,Bundesliga,2017-2018,2017-08-19,Wolfsburg_vs_Dortmund,Wolfsburg,Home,14,30,40.0,0,...,14.0,0.0,159.0,303.0,52.5,16.0,191.0,7.0,5.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13995,Serie A,2020-2021,2021-05-02,Bologna_vs_Fiorentina,Fiorentina,Away,14,37,50.0,3,...,12.0,4.0,232.0,318.0,73.0,16.0,271.0,37.0,6.0,9.0
13996,Serie A,2020-2021,2021-05-02,Udinese_vs_Juventus,Udinese,Home,14,42,66.7,1,...,12.0,2.0,394.0,480.0,82.1,20.0,367.0,44.0,15.0,24.0
13997,Serie A,2020-2021,2021-05-02,Udinese_vs_Juventus,Juventus,Away,15,58,50.0,2,...,27.0,3.0,558.0,642.0,86.9,60.0,510.0,61.0,11.0,17.0
13998,Serie A,2020-2021,2021-05-02,Sampdoria_vs_Roma,Sampdoria,Home,15,54,100.0,2,...,23.0,4.0,401.0,513.0,78.2,38.0,354.0,35.0,2.0,7.0


## Save to CSV

In [13]:
# Write the processed frame to raw_data.csv in the current working directory.
# NOTE(review): this is the same directory merge_df() scans for '.csv' files,
# and the default index=True adds an unnamed index column to the file.
df.to_csv('raw_data.csv')