## DATA was SCRAPED from FBREF website
* standings_url = "https://fbref.com/en/comps/9/1631/2017-2018-Premier-League-Stats"

In [1]:
import requests
import os
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings 
warnings.filterwarnings('ignore')

In [2]:
import time

In [3]:
def grab_data(pattern,data_main ,match):
    soup = BeautifulSoup(data_main.text)
    links = soup.find_all('a')
    links = [l.get("href") for l in links]
    links = [l for l in links if l and 'all_comps/'+pattern+'/' in l]
    data_in = requests.get(f"https://fbref.com{links[0]}")
    df = pd.read_html(data_in.text, match=match)[0]
    print(f"Got {match} data sucessfully")
    time.sleep(1)
    return df

In [4]:
def clean_data(df,renamed_col,df_name):
    df = df.copy()
    df.columns = df.columns.droplevel()
    df.columns = renamed_col
    df.drop(['Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA','Opponent','Match Report'],axis=1,inplace=True)
    print(f"Shape  {df_name} : " ,df.shape)
    return df

In [5]:
    Shooting_common_col = ['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA','Opponent']
    Shooting_Standard_col = ['Standard_'+col for col in   ['Gls', 'Sh', 'SoT', 'SoT%', 'G/Sh', 'G/SoT', 'Dist', 'FK','PK', 'PKatt']] 
    Shooting_Expected_col =  ['Expected_'+col for col in ['xG', 'npxG', 'npxG/Sh', 'G-xG', 'np:G-xG']] 
    Shooting_renamed_col = Shooting_common_col + Shooting_Standard_col + Shooting_Expected_col + ['Match Report']

In [6]:
    GSA_common_col = ['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA','Opponent']
    GSA_SCA_Types_col = ['SCA_Types_'+col for col in   ['SCA', 'PassLive', 'PassDead', 'Drib', 'Sh', 'Fld', 'Def']] 
    GSA_GCA_Types_col =  ['GCA_Types_'+col for col in ['GCA', 'PassLive', 'PassDead', 'Drib', 'Sh', 'Fld', 'Def']] 
    GSA_renamed_col= GSA_common_col + GSA_SCA_Types_col + GSA_GCA_Types_col + ['Match Report']

In [7]:
    Defensive_Actions_common_col = ['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA','Opponent']
    Defensive_Actions_Tackles_col = ['Tackles_'+col for col in   ['Tkl', 'TklW', 'Def_3rd', 'Mid_3rd', 'Att_3rd']] 
    Defensive_Actions_Vs_Dribbles_col =  ['Vs_Dribbles_'+col for col in [ 'Tkl','Att', 'Tkl%', 'Past',]]
    Defensive_Actions_Pressures_col =  ['Pressures_'+col for col in ['Press', 'Succ', '%', 'Def_3rd', 'Mid_3rd','Att_3rd']]
    Defensive_Actions_Blocks_col =  ['Blocks_'+col for col in ['Blocks', 'Sh', 'ShSv', 'Pass']]
    Defensive_Actions_Def_col =  ['Def_'+col for col in ['Int', 'Tkl+Int', 'Clr','Err']]
    Defensive_Actions_renamed_col= Defensive_Actions_common_col + Defensive_Actions_Tackles_col + Defensive_Actions_Vs_Dribbles_col + Defensive_Actions_Pressures_col + Defensive_Actions_Blocks_col + Defensive_Actions_Def_col + ['Match Report']

In [8]:
    Goalkeeping_cols=['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA','Opponent']
    gk_abrevations = ['GK_Perf_','GK_Penalty_','GK_Launch_','GK_Passes_','Gk_Goal_Kk_','GK_Crosses_','Gk_Sweeper_']
    GK_listings = [
        ['SoTA', 'GA', 'Saves', 'Save%', 'CS', 'PSxG', 'PSxG+/-'],#GK_Perf_
        ['PKatt', 'PKA', 'PKsv', 'PKm'],#GK_Penalty_
        ['Cmp', 'Att', 'Cmp%'],#GK_Launch_
        ['Att', 'Thr','Launch%', 'AvgLen'],#GK_Passes_
        ['Att', 'Launch%', 'AvgLen'],#Gk_Goal_Kk_
        ['Opp', 'Stp', 'Stp%'],#GK_Crosses_
        ['#OPA', 'AvgDist'],#Gk_Sweeper_
    ]

    for abv,listings in zip(gk_abrevations,GK_listings):
        new_cols = [abv+col for col in listings]
        Goalkeeping_cols.extend(new_cols)
    Goalkeeping_renamed_col = Goalkeeping_cols + ['Match Report']


In [9]:
    Passing_cols=['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA','Opponent']
    passing_abrevations = ['Passing_Total_','Passing_Short_','Passing_Medium_','Passing_Long_','Passing_']
    passing_listings = [
        ['Cmp', 'Att', 'Cmp%', 'TotDist', 'PrgDist'],#Passing_Total_
        ['Cmp', 'Att','Cmp%'],#Passing_Short_
        ['Cmp', 'Att', 'Cmp%'],#'Passing_Medium_'
        ['Cmp', 'Att', 'Cmp%'],#Passing_Long_
        ['Ast', 'xA', 'KP','1/3', 'PPA', 'CrsPA', 'Prog'],#Passing_
    ]

    for abv,listings in zip(passing_abrevations,passing_listings):
        new_cols = [abv+col for col in listings]
        Passing_cols.extend(new_cols)
    Passing_renamed_col = Passing_cols + ['Match Report']


In [10]:
    PassingType_cols=['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA','Opponent','Pass_Att']
    passingType_abrevations = ['PassType_','Corner_','Pass_Height_','PassWith_','Pass_Outome_']
    passingType_listings = [
        ['Live', 'Dead', 'FK', 'TB', 'Press', 'Sw', 'Crs','CK'],#PassType_
        ['In', 'Out', 'Str'],#Corner_
        [ 'Ground', 'Low', 'High'],#Pass_Height_
        ['Left', 'Right','Head', 'TI','Other'],#PassWith_
        ['Cmp', 'Off', 'Out', 'Int', 'Blocks'],#Pass_Outome_
    ]

    for abv,listings in zip(passingType_abrevations,passingType_listings):
        new_cols = [abv+col for col in listings]
        PassingType_cols.extend(new_cols)
    PassingType_renamed_col = PassingType_cols + ['Match Report']

In [11]:
    Possession_cols=['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA','Opponent', 'Poss']
    Possession_abrevations = ['Touches_','Dribbles_','Carries_','Receiving_',]
    Possession_listings = [
        ['Touches', 'Def_Pen', 'Def_3rd', 'Mid_3rd', 'Att_3rd', 'Att_Pen', 'Live'],#Touches_
        ['Succ', 'Att', 'Succ%', '#Pl', 'Megs'],#Dribbles_
        [ 'Carries', 'TotDist', 'PrgDist', 'Prog', '1/3', 'CPA', 'Mis', 'Dis'],#Carries_
        ['Targ', 'Rec', 'Rec%', 'Prog'],#Receiving_
    ]

    for abv,listings in zip(Possession_abrevations,Possession_listings):
        new_cols = [abv+col for col in listings]
        Possession_cols.extend(new_cols)
    Possession_renamed_col = Possession_cols + ['Match Report']

In [12]:
    Miscellaneous_Stats_cols=['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA','Opponent',]
    Miscellaneous_Stats_abrevations = ['Misc_Stats_','Misc_Stats_Aerial_Duels']
    Miscellaneous_Stats_listings = [
        ['CrdY', 'CrdR', '2CrdY', 'Fls', 'Fld', 'Off', 'Crs', 'Int','TklW', 'PKwon', 'PKcon', 'OG', 'Recov'],#Misc_Stats_
        ['Won', 'Lost', 'Won%'],#Misc_Stats_Aerial_Duels
    ]

    for abv,listings in zip(Miscellaneous_Stats_abrevations,Miscellaneous_Stats_listings):
        new_cols = [abv+col for col in listings]
        Miscellaneous_Stats_cols.extend(new_cols)
    Miscellaneous_Stats_renamed_col = Miscellaneous_Stats_cols + ['Match Report']

In [13]:
standings_url = "https://fbref.com/en/comps/9/1631/2017-2018-Premier-League-Stats"

In [14]:
years = list(range(2018, 2014, -1))
all_matches = []

In [15]:
years

[2018, 2017, 2016, 2015]

In [16]:
year_path_dict = dict()

In [17]:
    for year in years:    
        print('Year === : ',year)
        data = requests.get(standings_url)
        soup = BeautifulSoup(data.text)
        standings_table = soup.select('table.stats_table')[0]

        links = [l.get("href") for l in standings_table.find_all('a')]
        links = [l for l in links if '/squads/' in l]
        team_urls = [f"https://fbref.com{l}" for l in links]

        previous_season = soup.select("a.prev")[0].get("href")
        standings_url = f"https://fbref.com{previous_season}"
        team_paths = []
        for team_url in team_urls:
            team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
            data = requests.get(team_url)
            matches = pd.read_html(data.text, match="Scores & Fixtures")[0]
            soup = BeautifulSoup(data.text)
            print(f"Starting {year} {team_name} Process")
            a = matches.copy()

            shooting = grab_data(pattern='shooting',data_main=data ,match='Shooting')
            b = clean_data(df=shooting,renamed_col=Shooting_renamed_col,df_name='Shooting')

            GSA = grab_data(pattern='gca',data_main=data ,match="Goal and Shot")
            c = clean_data(df=GSA,renamed_col=GSA_renamed_col,df_name='GSA')

            DEF = grab_data(pattern='defense',data_main=data ,match="Defensive Actions")
            d = clean_data(df=DEF,renamed_col=Defensive_Actions_renamed_col,df_name='Defensive Actions')

            goalie = grab_data(pattern='keeper', data_main=data, match="Goalkeeping")
            e = clean_data(df=goalie,renamed_col=Goalkeeping_renamed_col,df_name='Goalkeeping')

            passing = grab_data(pattern='passing', data_main=data, match="Passing")
            f = clean_data(df=passing, renamed_col = Passing_renamed_col ,df_name='Passing')

            pass_types = grab_data(pattern='passing_types', data_main=data, match="Pass Types")
            g = clean_data(df=pass_types,renamed_col=PassingType_renamed_col,df_name='PassingType')

            possession = grab_data(pattern='possession', data_main=data, match="Possession")
            h = clean_data(df=possession,renamed_col=Possession_renamed_col,df_name='Possession')

            Misc_data = grab_data(pattern='misc', data_main=data, match="Miscellaneous Stats")
            i = clean_data(df=Misc_data,renamed_col=Miscellaneous_Stats_renamed_col ,df_name='Miscellaneous_Stats')

            #merge
            #a['Team'] = team_name 
            try:
                #team_data = matches.merge(shooting[shooting.columns.drop(drops)], on="Date")
                a['Team'] = team_name 
                #b['Team'] = team_name 
                #b.drop(['Date'],axis=1,inplace=True)
                ab = a.merge(b, on="Date")
                abc = ab.merge(c, on="Date")
                abcd = abc.merge(d, on="Date")
                abcde = abcd.merge(e, on="Date")
                #f['Team'] = team_name
                #f.drop(['Date'],axis=1,inplace=True)
                abcdef = abcde.merge(f, on="Date")
                abcdefg = abcdef.merge(g, on="Date")
                abcdefgh = abcdef.merge(h, on="Date")
                abcdefghi = abcdef.merge(i, on="Date")
                print(f"Merged df shape of {year} {team_name} : ",abcdefghi.shape)
                team_data = abcdefghi
            except ValueError:
                continue
            #team_data = team_data[team_data["Comp"] == "Premier League"]
            team_data["Season"] = year
            path = str(year)+team_name+".csv" 
            team_paths.append(path)
            team_data.to_csv(path)
            all_matches.append(team_data)
            time.sleep(1)
        year_path_dict[year]=team_paths

Year === :  2018
Starting 2018 Manchester City Process
Got Shooting data sucessfully
Shape  Shooting :  (58, 16)
Got Goal and Shot data sucessfully
Shape  GSA :  (58, 15)
Got Defensive Actions data sucessfully
Shape  Defensive Actions :  (58, 24)
Got Goalkeeping data sucessfully
Shape  Goalkeeping :  (58, 27)
Got Passing data sucessfully
Shape  Passing :  (58, 22)
Got Pass Types data sucessfully
Shape  PassingType :  (58, 26)
Got Possession data sucessfully
Shape  Possession :  (58, 26)
Got Miscellaneous Stats data sucessfully
Shape  Miscellaneous_Stats :  (58, 17)
Merged df shape of 2018 Manchester City :  (57, 135)
Starting 2018 Manchester United Process
Got Shooting data sucessfully
Shape  Shooting :  (57, 16)
Got Goal and Shot data sucessfully
Shape  GSA :  (57, 15)
Got Defensive Actions data sucessfully
Shape  Defensive Actions :  (57, 24)
Got Goalkeeping data sucessfully
Shape  Goalkeeping :  (57, 27)
Got Passing data sucessfully
Shape  Passing :  (57, 22)
Got Pass Types data suc

Shape  PassingType :  (42, 26)
Got Possession data sucessfully
Shape  Possession :  (42, 26)
Got Miscellaneous Stats data sucessfully
Shape  Miscellaneous_Stats :  (42, 17)
Merged df shape of 2018 Watford :  (41, 135)
Starting 2018 Brighton and Hove Albion Process
Got Shooting data sucessfully
Shape  Shooting :  (45, 16)
Got Goal and Shot data sucessfully
Shape  GSA :  (45, 15)
Got Defensive Actions data sucessfully
Shape  Defensive Actions :  (45, 24)
Got Goalkeeping data sucessfully
Shape  Goalkeeping :  (45, 27)
Got Passing data sucessfully
Shape  Passing :  (45, 22)
Got Pass Types data sucessfully
Shape  PassingType :  (45, 26)
Got Possession data sucessfully
Shape  Possession :  (45, 26)
Got Miscellaneous Stats data sucessfully
Shape  Miscellaneous_Stats :  (45, 17)
Merged df shape of 2018 Brighton and Hove Albion :  (44, 135)
Starting 2018 Huddersfield Town Process
Got Shooting data sucessfully
Shape  Shooting :  (45, 16)
Got Goal and Shot data sucessfully
Shape  GSA :  (45, 15)


ValueError: Length mismatch: Expected axis has 20 elements, new values have 26 elements