In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [2]:
def get_column_name(val):
    """Build a column label from a header cell.

    If the cell has a non-empty ``data-over-header`` attribute, its stripped
    value followed by a single space is prepended to the cell's stripped
    text. Otherwise only the stripped text is returned.
    """
    over_header = val.get('data-over-header')
    prefix = over_header.strip() + ' ' if over_header else ''
    return prefix + val.text.strip()

In [3]:
def get_url(html):
    """Return the ``href`` value of the anchor found inside ``html``.

    The anchor is looked up with ``html.find("a", href=True)``; if no such
    element is found the subscript on the ``None`` result raises.
    """
    anchor = html.find("a", href=True)
    return anchor['href']

In [4]:
def get_table(soup, id, ano):
    """Extract the ``<table>`` with the given element id into a DataFrame.

    Column names come from the ``th`` cells of the second ``tr`` inside the
    table's ``thead``, each passed through ``get_column_name``. Every ``tr``
    of the ``tbody`` becomes one record: the link extracted from the row's
    ``th`` cell by ``get_url``, followed by the stripped text of each ``td``.
    A constant ``Ano`` column holding ``ano`` is added at the end.

    ``id`` shadows the builtin of the same name; it is kept because it is
    part of the function's signature.
    """
    table = soup.find("table", id=id)

    head = table.find("thead")
    body = table.find("tbody")
    header_cells = head.find_all('tr')[1].find_all('th')
    columns = [get_column_name(cell) for cell in header_cells]

    records = [
        [get_url(row.find("th"))] + [cell.text.strip() for cell in row.find_all('td')]
        for row in body.find_all('tr')
    ]

    frame = pd.DataFrame(records, columns=columns)
    return frame.assign(**{'Ano': ano})

In [44]:
def get_all_tables(soup, ano):
    """Extract all eleven squad tables from one season page.

    Parameters
    ----------
    soup : parsed page that ``get_table`` can search.
    ano : season value forwarded to ``get_table`` for its ``Ano`` column.

    Returns
    -------
    tuple of 11 DataFrames, in this fixed order: standard, keeper,
    keeper_adv, shooting, passing, passing_types, gca, defense, possession,
    playing_time, misc.

    The first two tables are required: any error while extracting them
    propagates to the caller. The remaining nine are best-effort: if
    extraction fails, an empty DataFrame takes that table's place.
    """
    required_ids = (
        'stats_squads_standard_for',
        'stats_squads_keeper_for',
    )
    optional_ids = (
        'stats_squads_keeper_adv_for',
        'stats_squads_shooting_for',
        'stats_squads_passing_for',
        'stats_squads_passing_types_for',
        'stats_squads_gca_for',
        'stats_squads_defense_for',
        'stats_squads_possession_for',
        'stats_squads_playing_time_for',
        'stats_squads_misc_for',
    )

    frames = [get_table(soup, table_id, ano) for table_id in required_ids]

    for table_id in optional_ids:
        try:
            frames.append(get_table(soup, table_id, ano))
        except Exception:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt/SystemExit still stop a long scrape.
            frames.append(pd.DataFrame())

    return tuple(frames)

In [45]:
def get_temporadas(url, r, delay=2, timeout=30):
    """Download several season pages and stack each table across seasons.

    Parameters
    ----------
    url : str
        Template containing ``{ano}`` placeholders; formatted once per season.
    r : iterable
        Season values; each one is printed, substituted into ``url`` and
        passed to ``get_all_tables``.
    delay : float, optional
        Seconds to sleep before every request (default 2, as before).
    timeout : float, optional
        Timeout in seconds handed to ``requests.get`` so that a stalled
        connection cannot hang the scrape indefinitely.

    Returns
    -------
    tuple of 11 DataFrames, in the order returned by ``get_all_tables``,
    each the row-wise concatenation of that table over all seasons. When
    ``r`` is empty, eleven empty DataFrames are returned.

    Raises
    ------
    requests.HTTPError
        If a season page answers with an error status. Failing here gives a
        clearer error than the parsing failure that would follow.
    """
    # Number of frames get_all_tables returns; only needed for the empty case.
    n_tables = 11
    per_season = []

    for a in r:
        print(a)
        time.sleep(delay)
        # Use a separate name for the response: the original rebound ``r``
        # while it was still being iterated.
        response = requests.get(url.format(ano=a), timeout=timeout)
        response.raise_for_status()
        per_season.append(get_all_tables(BeautifulSoup(response.text, 'lxml'), a))

    if not per_season:
        return tuple(pd.DataFrame() for _ in range(n_tables))

    # Transpose season-major results into table-major groups and concatenate
    # each group once, instead of growing eleven frames inside the loop.
    return tuple(
        pd.concat(frames, ignore_index=True)
        for frames in zip(*per_season)
    )

In [46]:
# URL template for a season's stats page; the `{ano}` placeholders are filled in with str.format.
url = 'https://fbref.com/pt/comps/24/{ano}/{ano}-Serie-A-estatisticas'

In [47]:
# Scrape every season in range(2014, 2023) and unpack the stacked frames, one per table.
(
    df_standard_f,
    df_keeper_f,
    df_keeper_adv_f,
    df_shooting_f,
    df_passing_f,
    df_passing_types_f,
    df_gca_f,
    df_defense_f,
    df_possession_f,
    df_playing_time_f,
    df_misc_f,
) = get_temporadas(url, range(2014, 2023))

2014
2015
2016
2017
2018
2019
2020
2021
2022


In [53]:
# File name pattern shared by every export.
parquet_name = '{name}_{camp}.parquet'
# Presumably the competition id, matching `comps/24` in the URL template -- TODO confirm.
camp = 24

# These two are written before the keeper_adv labels are replaced, as in the original order.
for name, frame in (
    ('standard', df_standard_f),
    ('keeper', df_keeper_f),
):
    frame.to_parquet(parquet_name.format(name=name, camp=camp))

# Overwrite the keeper_adv column labels with this fixed list. The assignment
# requires the frame to have exactly as many columns as there are names here.
df_keeper_adv_f.columns = ['Equipe', '# J', 'Tempo de jogo 90s', 'Gols GC', 'Gols GPC', 'Gols TD',
       'Gols GCC', 'Gols OG', 'Esperado PSxG', 'Esperado PSxG/SoT',
       'Esperado PSxG+/-', 'Esperado /90', 'Lançados Cmp', 'Lançados Att',
       'Lançados Cmp%', 'Passes Att', 'Passes Pas', 'Passes Lançamentos%',
       'Passes CompMéd', 'Tiro de meta Att', 'Tiro de meta Lançamentos%',
       'Tiro de meta CompMéd', 'Cruzamentos Oponente', 'Cruzamentos Stp',
       'Cruzamentos Stp%', 'Sweeper #OPA', 'Sweeper #OPA/90',
       'Sweeper DistMéd', 'Ano']

for name, frame in (
    ('keeper_adv', df_keeper_adv_f),
    ('shooting', df_shooting_f),
    ('passing', df_passing_f),
    ('passing_types', df_passing_types_f),
    ('gca', df_gca_f),
    ('defense', df_defense_f),
    ('possession', df_possession_f),
    ('playing_time', df_playing_time_f),
    ('misc', df_misc_f),
):
    frame.to_parquet(parquet_name.format(name=name, camp=camp))

In [None]:
# `url` is a template with `{ano}` placeholders, so it has to be formatted
# before being requested; otherwise the literal braces are sent to the server.
# 2022 (the last season scraped above) is used for this scratch fetch.
r = requests.get(
    url.format(ano=2022),
    timeout=30,
)

In [None]:
# Parse the body of the scratch response `r` with the lxml parser.
soup = BeautifulSoup(r.text, 'lxml')

In [None]:
# Scratch: one call per squad table id, kept for reference.
# NOTE: these calls are stale. get_table takes a third `ano` argument,
# so each line needs it added before being uncommented.
# get_table(soup, "stats_squads_standard_for")
# get_table(soup, 'stats_squads_keeper_for')
# get_table(soup, 'stats_squads_keeper_adv_for')
# get_table(soup, 'stats_squads_shooting_for')
# get_table(soup, 'stats_squads_passing_for')
# get_table(soup, 'stats_squads_passing_types_for')
# get_table(soup, 'stats_squads_gca_for')
# get_table(soup, 'stats_squads_defense_for')
# get_table(soup, 'stats_squads_possession_for')
# get_table(soup, 'stats_squads_playing_time_for')
# get_table(soup, 'stats_squads_misc_for')

In [None]:
# Scratch: look up the standard squad table by its element id (None if the page has no such table).
tbl = soup.find("table", id="stats_squads_standard_for")

In [None]:
# Scratch extraction of `tbl` into a DataFrame: column names come from the
# second header row, and each body row becomes one record.
tbl_h, tbl_b = tbl.find("thead"), tbl.find("tbody")
fields = [get_column_name(th) for th in tbl_h.find_all('tr')[1].find_all('th')]
table_rows = tbl_b.find_all('tr')

# Earlier variants of the per-row expression, kept for reference:
#   [get_url(tr.find("th"))] + [get_url(a) if i == 1 else a.text.strip() for i, a in enumerate(tr.find_all('td'))]
#   [a.text.strip() for i, a in enumerate(tr.find_all('th'))]
res = [
    [get_url(tr.find("th"))] + [td.text.strip() for td in tr.find_all('td')]
    for tr in table_rows
]

df = pd.DataFrame(res, columns=fields)

In [None]:
# Display the scratch DataFrame built in the previous cell.
df

In [None]:
# Scratch: bare string expression (the table id get_all_tables passes for the keeper table).
# It has no effect other than being echoed as the cell's output.
'stats_squads_keeper_for'