In [3]:
from typing import Dict, List
from bs4 import BeautifulSoup as soup
import requests
import pandas as pd
import time
import re
from functools import reduce
import sys
from urllib.error import HTTPError 
from IPython.display import display

def get_data_info():
    # all possible leagues and seasons
    leagues = ['Premier League', 'La Liga', 'Serie A', 'Ligue 1', 'Bundesliga']
    seasons = ['2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022', '2022-2023', '2023-2024']
    
    # while True:
    #     # select league [Premier League / La Liga / Serie A / Ligue 1 / Bundesliga]
    #     league = input('Select League (Premier League / La Liga / Serie A / Ligue 1 / Bundesliga): ')
        
    #     # check if input valid
    #     if league not in leagues:
    #         print('League not valid, try again')
    #         continue
            
    #     # assign url names and id's
    #     if league == 'Premier League':
    #         league = 'Premier-League'
    #         league_id = '9'

    #     if league == 'La Liga':
    #         league = 'La-Liga'
    #         league_id = '12'

    #     if league == 'Serie A':
    #         league = 'Serie-A'
    #         league_id = '11'

    #     if league == 'Ligue 1':
    #         league = 'Ligue-1'
    #         league_id = '13'

    #     if league == 'Bundesliga':
    #         league = 'Bundesliga'
    #         league_id = '20'
    #     break
            
    # while True: 
    #     # select season after 2017 as XG only available from 2017,
    #     season = input('Select Season (2017-2018 to 2023-2024): ')
        
    #     # check if input valid
    #     if season not in seasons:
    #         print('Season not valid, try again')
    #         continue
    #     break
    league = 'La-Liga'
    league_id = '12'
    season = '2017-2018'


    url = f'https://fbref.com/en/comps/{league_id}/{season}/schedule/{season}-{league}-Scores-and-Fixtures'
    return url, league, season

def get_match_links(url, league):   
    print('Getting player data...')
    # access and download content from url containing all fixture links    
    match_links = []
    html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    links = soup(html.content, "html.parser").find_all('a')
    
    # filter list to return only needed links
    key_words_good = ['/en/matches/', f'{league}']
    for l in links:
        href = l.get('href', '')
        if all(x in href for x in key_words_good):
            if 'https://fbref.com' + href not in match_links:                 
                match_links.append('https://fbref.com' + href)
    return match_links

def player_data(match_links, league, season, num_matches=2):
    # loop through the first few fixtures for testing
    player_data = pd.DataFrame([])
    for count, link in enumerate(match_links[:num_matches]):
        try:
            html = requests.get(link, headers={'User-Agent': 'Mozilla/5.0'})
            tables = pd.read_html(html.content)
            for table in tables:
                try:
                    table.columns = table.columns.droplevel()
                except Exception:
                    continue

            # get player data
            def rename_columns(data_frames):
                prefixes = ['Summary_', 'Passing_', 'Pass_Types_', 'Defensive_Actions_', 'Possession_', 'Miscellaneous_Stats_', 'Goalkeeping_']

                for i, df in enumerate(data_frames):
                    if i == 0:  # table[3]
                        renamed = False
                        new_columns = []

                        for idx, col in enumerate(df.columns):
                            if col == 'Att':
                                if not renamed:
                                    new_columns.append('pass_' + col)
                                    renamed = True
                                else:
                                    new_columns.append('takes_on_' + col)  
                            else:
                                new_columns.append(col)  # Keep the original name for other columns

                        df.columns = new_columns  # Assign the new column names at once
                    elif i == 1:  # table[4]
                        new_columns = []
                        cmp_count = 0
                        att_count = 0
                        cmp_percent_count = 0

                        for col in df.columns:
                            if col == 'Cmp':
                                if cmp_count == 0:
                                    new_columns.append('total_' + col)
                                elif cmp_count == 1:
                                    new_columns.append('short_' + col)
                                elif cmp_count == 2:
                                    new_columns.append('medium_' + col)
                                elif cmp_count == 3:
                                    new_columns.append('long_' + col)
                                cmp_count += 1 

                            elif col == 'Att':
                                if att_count == 0:
                                    new_columns.append('total_' + col)
                                elif att_count == 1:
                                    new_columns.append('short_' + col)
                                elif att_count == 2:
                                    new_columns.append('medium_' + col)
                                elif att_count == 3:
                                    new_columns.append('long_' + col)
                                att_count += 1

                            elif col == 'Cmp%':
                                if cmp_percent_count == 0:
                                    new_columns.append('total_' + col)
                                elif cmp_percent_count == 1:
                                    new_columns.append('short_' + col)
                                elif cmp_percent_count == 2:
                                    new_columns.append('medium_' + col)
                                elif cmp_percent_count == 3:
                                    new_columns.append('long_' + col)
                                cmp_percent_count += 1

                            else:
                                new_columns.append(col)  # Keep original name

                        df.columns = new_columns  # Assign the new names at once            
                                
                    elif i == 3:  # table[6]
                        new_columns = []
                        tkl_count = 0

                        for col in df.columns:
                            if col == 'Tkl':
                                if tkl_count == 0:
                                    new_columns.append('total_' + col)
                                elif tkl_count == 1:
                                    new_columns.append('dribblers_' + col)
                                tkl_count += 1
                            else:
                                new_columns.append(col)

                        df.columns = new_columns  # Assign the new names at once

                    df.columns = [prefixes[i] + col if col not in ['Player', 'Nation', 'Age', 'Min'] else col for col in df.columns]
            def get_team_1_player_data():
                # data stored in separate tables 
                data_frames = [tables[i] for i in range(3, 10)]
                rename_columns(data_frames)
                
                
                # merge data
                df = reduce(lambda left, right: pd.merge(left, right, 
                    on=['Player', 'Nation', 'Age', 'Min'], how='outer'), data_frames).iloc[:-1]
                
                # assign a home or away value
                return df.assign(home=1, game_id=count)

            # get second team's player data        
            def get_team_2_player_data():
                data_frames = [tables[i] for i in range(10, 17)]
                rename_columns(data_frames)

                df = reduce(lambda left, right: pd.merge(left, right,
                    on=['Player', 'Nation', 'Age', 'Min'], how='outer'), data_frames).iloc[:-1]
                return df.assign(home=0, game_id=count)

            # combine both team data and export all match data to csv
            t1 = get_team_1_player_data()
            t2 = get_team_2_player_data()
            player_data = pd.concat([player_data, pd.concat([t1,t2]).reset_index()])
            
            print(f'{count+1}/{len(match_links[:num_matches])} matches collected')
            player_data.to_csv(f'Data/Test/FBREF_Dataset/{season.lower()}/{league.lower()}_{season.lower()}_player_data.csv', 
                header=True, index=False, mode='w')
        except Exception as e:
            print(f'{link}: error - {e}')
        # sleep for 3 seconds after every game to avoid IP being blocked
        time.sleep(3)
    
    # Show the head of the collected data for testing
    pd.set_option('display.max_columns', None)

    display(player_data)


In [4]:
# main function
def main(): 
    url, league, season = get_data_info()
    match_links = get_match_links(url, league)
    player_data(match_links, league, season)

    # checks if user wants to collect more data
    print('Data collected!')
    while True:
        answer = input('Do you want to collect more data? (yes/no): ')
        if answer == 'yes':
            main()
        if answer == 'no':
            sys.exit()
        else:
            print('Answer not valid')
            continue


if __name__ == '__main__':
    try:
        main()
    except HTTPError:
        print('The website refused access, try again later')
        time.sleep(5)

Getting player data...
1/2 matches collected
2/2 matches collected


Unnamed: 0,index,Player,Summary_#,Nation,Summary_Pos,Age,Min,Summary_Gls,Summary_Ast,Summary_PK,Summary_PKatt,Summary_Sh,Summary_SoT,Summary_CrdY,Summary_CrdR,Summary_Touches,Summary_Tkl,Summary_Int,Summary_Blocks,Summary_xG,Summary_npxG,Summary_xAG,Summary_SCA,Summary_GCA,Summary_Cmp,Summary_pass_Att,Summary_Cmp%,Summary_PrgP,Summary_Carries,Summary_PrgC,Summary_takes_on_Att,Summary_Succ,Passing_#,Passing_Pos,Passing_total_Cmp,Passing_total_Att,Passing_total_Cmp%,Passing_TotDist,Passing_PrgDist,Passing_short_Cmp,Passing_short_Att,Passing_short_Cmp%,Passing_medium_Cmp,Passing_medium_Att,Passing_medium_Cmp%,Passing_long_Cmp,Passing_long_Att,Passing_long_Cmp%,Passing_Ast,Passing_xAG,Passing_xA,Passing_KP,Passing_1/3,Passing_PPA,Passing_CrsPA,Passing_PrgP,Pass_Types_#,Pass_Types_Pos,Pass_Types_Att,Pass_Types_Live,Pass_Types_Dead,Pass_Types_FK,Pass_Types_TB,Pass_Types_Sw,Pass_Types_Crs,Pass_Types_TI,Pass_Types_CK,Pass_Types_In,Pass_Types_Out,Pass_Types_Str,Pass_Types_Cmp,Pass_Types_Off,Pass_Types_Blocks,Defensive_Actions_#,Defensive_Actions_Pos,Defensive_Actions_total_Tkl,Defensive_Actions_TklW,Defensive_Actions_Def 3rd,Defensive_Actions_Mid 3rd,Defensive_Actions_Att 3rd,Defensive_Actions_dribblers_Tkl,Defensive_Actions_Att,Defensive_Actions_Tkl%,Defensive_Actions_Lost,Defensive_Actions_Blocks,Defensive_Actions_Sh,Defensive_Actions_Pass,Defensive_Actions_Int,Defensive_Actions_Tkl+Int,Defensive_Actions_Clr,Defensive_Actions_Err,Possession_#,Possession_Pos,Possession_Touches,Possession_Def Pen,Possession_Def 3rd,Possession_Mid 3rd,Possession_Att 3rd,Possession_Att Pen,Possession_Live,Possession_Att,Possession_Succ,Possession_Succ%,Possession_Tkld,Possession_Tkld%,Possession_Carries,Possession_TotDist,Possession_PrgDist,Possession_PrgC,Possession_1/3,Possession_CPA,Possession_Mis,Possession_Dis,Possession_Rec,Possession_PrgR,Miscellaneous_Stats_#,Miscellaneous_Stats_Pos,Miscellaneous_Stats_CrdY,Miscellaneous_Stats_CrdR,Miscellaneous_Stats_2CrdY,Miscellaneous_Stats_Fls,Miscellaneous_Stats_Fld,Miscellaneous_Stats_Off,Miscellaneous_Stats_Crs,Miscellaneous_Stats_Int,Miscellaneous_Stats_TklW,Miscellaneous_Stats_PKwon,Miscellaneous_Stats_PKcon,Miscellaneous_Stats_OG,Miscellaneous_Stats_Recov,Miscellaneous_Stats_Won,Miscellaneous_Stats_Lost,Miscellaneous_Stats_Won%,Goalkeeping_SoTA,Goalkeeping_GA,Goalkeeping_Saves,Goalkeeping_Save%,Goalkeeping_PSxG,Goalkeeping_Cmp,Goalkeeping_Att,Goalkeeping_Cmp%,Goalkeeping_Att (GK),Goalkeeping_Thr,Goalkeeping_Launch%,Goalkeeping_AvgLen,Goalkeeping_Att.1,Goalkeeping_Launch%.1,Goalkeeping_AvgLen.1,Goalkeeping_Opp,Goalkeeping_Stp,Goalkeeping_Stp%,Goalkeeping_#OPA,Goalkeeping_AvgDist,home,game_id
0,0,14 Players,,,,,990,1,0,0,0,14,3,0,0,550,23,14,9,1.3,1.3,0.4,22,1,292,432,67.6,36,231,4,15,7,,,292,432,67.6,5388,2489,142,177,80.2,105,139,75.5,35,86,40.7,0,0.4,0.9,7,39,10,2,36,,,432,374,55,15,0,5,12,30,4,0,3,0,292,3,10,,,23,15,14,7,2,10,18,55.6,8,9,1,8,14,37,20,0,,,550,47,169,261,124,18,550,15,7,46.7,8,53.3,231,1312,484,4,5,0,24,11,290,36,,,0,0,0,17,16,3,12,14,15,0,1,0,55,30,16,65.2,,,,,,,,,,,,,,,,,,,,,1,0
1,1,Alexander,11.0,ar ARG,LW,28-309,90,0,0,0,0,2,0,0,0,36,1,1,2,0.1,0.1,0.2,5,0,19,27,70.4,2,18,0,1,1,11.0,LW,19,27,70.4,280,102,14,16,87.5,3,4,75.0,2,3,66.7,0,0.2,0.5,3,2,2,1,2,11.0,LW,27,23,3,2,0,0,3,0,1,0,1,0,19,1,2,11.0,LW,1,1,0,1,0,0,1,0.0,1,2,0,2,1,2,1,0,11.0,LW,36,1,4,13,20,3,36,1,1,100.0,0,0.0,18,112,32,0,0,0,3,3,25,5,11.0,LW,0,0,0,3,1,0,3,1,1,0,0,0,3,0,0,,,,,,,,,,,,,,,,,,,,,,1,0
2,2,Diego Rico,15.0,es ESP,LB,24-176,90,0,0,0,0,0,0,0,0,71,3,4,0,0.0,0.0,0.0,0,0,40,62,64.5,3,26,1,1,1,15.0,LB,40,62,64.5,703,320,22,28,78.6,15,24,62.5,3,6,50.0,0,0.0,0.0,0,2,1,0,3,15.0,LB,62,44,17,1,0,1,1,16,0,0,0,0,40,1,3,15.0,LB,3,0,3,0,0,1,1,100.0,0,0,0,0,4,7,2,0,15.0,LB,71,1,29,31,11,0,71,1,1,100.0,0,0.0,26,102,35,1,2,0,0,0,28,3,15.0,LB,0,0,0,2,1,0,1,4,0,0,1,0,9,1,1,50.0,,,,,,,,,,,,,,,,,,,,,1,0
3,3,Erik Morán,4.0,es ESP,DM,26-085,26,0,0,0,0,0,0,0,0,15,0,0,0,0.0,0.0,0.0,0,0,10,14,71.4,1,4,0,0,0,4.0,DM,10,14,71.4,148,36,7,8,87.5,3,4,75.0,0,1,0.0,0,0.0,0.0,0,1,0,0,1,4.0,DM,14,14,0,0,0,0,0,0,0,0,0,0,10,0,0,4.0,DM,0,0,0,0,0,0,0,,0,0,0,0,0,0,0,0,4.0,DM,15,0,3,9,3,0,15,0,0,,0,,4,21,1,0,0,0,0,0,10,0,4.0,DM,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,,,,,,,,,,,,,,,,,,,,,,1,0
4,4,Ezequiel Muñoz,19.0,ar ARG,CB,26-314,90,0,0,0,0,0,0,0,0,48,4,1,0,0.0,0.0,0.0,1,0,26,35,74.3,2,21,0,0,0,19.0,CB,26,35,74.3,542,158,4,6,66.7,12,16,75.0,7,10,70.0,0,0.0,0.0,0,0,0,0,2,19.0,CB,35,34,1,1,0,0,0,0,0,0,0,0,26,0,0,19.0,CB,4,3,4,0,0,3,5,60.0,2,0,0,0,1,5,7,0,19.0,CB,48,6,26,21,1,0,48,0,0,,0,,21,135,90,0,0,0,0,0,20,0,19.0,CB,0,0,0,0,2,0,0,1,3,0,0,0,4,7,3,70.0,,,,,,,,,,,,,,,,,,,,,1,0
5,5,Gabriel,8.0,br BRA,"DM,AM",23-334,90,1,0,0,0,5,2,0,0,67,3,0,0,0.9,0.9,0.1,3,0,30,51,58.8,5,33,0,4,3,8.0,"DM,AM",30,51,58.8,441,177,15,21,71.4,10,16,62.5,2,5,40.0,0,0.1,0.1,1,6,2,0,5,8.0,"DM,AM",51,49,2,2,0,1,2,0,0,0,0,0,30,0,2,8.0,"DM,AM",3,2,1,2,0,0,0,,0,0,0,0,0,3,1,0,8.0,"DM,AM",67,1,8,36,24,4,67,4,3,75.0,1,25.0,33,117,39,0,0,0,2,4,53,5,8.0,"DM,AM",0,0,0,2,3,0,2,0,2,0,0,0,5,8,5,61.5,,,,,,,,,,,,,,,,,,,,,1,0
6,6,Iván Cuéllar,1.0,es ESP,GK,33-083,90,0,0,0,0,0,0,0,0,32,0,0,0,0.0,0.0,0.0,0,0,14,31,45.2,0,14,0,0,0,1.0,GK,14,31,45.2,620,562,0,0,,5,5,100.0,9,26,34.6,0,0.0,0.0,0,1,0,0,0,1.0,GK,31,20,11,6,0,0,0,0,0,0,0,0,14,0,0,1.0,GK,0,0,0,0,0,0,0,,0,0,0,0,0,0,0,0,1.0,GK,32,24,31,1,0,0,32,0,0,,0,,14,142,85,0,0,0,0,0,9,0,1.0,GK,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,5.0,0.0,4.0,100.0,1.6,9.0,26.0,34.6,26.0,5.0,80.8,49.7,5.0,100.0,67.8,8.0,0.0,0.0,0.0,,1,0
7,7,Javier Eraso,17.0,es ESP,"AM,RW",27-149,69,0,0,0,0,0,0,0,0,33,2,2,0,0.0,0.0,0.0,1,0,14,23,60.9,2,15,1,0,0,17.0,"AM,RW",14,23,60.9,188,42,10,12,83.3,4,6,66.7,0,3,0.0,0,0.0,0.0,0,2,0,0,2,17.0,"AM,RW",23,20,2,0,0,0,2,0,1,0,1,0,14,1,0,17.0,"AM,RW",2,1,1,1,0,1,1,100.0,0,0,0,0,2,4,0,0,17.0,"AM,RW",33,0,5,18,10,3,33,0,0,,0,,15,65,19,1,1,0,6,1,17,6,17.0,"AM,RW",0,0,0,2,1,0,2,2,1,0,0,0,8,1,0,100.0,,,,,,,,,,,,,,,,,,,,,1,0
8,8,Joseba Zaldúa,20.0,es ESP,RB,25-055,79,0,0,0,0,0,0,0,0,36,0,0,2,0.0,0.0,0.0,1,0,25,33,75.8,2,15,1,1,0,20.0,RB,25,33,75.8,422,210,13,16,81.3,7,8,87.5,3,7,42.9,0,0.0,0.0,0,4,1,1,2,20.0,RB,33,23,10,1,0,0,1,9,0,0,0,0,25,0,0,20.0,RB,0,0,0,0,0,0,1,0.0,1,2,0,2,0,0,0,0,20.0,RB,36,1,7,22,8,1,36,1,0,0.0,1,100.0,15,145,38,1,1,0,2,0,17,1,20.0,RB,0,0,0,0,1,0,1,0,0,0,0,0,4,0,2,0.0,,,,,,,,,,,,,,,,,,,,,1,0
9,9,Martín Mantovani,5.0,ar ARG,CB,33-042,90,0,0,0,0,0,0,0,0,47,1,2,0,0.0,0.0,0.0,0,0,27,35,77.1,4,16,0,0,0,5.0,CB,27,35,77.1,524,222,10,10,100.0,16,18,88.9,1,7,14.3,0,0.0,0.0,0,2,0,0,4,5.0,CB,35,34,1,1,0,0,0,0,0,0,0,0,27,0,0,5.0,CB,1,0,1,0,0,1,1,100.0,0,0,0,0,2,3,9,0,5.0,CB,47,9,28,19,0,0,47,0,0,,0,,16,73,19,0,0,0,1,0,18,0,5.0,CB,0,0,0,1,0,0,0,2,0,0,0,0,3,6,2,75.0,,,,,,,,,,,,,,,,,,,,,1,0


Data collected!


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
