In [2]:
import json
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
pd.set_option('display.max_columns', 500)

## Scraping function

In [3]:
def _merge_player_tables(tables):
    """Inner-join a non-empty list of per-player tables on the columns they share.

    Each table is merged into the running result using every column name that
    both frames have in common as the join key. A copy of the final frame is
    returned.
    """
    merged = tables[0]
    for table in tables[1:]:
        # De-duplicate while keeping the left-hand column order so the join
        # key list is deterministic.
        shared = [col for col in dict.fromkeys(merged.columns) if col in table.columns]
        merged = merged.merge(table, how='inner', on=shared)
    return merged.copy()


def ScrapMatch(url):
    """Download a match report page and return its raw content as a dict.

    The notebook calls this with fbref.com (French locale) match URLs.

    Parameters
    ----------
    url : str
        Address of the match report page.

    Returns
    -------
    dict
        One entry per CSS selector that matched something (a string when a
        single element matched, otherwise a list of strings), plus up to five
        nested dicts built from the statistics tables: ``Home_Players_Stats``,
        ``Away_Players_Stats``, ``Home_Goalkeeper_Stats``,
        ``Away_Goalkeeper_Stats`` and ``Shots_Stats``. Each of those maps the
        value of the ``Joueur`` column to the remaining columns of the row.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within 30 seconds.
    KeyError
        If a statistics table has no ``Joueur`` column.
    """
    # Fetch the page. A timeout avoids hanging forever, and raise_for_status
    # prevents silently parsing an error page into an almost-empty dict.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Parse the HTML content with Beautiful Soup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Selectors for the general match information.
    # NOTE(review): several selectors are positional (nth-of-type). The sample
    # output in this notebook shows 'Capitaine: ...' captured under the two
    # manager keys, so the positions do not hold for every page — verify.
    selectors = [
        {"id": "Home_Team", "selector": ".scorebox div:nth-of-type(1) div strong a"}, # Home team
        {"id": "Away_Team", "selector": ".scorebox div:nth-of-type(2) strong a"}, # Away team
        {"id": "Competition", "selector": ".box > div:nth-of-type(1)"}, # Competition
        {"id": "Home_Season_History", "selector": ".scorebox div:nth-of-type(1) div:nth-of-type(3)"}, # Home season history
        {"id": "Away_Season_History", "selector": ".scorebox div:nth-of-type(2) div:nth-of-type(3)"}, # Away season history
        {"id": "Home_Goals", "selector": "div:nth-of-type(1) div.score"}, # Home goals
        {"id": "Away_Goals", "selector": ".scorebox div:nth-of-type(2) .scores div.score"}, # Away goals
        {"id": "Home_xG", "selector": "div:nth-of-type(1) div.score_xg"}, # Home xG
        {"id": "Away_xG", "selector": ".scorebox div:nth-of-type(2) .scores div.score_xg"}, # Away xG
        {"id": "Home_Penalties", "selector": "div:nth-of-type(1) div.score_pen"}, # Home penalties
        {"id": "Away_Penalties", "selector": ".scorebox div:nth-of-type(2) .scores div.score_pen"}, # Away penalties
        {"id": "Home_Manager", "selector": "div:nth-of-type(1) div.datapoint:nth-of-type(5)"}, # Home manager
        {"id": "Away_Manager", "selector": ".scorebox div:nth-of-type(2) div:nth-of-type(5)"}, # Away manager
        {"id": "Captain_Home", "selector": "div:nth-of-type(1) .datapoint a"}, # Home captain
        {"id": "Captain_Away", "selector": ".scorebox div:nth-of-type(2) .datapoint a"}, # Away captain
        {"id": "Date", "selector": ".scorebox_meta strong a"}, # Date
        {"id": "Time", "selector": "span.venuetime"}, # Time
        {"id": "Affluence-Stade-Arbitres", "selector": ".scorebox_meta div:nth-of-type(n+5)"}, # Attendance / stadium / officials
        {"id" : "Home_Lineup", "selector" : "div#a.lineup"}, # Home lineup
        {"id" : "Away_Lineup", "selector" : "div#b.lineup"}, # Away lineup
        {"id" : "General_Statistics", "selector" : "#team_stats_extra div:nth-of-type(n+4)"}, # General stats
        {"id" : "General_Statistics_2", "selector" : "#team_stats tr:nth-of-type(n+2) th, td > div > div:nth-of-type(1)"}, # General stats 2
        {"id" : "Home_Events", "selector" : "div.a:nth-of-type(n+3) > div"}, # Home match events
        {"id" : "Away_Events", "selector" : "div.b:nth-of-type(n+3) > div"}, # Away match events
        {"id" : "Home_Events_Penalties", "selector" : "div.a:nth-of-type(n+34)"}, # Home shoot-out kicks
        {"id" : "Away_Events_Penalties", "selector" : "div.b:nth-of-type(n+33)"} # Away shoot-out kicks
    ]

    # Selectors for the per-team statistics tables.
    # "passing" is anchored to the END of the id: a substring match would also
    # select the "passing_types" tables and the away passing table would be
    # overwritten by the away passing-types one.
    # NOTE(review): assumes table ids end with the category name
    # (e.g. "..._passing", "..._passing_types") — confirm against a live page.
    selectors_table =[
        {"id": "Tables_Stats_Summary", "selector": "table[id*=summary]"}, # Summary table
        {"id": "Tables_Stats_Passing", "selector": "table[id$=passing]"}, # Passing table
        {"id": "Tables_Stats_Passing_Types", "selector": "table[id*=passing_types]"}, # Pass types table
        {"id": "Tables_Stats_Defense", "selector": "table[id*=defense]"}, # Defensive actions table
        {"id": "Tables_Stats_Possession", "selector": "table[id*=possession]"}, # Possession table
        {"id": "Tables_Stats_Misc", "selector": "table[id*=misc]"}, # Miscellaneous stats table
        {"id": "Tables_Stats_Keeper", "selector": "table[id*=keeper]"}, # Goalkeeper table
        {"id": "Tables_Stats_Shot", "selector": "table[id*=shots_all]"} # Shots table
    ]

    # Collect the general information: a single match is stored as text,
    # several matches as a list of texts, no match is simply skipped.
    data_dict = {}
    for selector_info in selectors:
        found = soup.select(selector_info["selector"])
        if not found:
            continue
        if len(found) == 1:
            data_dict[selector_info["id"]] = found[0].get_text()
        else:
            data_dict[selector_info["id"]] = [item.get_text() for item in found]

    # Read the statistics tables. For every category except shots, the first
    # matching table is treated as the home one and any later match as away.
    data_table = {}
    for selector_info in selectors_table:
        table_id = selector_info["id"]
        for position, element in enumerate(soup.select(selector_info["selector"])):
            # StringIO: passing literal HTML to read_html is deprecated.
            table_df = pd.read_html(StringIO(str(element)))[0]
            if table_id == "Tables_Stats_Shot":
                key = table_id
            elif position == 0:
                key = "Home_" + table_id
            else:
                key = "Away_" + table_id
            data_table[key] = table_df

    # Flatten the two-level headers to "Group_Column" and strip the
    # "Unnamed: N_level_0_" prefix pandas gives to columns without a group.
    for key in data_table:
        table = data_table[key]
        if isinstance(table.columns, pd.MultiIndex):
            table.columns = table.columns.map('_'.join)
        table = table.reset_index(drop=True)
        table.columns = [
            header.split("Unnamed")[1].split("_", 3)[-1]
            if isinstance(header, str) and "Unnamed" in header else header
            for header in table.columns.tolist()
        ]
        data_table[key] = table

    # Split the tables by side; goalkeeper and shots tables are kept apart
    # because they are not merged with the outfield player tables.
    home_tables = [tbl for key, tbl in data_table.items() if "Home" in key and "Keeper" not in key]
    away_tables = [tbl for key, tbl in data_table.items() if "Away" in key and "Keeper" not in key]

    # Explicit name -> frame mapping (None when the page had no such table).
    stat_frames = {
        'Home_Players_Stats': _merge_player_tables(home_tables) if home_tables else None,
        'Away_Players_Stats': _merge_player_tables(away_tables) if away_tables else None,
        'Home_Goalkeeper_Stats': data_table.get("Home_Tables_Stats_Keeper"),
        'Away_Goalkeeper_Stats': data_table.get("Away_Tables_Stats_Keeper"),
        'Shots_Stats': data_table.get("Tables_Stats_Shot"),
    }

    # Convert each frame to {player: {column: value}} and add it to the result.
    # NOTE(review): rows sharing the same 'Joueur' value overwrite each other;
    # this looks lossy for the shots table (several shots per player) — confirm
    # whether a list per player is wanted.
    for name, frame in stat_frames.items():
        if not isinstance(frame, pd.DataFrame):
            continue
        records = {}
        for _, row in frame.iterrows():
            records[row['Joueur']] = row.drop('Joueur').to_dict()
        data_dict[name] = records

    return data_dict

## Cleaning function

In [4]:
def _first_text(value):
    """Return the first item when `value` is a list, otherwise `value` itself."""
    return value[0] if isinstance(value, list) else value


def _clean_lineup(raw_lineup):
    """Turn the raw text of a lineup block into formation / starters / substitutes."""
    lines = [line for line in raw_lineup.split('\n') if line != '']
    # The first non-empty line ends with the formation in parentheses.
    formation = re.split(r'\s+\(', lines[0])[1].replace(')', '')
    # Players are read as (shirt number, name) pairs from the second and
    # fourth non-empty lines (presumably line three is the bench header).
    return {
        'Formation': formation,
        'Starting': dict(re.findall(r'(\d+)([^\d]+)', lines[1])),
        'Substitute': dict(re.findall(r'(\d+)([^\d]+)', lines[3])),
    }


def _parse_stat_value(value):
    """Parse one team-stats cell into an int or a Success/Failed/Percentage dict."""
    if "\xa0" not in value:
        return int(value.replace("%", ""))
    parsed = {}
    for part in value.replace("\xa0", " ").replace("%", "").split(" — "):
        if "of" in part:
            counts = part.split(" of ")
            parsed['Success'] = int(counts[0])
            # NOTE(review): in "X of Y" the second number looks like the total
            # number of attempts rather than the failures — confirm the label.
            parsed['Failed'] = int(counts[1])
        elif part == '':
            parsed['Percentage'] = 0
        else:
            parsed['Percentage'] = int(part)
    return parsed


def _clean_team_stats(raw_cells):
    """Group cells by three (label, home, away) and parse both values."""
    stats = {}
    for start in range(0, len(raw_cells), 3):
        chunk = raw_cells[start:start + 3]
        if "Cartons" in chunk:
            continue
        stats['Home ' + chunk[0]] = _parse_stat_value(chunk[1])
        stats['Away ' + chunk[0]] = _parse_stat_value(chunk[2])
    return stats


def _clean_events(raw_events):
    """Build {minute: event details} from alternating (time, description) cells."""
    cells = [cell.replace('\n', '').replace('\t', '').replace('\xa0', '') for cell in raw_events]
    events = {}
    for start in range(0, len(cells), 2):
        pair = cells[start:start + 2]
        if "’" not in pair[0]:
            continue
        minute_text = pair[0].split('’')[0]
        score = pair[0].split('’')[1]
        if "+" in minute_text:
            # Stoppage time such as "45+2" is stored as the sum (47).
            added = minute_text.split("+")
            minute = int(added[0]) + int(added[1])
        else:
            minute = int(minute_text)
        player = pair[1].split("—")[0]
        type_event = pair[1].split("—")[1]

        # The player is recorded for every event, then replaced by more
        # specific keys for goals and substitutions.
        event = {'Score': score, 'Event': type_event, 'Player': player}
        if "Goal" in type_event:
            # Drop the trailing score from the label but keep "Own Goal" whole.
            event['Event'] = "Own Goal" if "Own Goal" in type_event else type_event.split(" ")[0]
            del event['Player']
            if "Passe décisive" not in player:
                # No assist: keep the scorer instead of losing the name.
                event['Buteur'] = player
        if "Passe décisive" in player:
            names = player.split('Passe décisive:')
            event['Buteur'] = names[0]
            event['Passeur'] = names[1]
        if "pour" in player and 'Substitute' in type_event:
            event['Sortant'] = player.split('pour ')[1]
            event['Remplacant'] = player.split('pour ')[0]
            event.pop('Player', None)
        # NOTE(review): two events in the same minute overwrite each other
        # because the minute is the dict key; changing that would alter the
        # output schema, so it is only flagged here.
        events[minute] = event
    return events


def _clean_shootout(raw_kicks, side):
    """Build {kick number: result} for one team; `side` is 0 (home) or 1 (away)."""
    kicks = {}
    previous_score = None
    for raw_kick in raw_kicks:
        parts = raw_kick.replace('\t', '').split('\n\n\n\n\n\n')
        header = parts[0].replace('\n\n\n\xa0', '').split('\n')
        number = int(header[0])
        score = header[1]
        shooter = parts[1].split('\n\n\n\xa0')[0]
        own_tally = score.split(':')[side]
        if previous_score is None:
            # First kick of this team: missed when its tally is still 0.
            missed = own_tally == '0'
        else:
            # Later kicks: missed when the team's tally did not move.
            missed = previous_score.split(':')[side] == own_tally
        previous_score = score
        kicks[number] = {
            'Résultat': 'Manqué' if missed else 'Réussi',
            'Score_Penaltie': score,
            'Tireur_Penaltie': shooter,
        }
    return kicks


def CleanMatch(match):
    """Return a cleaned copy of the raw dict produced by ``ScrapMatch``.

    The input dict is not modified, so the function can be called again on the
    same raw data. Keys that are absent from `match` are skipped.

    Parameters
    ----------
    match : dict
        Raw match data: text or list of texts per key.

    Returns
    -------
    dict
        Same keys with typed values: ``Competition`` split into
        ``Competition`` and ``Matchweek`` (inserted right after it), goals and
        penalties as ``int``, xG as ``float``, manager / captain / time as
        trimmed text, ``Affluence-Stade-Arbitres`` replaced by ``Affluence``,
        ``Stade`` and ``Arbitres``, lineups / events / shoot-out kicks as
        nested dicts, and ``General_Statistics_2`` merged into
        ``General_Statistics``.

    Raises
    ------
    ValueError, IndexError
        If a present value does not have the expected text layout.
    """
    # Work on a shallow copy; every cleaned value below is a new object.
    match = dict(match)

    # ------- Competition ------- #
    if 'Competition' in match:
        parts = [part.strip('()') for part in re.split(r'\s+\(', match['Competition'])]
        round_label = parts[1] if len(parts) > 1 else None
        reordered = {}
        for key, value in match.items():
            if key == 'Matchweek' and round_label is not None:
                continue  # re-inserted right after 'Competition'
            reordered[key] = value
            if key == 'Competition':
                reordered[key] = parts[0]
                if round_label is not None:
                    if 'Journée' in round_label:
                        reordered['Matchweek'] = int(round_label.replace('Journée ', ''))
                    else:
                        reordered['Matchweek'] = round_label.split(")")[0]
        match = reordered

    # ------- Season History ------- #
    # Dropped when several elements were captured or the text contains
    # 'Match' (presumably the selector hit a different block — verify).
    for team_history in ['Home_Season_History', 'Away_Season_History']:
        history = match.get(team_history)
        if isinstance(history, list) or (history is not None and 'Match' in history):
            match.pop(team_history)

    # ------- Goals ------- #
    # The home selector can return a list; its first item is used.
    if 'Home_Goals' in match:
        match['Home_Goals'] = int(_first_text(match['Home_Goals']))
    if 'Away_Goals' in match:
        match['Away_Goals'] = int(match['Away_Goals'])

    # -------- Penalties -------- #
    if 'Home_Penalties' in match:
        match['Home_Penalties'] = int(_first_text(match['Home_Penalties']))
    if 'Away_Penalties' in match:
        match['Away_Penalties'] = int(match['Away_Penalties'])

    # -------- xG (decimal comma -> float) -------- #
    if 'Home_xG' in match:
        match['Home_xG'] = float(_first_text(match['Home_xG']).replace(",", "."))
    if 'Away_xG' in match:
        match['Away_xG'] = float((match['Away_xG']).replace(',', '.'))

    # -------- Managers -------- #
    if 'Home_Manager' in match:
        match['Home_Manager'] = _first_text(match['Home_Manager']).replace('Entraineur: ', '').replace('\xa0', ' ')
    if 'Away_Manager' in match:
        match['Away_Manager'] = match['Away_Manager'].replace('Entraineur: ', '').replace('\xa0', ' ')

    # -------- Captains -------- #
    if 'Captain_Home' in match:
        match['Captain_Home'] = _first_text(match['Captain_Home']).replace('\xa0', ' ')
    if 'Captain_Away' in match:
        match['Captain_Away'] = match['Captain_Away'].replace('\xa0', ' ')

    # -------- Time -------- #
    if 'Time' in match:
        match['Time'] = match['Time'].replace(' (heure sur place)', '')

    # -------- Attendance / Stadium / Officials -------- #
    details = match.get('Affluence-Stade-Arbitres')
    if details:
        if isinstance(details, str):
            details = [details]
        for el in details:
            if 'Affluence' in el:
                match['Affluence'] = int(el.replace('Affluence: ', '').replace(',',''))
            if 'Tribune' in el:
                match['Stade'] = el.replace('Tribune: ', '')
            if 'Officiels' in el:
                officials = {}
                for official in el.replace('Officiels: ', '').split('\xa0· '):
                    # "Name (Role)" -> {Role: Name}; entries without a role are skipped.
                    name_role = official.replace('\xa0', ' ').split(" (")
                    if len(name_role) < 2:
                        continue
                    officials[name_role[1].replace(")", "")] = name_role[0]
                match['Arbitres'] = officials
        match.pop('Affluence-Stade-Arbitres')

    # -------- Lineup & Formation -------- #
    for lineup_key in ['Home_Lineup', 'Away_Lineup']:
        if lineup_key in match:
            match[lineup_key] = _clean_lineup(match[lineup_key])

    # -------- General Statistics -------- #
    if 'General_Statistics' in match:
        cells = match['General_Statistics']
        general_stats = {}
        # Cells come in threes: home value, label, away value.
        for start in range(0, len(cells), 3):
            chunk = cells[start:start + 3]
            general_stats['Home ' + chunk[1]] = int(chunk[0])
            general_stats['Away ' + chunk[1]] = int(chunk[2])
        match['General_Statistics'] = general_stats

    # -------- General Statistics 2 -------- #
    if 'General_Statistics_2' in match:
        team_stats = _clean_team_stats(match.pop('General_Statistics_2'))
        match['General_Statistics'] = {**match.get('General_Statistics', {}), **team_stats}
    if match.get('General_Statistics') == {}:
        del match['General_Statistics']

    # -------- Events -------- #
    for events_key in ['Home_Events', 'Away_Events']:
        if events_key in match:
            match[events_key] = _clean_events(match[events_key])

    # -------- Penalty shoot-out -------- #
    for side, team in enumerate(['Home_Events_Penalties', 'Away_Events_Penalties']):
        if team in match:
            raw_kicks = match[team]
            if isinstance(raw_kicks, str):
                raw_kicks = [raw_kicks]
            match[team] = _clean_shootout(raw_kicks, side)

    return match

## Test

In [6]:
# Sample match reports used to exercise the pipeline; uncomment exactly one.
#url = "https://fbref.com/fr/matchs/8689fb39/Celta-Vigo-Osasuna-13-Aout-2023-La-Liga"
#url = "https://fbref.com/fr/matchs/d63657b1/Fulham-Norwich-City-18-Aout-2012-Premier-League"
#url = "https://fbref.com/fr/matchs/0192aa5d/Al-Hazem-Al-Ahli-19-Aout-2021-Saudi-Professional-League"
#url = "https://fbref.com/fr/matchs/7140acae/Argentina-France-18-Decembre-2022-FIFA-World-Cup"
#url = "https://fbref.com/fr/matchs/85c72a0a/France-Morocco-14-Decembre-2022-FIFA-World-Cup"
#url = "https://fbref.com/fr/matchs/f4fb0301/Liverpool-West-Ham-United-26-Aout-2006-Premier-League"
#url = "https://fbref.com/fr/matchs/c1e42359/Almeria-Real-Madrid-14-Aout-2022-La-Liga"
#url = "https://fbref.com/fr/matchs/6e2ac192/Norwich-City-Queens-Park-Rangers-25-Aout-2012-Premier-League"
#url = "https://fbref.com/fr/matchs/87fcc186/Paris-FC-US-Concarneau-26-Aout-2023-Ligue-2"
url = "https://fbref.com/fr/matchs/9b5a5b22/Manchester-City-Bayern-Munich-11-Avril-2023-Champions-League"
#url = "https://fbref.com/fr/matchs/3b8c18de/Bayern-Munich-Manchester-City-19-Avril-2023-Champions-League"
#url = "https://fbref.com/fr/matchs/1f3e596b/Annecy-Toulouse-6-Avril-2023-Coupe-de-France"
#url = "https://fbref.com/fr/matchs/33cc6a3a/Greuther-Furth-Paderborn-07-30-Juillet-2023-2-Bundesliga"

# Keep the raw scrape in `match` and bind the cleaned record to its own name
# so later cells can reuse it; the last expression displays it.
match = ScrapMatch(url)
match_clean = CleanMatch(match)
match_clean

{'Home_Team': 'Manchester City',
 'Away_Team': 'Bayern München',
 'Competition': 'Ligue des champions UEFA',
 'Matchweek': 'Quarts de finale',
 'Home_Goals': 3,
 'Away_Goals': 0,
 'Home_xG': 1.8,
 'Away_xG': 0.9,
 'Home_Manager': 'Capitaine: İlkay Gündoğan',
 'Away_Manager': 'Capitaine: Joshua Kimmich',
 'Captain_Home': 'İlkay Gündoğan',
 'Captain_Away': 'Joshua Kimmich',
 'Date': 'Mardi 11 Avril 2023',
 'Time': '20:00',
 'Home_Lineup': {'Formation': '3-2-4-1',
  'Starting': {'31': 'Ederson',
   '3': 'Rúben Dias',
   '5': 'John Stones',
   '6': 'Nathan Aké',
   '8': 'İlkay Gündoğan',
   '9': 'Erling Haaland',
   '10': 'Jack Grealish',
   '16': 'Rodri',
   '17': 'Kevin De Bruyne',
   '20': 'Bernardo Silva',
   '25': 'Manuel Akanji'},
  'Substitute': {'18': 'Stefan Ortega',
   '33': 'Scott Carson',
   '2': 'Kyle Walker',
   '4': 'Kalvin Phillips',
   '14': 'Aymeric Laporte',
   '19': 'Julián Álvarez',
   '21': 'Sergio Gómez',
   '26': 'Riyad Mahrez',
   '32': 'Máximo Perrone',
   '80': '