In [57]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import os
from collections import Counter
import urllib.parse


matches = pd.read_csv('players.csv', index_col=False)
base_url = 'https://fbref.com'
data_path = 'match_data.csv'
completed_matches_path = 'completed_matches.txt'
# Empty list to store the scraped data
match_data = []

In [59]:
def fetch_data(url):
    response = requests.get(url)
    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')
    else:
        print(f"Failed to retrieve data from {url}")
        return None
    
def process_headers(headers):
    header_count = Counter(headers)
    new_headers = []
    header_indices = {}

    for header in headers:
        if header_count[header] > 1:
            if header in header_indices:
                header_indices[header] += 1
            else:
                header_indices[header] = 1
            new_header = f"{header}.{header_indices[header]}"
        else:
            new_header = header
        new_headers.append(new_header)
    
    return new_headers

In [62]:
# Load or initialize completed matches set
if os.path.exists(completed_matches_path):
    with open(completed_matches_path, 'r') as file:
        completed_matches = set(file.read().splitlines())
else:
    completed_matches = set()

# Load or initialize DataFrame
if os.path.exists(data_path):
    match_data = pd.read_csv(data_path)
else:
    match_data = pd.DataFrame()


In [65]:
for index, match in enumerate(matches['link']):
    match_url = f"{base_url}{match}"
    
    if match_url in completed_matches:
        print(f"Skipping completed match: {match_url}")
        continue
    
    if index % 20 == 0 and index != 0:
        print(f"Pausing for a minute to respect the rate limit...")
        time.sleep(60)  # Pause the execution for 60 seconds every 20 requests
    
    print(f"Fetching data for match: {match_url}")
    soup = fetch_data(match_url)
    
    if soup:
        table = soup.find('table', {'class': 'stats_table'})  # Example class name, adjust as necessary
        # Parsing table headers
        
        headers = ['Player Link']  
        for th in table.find('thead').find_all('th'):
            if 'over_header' not in th.find_parent('tr').get('class', []):
                headers.append(th.text.strip())
        headers = process_headers(headers)
        
        # Parsing rows
        rows = []
        for tr in table.find('tbody').find_all('tr'):
            # print(tr.find('th').text.strip())
            date = tr.find('th', {'data-stat': 'date'}).text.strip()
            cells = [td.text.strip() for td in tr.find_all('td')]
            cells.insert(0, date)
            cells.insert(0, match_url)
            rows.append(cells)

        # Creating DataFrame
        df = pd.DataFrame(rows, columns=headers)
        # Ensure both DataFrames have the same columns in the correct order
        
        match_data = pd.concat([match_data, df], ignore_index=False)

        # Save DataFrame and completed matches after each successful fetch
        match_data.to_csv(data_path, index=False)
        completed_matches.add(match_url)

        # Save completed matches
        with open(completed_matches_path, 'w') as file:
            file.write('\n'.join(completed_matches))

print("Data scraping completed.")

Skipping completed match: https://fbref.com/en/players/5f09991f/matchlogs/2023-2024/summary/Patrick-van-Aanholt-Match-Logs
Skipping completed match: https://fbref.com/en/players/4cd41883/matchlogs/2023-2024/summary/Paxten-Aaronson-Match-Logs
Skipping completed match: https://fbref.com/en/players/662ffe3d/matchlogs/2023-2024/summary/Jayden-Addai-Match-Logs
Skipping completed match: https://fbref.com/en/players/3ed84b2c/matchlogs/2023-2024/summary/Bobby-Adekanye-Match-Logs
Skipping completed match: https://fbref.com/en/players/be98fc34/matchlogs/2023-2024/summary/Shawn-Adewoye-Match-Logs
Skipping completed match: https://fbref.com/en/players/72a63c0b/matchlogs/2023-2024/summary/Nikolas-Agrafiotis-Match-Logs
Skipping completed match: https://fbref.com/en/players/e2b384d2/matchlogs/2023-2024/summary/Chuba-Akpom-Match-Logs
Skipping completed match: https://fbref.com/en/players/0b524c25/matchlogs/2023-2024/summary/Hamdi-Akujobi-Match-Logs
Skipping completed match: https://fbref.com/en/player

## Extract player name

In [66]:
def extract_player_name(url):
    # Parse the URL to get the path
    parsed_url = urllib.parse.urlparse(url)
    path = parsed_url.path
    
    # Split the path into components
    path_parts = path.split('/')
    
    # Get the last part which contains the name and "Match-Logs"
    name_part = path_parts[-1]
    
    # Replace dashes with spaces and remove "Match-Logs" from the name
    player_name = name_part.replace("-Match-Logs", "").replace("-", " ")
    return player_name

# Extract names from each URL in the list
match_data['Player Name'] = match_data['Player Link'].apply(extract_player_name)
match_data.to_csv('def_match_data.csv', index=False)


Unnamed: 0,Player Link,Date,Day,Comp,Round,Venue,Result,Squad,Opponent,Start,...,Cmp,Att,Cmp%,PrgP,Carries,PrgC,Att.1,Succ,Match Report,Att.2
0,https://fbref.com/en/players/5f09991f/matchlog...,2023-08-08,Tue,Champions Lg,Third qualifying round,Home,W 4–1,nl PSV Eindhoven,at Sturm Graz,Y,...,,,,,,,,,Match Report,
1,https://fbref.com/en/players/5f09991f/matchlog...,2023-08-12,Sat,Eredivisie,Matchweek 1,Home,W 2–0,PSV Eindhoven,Utrecht,Y,...,47.0,57.0,82.5,2.0,24.0,3.0,0.0,0.0,Match Report,
2,https://fbref.com/en/players/5f09991f/matchlog...,2023-08-15,Tue,Champions Lg,Third qualifying round,Away,W 3–1,nl PSV Eindhoven,at Sturm Graz,Y,...,,,,,,,,,Match Report,
3,https://fbref.com/en/players/5f09991f/matchlog...,2023-08-19,Sat,Eredivisie,Matchweek 2,Away,W 3–1,PSV Eindhoven,Vitesse,Y,...,28.0,33.0,84.8,3.0,17.0,2.0,0.0,0.0,Match Report,
4,https://fbref.com/en/players/5f09991f/matchlog...,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32,https://fbref.com/en/players/f3f80000/matchlog...,2024-03-30,Sat,Eredivisie,Matchweek 27,Home,W 3–0,Go Ahead Eag,Excelsior,Y,...,23,,76.7,9,20,2,30,0,Match Report,0
33,https://fbref.com/en/players/f3f80000/matchlog...,2024-04-04,Thu,Eredivisie,Matchweek 28,Away,D 1–1,Go Ahead Eag,Ajax,Y,...,28,,82.4,5,24,0,34,0,Match Report,0
34,https://fbref.com/en/players/f3f80000/matchlog...,2024-04-07,Sun,Eredivisie,Matchweek 29,Home,D 1–1,Go Ahead Eag,Almere City,Y,...,23,,69.7,2,9,1,33,1,Match Report,1
35,https://fbref.com/en/players/f3f80000/matchlog...,2024-04-14,Sun,Eredivisie,Matchweek 30,Away,L 1–2,Go Ahead Eag,Utrecht,Y,...,18,,66.7,4,27,2,27,0,Match Report,2
