### Code used to scrape FBREF

### 1. SCRAPE ALL LEAGUE TEAMS 

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import http.client
import re
import time
import json
import html5lib
import os

In [None]:
# Scrape all league teams
# Maps a short league key to [code, name, number]; the three values are passed
# positionally to scrape_league_teams(), which uses `code` and `name` to build
# the league page URL and `number` to cap how many squad links are kept.
fbref_league_names = {
    "pl":[9, "Premier-League", 20],
    "seriea":[11, "Serie-A", 20],
    "laliga":[12, "La-Liga", 20],
    "bundesliga":[20, "Bundesliga", 18],
    "league1":[13, "Ligue-1", 18],
    # NOTE(review): 19 is the only odd count in this table — confirm it is intended.
    "eredivisie": [23, "Eredivisie", 19]
}

# Raw <a> tags collected per league name; filled in by scrape_league_teams().
fbref_scrapped = {}


def scrape_league_teams(code, name, number):
    """Collect the squad links found on one FBref league page.

    The page at ``/en/comps/{code}/{name}-Stats`` is downloaded, every ``<a>``
    whose href starts with ``/en/squads/`` is located, and the first
    ``number + 1`` of them are stored in ``fbref_scrapped[name]``.

    Args:
        code: Value placed in the ``{code}`` segment of the league URL.
        name: League slug used in the URL and as the key in ``fbref_scrapped``.
        number: Number of teams expected in the league.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    url = f'https://fbref.com/en/comps/{code}/{name}-Stats'

    # NOTE(review): advertising "br" assumes a Brotli decoder is available to
    # requests/urllib3 — confirm, otherwise response.text may be unreadable.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/122.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,"
                "image/avif,image/webp,image/apng,*/*;q=0.8,"
                "application/signed-exchange;v=b3;q=0.7",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    }

    # The context manager releases the pooled connection; the timeout keeps a
    # stalled server from hanging the notebook forever.
    with requests.Session() as session:
        response = session.get(url, headers=headers, timeout=30)

    # Fail loudly on a blocked or rate-limited request instead of silently
    # recording an empty team list for this league.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    anchors = soup.find_all('a', href=re.compile(r'^/en/squads/'))
    # NOTE(review): number + 1 presumably leaves room for one extra non-team
    # link (a later cell skips a bare "/en/squads/" entry) — confirm.
    anchors = anchors[:number+1]
    fbref_scrapped[name] = anchors
    # Pause between requests to stay polite to the server.
    time.sleep(10)

In [None]:
## SCRAPING ALL THE LEAGUE TEAMS
# Every table entry is a [code, name, number] triple; the short league key is
# not needed for the call, so iterate over the values only.
for league_code, league_slug, team_count in fbref_league_names.values():
    scrape_league_teams(league_code, league_slug, team_count)

In [None]:
# SAVE ALL TEAM URLS TO A FILE
def extract_hrefs(data):
    """Reduce a ``{league: [tag, ...]}`` mapping to ``{league: [href, ...]}``.

    Items without a ``get`` method are skipped, as are tags whose ``href`` is
    missing or empty. Leagues with no usable tags map to an empty list.
    """
    return {
        league: [
            link
            for tag in tags
            if hasattr(tag, 'get') and (link := tag.get('href'))
        ]
        for league, tags in data.items()
    }

# Keep only the href strings from the scraped anchor tags before dumping.
cleaned_data = extract_hrefs(fbref_scrapped)

# Write to file
with open('fbref_team_links.json', 'w') as links_file:
    links_file.write(json.dumps(cleaned_data, indent=2))

### 2. SCRAPE ALL PLAYER LINKS FROM EACH TEAM

In [None]:
# Attacker <a> tags collected per league; filled in by scrape_team_attackers().
league_all_attackers = {}


def scrape_team_attackers(league_name,league_urls):
    """Collect profile links of every forward in the given team pages.

    For each team page, the first HTML table is read, rows whose ``Pos``
    column contains ``"FW"`` are kept, and the ``/en/players/`` anchors whose
    text matches one of those players (excluding match-log links) are
    gathered. The combined list is stored in
    ``league_all_attackers[league_name]``.

    Args:
        league_name: Key under which the result is stored.
        league_urls: Iterable of site-relative team URLs.

    Raises:
        requests.HTTPError: If a team page answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    # Local import keeps this function self-contained.
    from io import StringIO

    # NOTE(review): advertising "br" assumes a Brotli decoder is available to
    # requests/urllib3 — confirm, otherwise response.text may be unreadable.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/122.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,"
                "image/avif,image/webp,image/apng,*/*;q=0.8,"
                "application/signed-exchange;v=b3;q=0.7",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "DNT": "1",  # Do Not Track
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    }
    player_link_pattern = re.compile(r'^/en/players/')

    all_attackers = []
    # One session for the whole league: connections are reused and released
    # when the block exits.
    with requests.Session() as session:
        for team_url in league_urls:
            if team_url == "/en/squads/":
                continue

            # The stored hrefs are site-relative; strip the leading slash so
            # the URL does not end up with "//" after the host.
            url = f"https://fbref.com/{team_url.lstrip('/')}"

            response = session.get(url, headers=headers, timeout=30)
            # Stop on a blocked/rate-limited page rather than parsing an
            # error document.
            response.raise_for_status()
            page_html = response.text

            soup = BeautifulSoup(page_html, 'html.parser')
            anchors = soup.find_all('a', href=player_link_pattern)

            # Parse the tables from the HTML already downloaded instead of
            # letting pandas fetch the same page again without the headers.
            tables = pd.read_html(StringIO(page_html))
            df = tables[0]
            # Flatten a multi-level header down to its innermost labels.
            df.columns = df.columns.get_level_values(-1)
            df = df[df["Pos"].str.contains("FW", na=False)]
            attacker_names = set(df["Player"])

            all_attackers += [
                tag for tag in anchors
                if tag.text in attacker_names and "matchlogs" not in tag['href']
            ]

            # Pause between requests to stay polite to the server.
            time.sleep(10)

    league_all_attackers[league_name] = all_attackers

In [None]:
# Load the team links from fbref_team_links.json and scrape each league.
with open('fbref_team_links.json', 'r') as f:
    clubs_link_data = json.load(f)

# .items() is required here: iterating the dict itself yields only the league
# names, and unpacking a name string into (key, val) raises ValueError.
for key, val in clubs_link_data.items():
    scrape_team_attackers(key, val)

In [None]:
# Keep only the href strings from the collected attacker anchor tags.
cleaned_data = extract_hrefs(league_all_attackers)

# open(..., 'w') does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs('attackers', exist_ok=True)
with open('attackers/all_attackers.json', 'w') as f:
    json.dump(cleaned_data, f, indent=2)

### 3. SCRAPE PLAYERS STATS

In [None]:
def convert_urls_to_scouting(urls):
    """Turn site-relative player URLs into absolute scouting-report URLs.

    A URL is converted only when it starts with ``/en/players/<id>/<slug>``;
    anything else is silently dropped. The result keeps the input order.
    """
    base = "https://fbref.com"
    player_pattern = re.compile(r"(/en/players/[\w\d]+)/([\w\-]+)")

    scouting_urls = []
    for url in urls:
        match = player_pattern.match(url)
        if match is None:
            continue
        scouting_urls.append(
            f"{base}{match.group(1)}/scout/365_m1/{match.group(2)}-Scouting-Report"
        )
    return scouting_urls

# Names of players whose CSV already exists under players/<league>/.
# A missing "players" directory (nothing scraped yet) is treated as empty
# instead of raising FileNotFoundError.
folders = os.listdir("players") if os.path.isdir("players") else []
players_name_list = []

for f in folders:
    # Skip stray non-directory entries sitting next to the league folders;
    # os.listdir() would fail on them.
    if not os.path.isdir(f"players/{f}"):
        continue
    players_name_list += [
            filename.replace(".csv", "")
            for filename in os.listdir(f"players/{f}")
            if filename.endswith(".csv")
        ]


In [None]:
def scrape_players_data(league,urls):
    """Download one scouting-report table per player and save it as CSV.

    Each URL is converted to its scouting-report URL, the page's tables are
    read with pandas, one of them is selected, and it is written to
    ``players/{league}/{player name}.csv``. Failures for a single player are
    printed and do not stop the loop.

    Args:
        league: Sub-folder name under ``players/`` for the output files.
        urls: Iterable of site-relative player URLs; duplicates are ignored.
    """
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # scrape order is reproducible (a set would make it arbitrary).
    urls = list(dict.fromkeys(urls))
    urls_formatted = convert_urls_to_scouting(urls)

    # to_csv does not create missing directories; without this every player
    # would be reported as a fetch error when the folder does not exist yet.
    os.makedirs(f"players/{league}", exist_ok=True)

    for url in urls_formatted:
        last_part = url.rstrip('/').split("/")[-1]
        player_name = last_part.replace("-Scouting-Report", "").replace("-", " ")

        # Optional resume filters, currently disabled:
        # if player_name not in rescrape:
        #     continue
        # if player_name in players_name_list:
        #     continue

        try:
            tables = pd.read_html(url)
            # NOTE(review): which table holds the report appears to depend on
            # how many tables the page exposes — confirm against a live page.
            if len(tables) > 3:
                table = tables[-2]
            else:
                table = tables[2]
            table.to_csv(f"players/{league}/{player_name}.csv")
        except Exception as e:
            # Best effort: report the player and carry on with the next one.
            print(F"ERROR FETCHING {player_name} data", e)

        # Pause between requests to stay polite to the server.
        time.sleep(10)

In [None]:
# Reload the attacker profile links and scrape each league's players in turn.
with open('attackers/all_attackers.json', 'r') as links_file:
    players_link_data = json.loads(links_file.read())

for league, player_urls in players_link_data.items():
    scrape_players_data(league, player_urls)

### 4. CLEAN AND MERGE ALL PLAYERS DATA

In [None]:
## THESE ARE THE METRICS I AM MOST INTERESTED IN
# Each term becomes one column of whole_df, so every entry must be unique
# (the list previously contained "Shots on Target" twice, which produced a
# duplicate column).
search_terms = [
    "Goals",
    "Assists",
    "Non-Penalty Goals",
    "xG: Expected Goals",
    "npxG: Non-Penalty xG",
    "xAG: Exp. Assisted Goals",
    "Progressive Carries",
    "Progressive Passes",
    "Progressive Passes Rec",
    "Shots Total",
    "Shots on Target",
    "Goals/Shot",
    "npxG/Shot",
    "xA: Expected Assists",
    "Key Passes",
    "Through Balls",
    "Crosses",
    "Shot-Creating Actions",
    "Goal-Creating Actions",
    "Touches (Att 3rd)",
    "Touches (Att Pen)",
]

# Empty frame with the final column layout; rows are appended per player later.
whole_df = pd.DataFrame(columns=["Name", "League"] + search_terms)
whole_df.head()

In [None]:
# Build one summary row per player CSV found under players/<league>/ and
# append all of them to whole_df.
folders = os.listdir("players")
player_rows = []
for folder in folders:
    # Skip stray non-directory entries sitting next to the league folders;
    # os.listdir() would fail on them.
    if not os.path.isdir(f"players/{folder}"):
        continue

    files = os.listdir(f"players/{folder}")
    for file in files:
        # Only the scraped CSV files are valid input for read_csv.
        if not file.endswith(".csv"):
            continue

        df = pd.read_csv(f"players/{folder}/{file}")
        player_name = file.replace(".csv", "")

        player_data = {key: None for key in ["Name", "League"] + search_terms}
        player_data["Name"] = player_name
        player_data["League"] = folder

        # Stringify and lower-case every cell once per file rather than once
        # per search term.
        cells = df.astype(str).apply(lambda col: col.str.strip().str.lower())

        for term in search_terms:
            needle = term.lower()
            # Prefer a row with a cell that IS the term; a plain substring
            # search can hit a longer label that merely contains it and
            # happens to appear earlier in the table.
            mask = (cells == needle).any(axis=1)
            if not mask.any():
                # Fall back to case-insensitive substring matching.
                mask = cells.apply(
                    lambda col: col.str.contains(needle, regex=False)
                ).any(axis=1)

            if mask.any():
                try:
                    first_row = df[mask].iloc[0]
                    # NOTE(review): "Standard Stats.1" is assumed to be the
                    # value column of the saved table — confirm.
                    player_data[term] = first_row["Standard Stats.1"]
                except (KeyError, IndexError) as e:
                    print(f"Error extracting '{term}' for {player_name}: {e}")
            else:
                print(f"{term} not found for {player_name}")

        player_rows.append(player_data)

# A single concat at the end avoids re-copying the growing frame per player.
if player_rows:
    new_rows = pd.DataFrame(player_rows, columns=whole_df.columns)
    whole_df = pd.concat([whole_df, new_rows], ignore_index=True)

In [None]:
# Persist the merged table; with to_csv's defaults the row index is written
# as the first (unnamed) column.
whole_df.to_csv("fbref_final.csv")