In [None]:
import pandas as pd
import requests
import json
import re
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

### 1. SCRAPE ALL CLUB LINKS

In [None]:
# Mapping of league key -> list of club page hrefs, filled in by the parsing cell.
# NOTE(review): re-running this cell resets the dict, so leagues collected
# earlier in the same session are dropped before the JSON dump.
all_club_urls = {}

# This link is for the Premier League; it was updated manually for each of the
# five leagues.
url = "https://www.transfermarkt.us/premier-league/startseite/wettbewerb/GB1"

# Browser-like request headers sent with the league page request.
# NOTE(review): advertising "br" assumes a Brotli decoder is available to
# requests/urllib3 in this environment - confirm.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/avif,image/webp,image/apng,*/*;q=0.8,"
            "application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.google.com/",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1"
}

session = requests.Session()
# Without a timeout a stalled connection would hang the notebook indefinitely.
response = session.get(url, headers=headers, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page into an empty club list.
response.raise_for_status()

In [None]:
soup = BeautifulSoup(response.text, 'html.parser')

# Club overview links; the href filter only matches anchors that carry an href,
# so no separate has_attr() check is needed.
anchors = soup.find_all('a', href=re.compile(r'/.*?/startseite/verein/\d+/saison_id/\d+'))

# De-duplicate via a set, then sort so the stored list (and the JSON written
# from it) has the same order on every run.
hrefs = sorted({a['href'] for a in anchors})

# Again, the league key was updated manually, e.g. pl, league1, bundesliga, etc.
all_club_urls["pl"] = hrefs


In [None]:
# open(..., "w") does not create parent folders, so make sure the output
# directory exists before the first write.
os.makedirs("transfermarkt", exist_ok=True)

# Persist the league -> club hrefs mapping as pretty-printed JSON.
with open("transfermarkt/all_clubs_tfmkt.json", "w") as file:
    json.dump(all_club_urls, file, indent=2)

### 2. SCRAPE ALL PLAYER LINKS

In [None]:
# Mapping of league key -> list of player profile hrefs, filled by scrape_players().
league_players_link = {}

def scrape_players(league,hrefs):
    """Collect player profile links from every club page of one league.

    Each href is appended to the Transfermarkt base URL, fetched, and scanned
    for anchors whose href contains ``/profil/spieler/``. The unique links, in
    first-seen order, are stored in the module-level ``league_players_link``
    under ``league``.

    Args:
        league: Key under which the collected links are stored.
        hrefs: Iterable of club page paths (joined onto the base URL).

    Returns:
        None. The result is written to ``league_players_link[league]``.

    Raises:
        requests.RequestException: Propagated from ``session.get`` (for
            example on connection failure or timeout).
    """
    # Browser-like request headers; built once because they never change per club.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/122.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,"
                "image/avif,image/webp,image/apng,*/*;q=0.8,"
                "application/signed-exchange;v=b3;q=0.7",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "DNT": "1",  # Do Not Track
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    }
    player_href_pattern = re.compile(r'/profil/spieler/')

    # A dict is used as an insertion-ordered set so a player linked from more
    # than one club page is stored only once.
    unique_links = {}

    # One session for the whole league: connections are reused and the session
    # is closed when the block exits.
    with requests.Session() as session:
        for href in hrefs:
            url = f"https://www.transfermarkt.us{href}"
            # Without a timeout a stalled connection would block the loop forever.
            response = session.get(url, headers=headers, timeout=30)

            if response.ok:
                soup = BeautifulSoup(response.text, 'html.parser')
                for anchor in soup.find_all('a', href=player_href_pattern):
                    unique_links.setdefault(anchor['href'], None)
            else:
                # An error page holds no player links; report it instead of
                # silently contributing nothing.
                print("Skipping club page", url, "- HTTP status", response.status_code)

            # Pause between club pages to keep the request rate low.
            time.sleep(4)

    league_players_link[league] = list(unique_links)

In [None]:
# Collect the player links of every league that has club links stored.
for league_key in all_club_urls:
    club_hrefs = all_club_urls[league_key]
    scrape_players(league_key, club_hrefs)

In [None]:
# open(..., "w") does not create parent folders, so make sure the output
# directory exists before writing.
os.makedirs("transfermarkt", exist_ok=True)

# Persist the league -> player hrefs mapping as pretty-printed JSON.
with open("transfermarkt/all_players_link.json", "w") as file:
    json.dump(league_players_link, file, indent=2)

### 3. SCRAPE ALL PLAYERS DATA

In [None]:
# Names from the FBref export; the scraper only processes players found here.
fbref_df = pd.read_csv("fbref_final.csv")
all_fbref_players = [name for name in fbref_df["Name"]]

# Bookkeeping lists appended to by the Selenium scraper.
player_unique_list = list()
error_players_list = list()
missed_players_list = list()

# Column layout of the scraped Transfermarkt rows, and the empty frame they
# are accumulated into.
new_columns = ["Name", "Age", "Playing Stats", "Market Value"]
all_transfermarkt_df = pd.DataFrame(columns=new_columns)

In [None]:
def _parse_market_value(text):
    """Parse the market-value header text into a float expressed in millions.

    Accepts whole or decimal amounts with an optional ``k`` / ``m`` / ``bn``
    suffix; ``k`` amounts are divided by 1000 and ``bn`` amounts multiplied by
    1000 so every value shares one unit. Returns None when ``text`` is empty or
    contains no number.

    NOTE(review): assumes the header renders values like "€25.00m" or "€800k"
    - confirm against a live page.
    """
    if not text:
        return None
    match = re.search(r"(\d+(?:\.\d+)?)\s*(bn|m|k)?", text)
    if match is None:
        return None
    amount = float(match.group(1))
    unit = match.group(2)
    if unit == "k":
        return amount / 1000
    if unit == "bn":
        return amount * 1000
    return amount


def scrape_players_stats_selenium(link_set):
    """Scrape age, playing stats and market value for each player link.

    The player name is derived from the second path segment of each link
    (hyphens become spaces, each part capitalized). Only names present in
    ``all_fbref_players`` are scraped; other names are appended to
    ``missed_players_list``. Names already in ``player_unique_list`` are
    skipped. Each scraped player becomes one row appended to the module-level
    ``all_transfermarkt_df``; players whose page raised are appended to
    ``error_players_list``.

    Args:
        link_set: Iterable of player profile paths (joined onto the
            Transfermarkt base URL).

    Returns:
        None. Results are written to the module-level frame and lists.
    """
    global all_transfermarkt_df

    # Set lookup instead of scanning the full FBref name list for every link.
    fbref_names = set(all_fbref_players)

    # Browser options are identical for every player, so build them once.
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
                        "Chrome/122.0.0.0 Safari/537.36")

    # Rows are collected here and concatenated once; growing the DataFrame
    # row by row copies it on every iteration.
    new_rows = []

    try:
        for link in link_set:
            slug = link.split("/")[1]
            player_name = " ".join(part.capitalize() for part in slug.split("-"))

            if player_name not in fbref_names:
                missed_players_list.append(player_name)
                continue
            if player_name in player_unique_list or player_name in missed_players_list:
                continue

            driver = None
            try:
                url = f"https://www.transfermarkt.us{link}"

                driver = webdriver.Chrome(options=options)
                driver.get(url)
                # Fixed wait before reading the page source.
                time.sleep(5)

                soup = BeautifulSoup(driver.page_source, 'html.parser')

                # Extract Playing Stats: all "percentage-value" texts, comma-joined.
                percentage_tags = soup.find_all(class_="percentage-value")
                playing_stats = ",".join(tag.text.strip() for tag in percentage_tags)

                # Extract Age: raw text of the birth-date header span.
                age_tag = soup.find("span", class_="data-header__content", itemprop="birthDate")
                age = age_tag.text.strip() if age_tag else None

                # Extract Market Value.
                market_tag = soup.find("a", class_="data-header__market-value-wrapper")
                market_text = market_tag.get_text(strip=True) if market_tag else None
                market_val = _parse_market_value(market_text)

                new_rows.append({
                    "Name": player_name,
                    "Age": age,
                    "Playing Stats": playing_stats,
                    "Market Value": market_val
                })
                player_unique_list.append(player_name)

            except Exception as e:
                print ("Exception occured for player", player_name, e)
                error_players_list.append(player_name)

            finally:
                # Always shut the browser down, including when scraping raised;
                # otherwise every failed player leaves a Chrome process behind.
                if driver is not None:
                    driver.quit()
    finally:
        # Flush collected rows even if the loop is interrupted part-way.
        if new_rows:
            all_transfermarkt_df = pd.concat(
                [all_transfermarkt_df, pd.DataFrame(new_rows)],
                ignore_index=True
            )


In [None]:
# Reload the stored league -> player hrefs mapping.
with open("transfermarkt/all_players_link.json", "r") as f:
    league_players_link = json.load(f)


for league, player_links in league_players_link.items():
    # all_transfermarkt_df keeps growing across leagues, so remember where this
    # league's rows start and write only those. Writing the whole frame would
    # repeat every earlier league in each later file.
    first_new_row = len(all_transfermarkt_df)
    scrape_players_stats_selenium(player_links)

    league_df = all_transfermarkt_df.iloc[first_new_row:].reset_index(drop=True)
    # The index is written on purpose: the cleaning step drops the resulting
    # "Unnamed: 0" column after the files are read back.
    league_df.to_csv(f"transfermarkt/{league}_players_tfmkt.csv")


### 4. CLEAN AND MERGE ALL PLAYERS DATA

In [None]:
# Start the merge from a fresh, empty frame with the scraped-row column layout.
new_columns = ["Name", "Age", "Playing Stats", "Market Value"]
all_transfermarkt_df = pd.DataFrame(columns=new_columns)

In [None]:
# Read every CSV in the transfermarkt folder (sorted for a stable row order)
# and concatenate once instead of re-copying the frame for each file.
league_frames = [
    pd.read_csv(os.path.join("transfermarkt", file_name))
    for file_name in sorted(os.listdir("transfermarkt"))
    if file_name.endswith(".csv")
]
all_transfermarkt_df = pd.concat(
    [all_transfermarkt_df, *league_frames],
    ignore_index=True
)

# Guard against the same player appearing in more than one per-league file:
# keep the first occurrence of each name.
all_transfermarkt_df = all_transfermarkt_df.drop_duplicates(
    subset="Name", keep="first", ignore_index=True
)


In [None]:
# Build the cleaned frame separately and rebind at the end, so a failure
# part-way through leaves all_transfermarkt_df untouched.
# "Unnamed: 0" is the index column written by to_csv; tolerate its absence.
cleaned_df = all_transfermarkt_df.drop(columns=["Unnamed: 0"], errors="ignore")

# Age: keep the digits following the first "(" in the raw text. Rows with a
# missing age stay NaN instead of raising on float.split.
# NOTE(review): assumes the raw text looks like "<birth date> (37)" - confirm.
cleaned_df["Age"] = cleaned_df["Age"].str.extract(r"\((\d+)", expand=False)

# Playing Stats: split the comma-joined values into three named columns.
# reindex pads missing parts with NaN and drops extras, so a player without
# exactly three values no longer breaks the assignment.
stat_columns = ['Starting Eleven', 'Minutes', 'Goal Involvement']
stat_parts = (
    cleaned_df['Playing Stats']
    .str.split(',', expand=True)
    .reindex(columns=range(len(stat_columns)))
)
cleaned_df[stat_columns] = stat_parts

all_transfermarkt_df = cleaned_df.drop(columns=["Playing Stats"])
all_transfermarkt_df.head()

In [None]:
# Export the cleaned, merged Transfermarkt data (index column included).
all_transfermarkt_df.to_csv(path_or_buf="transfermarkt_final.csv")