In [1]:
from bs4 import BeautifulSoup
import re
import requests
import pandas as pd
from constants import *
from datetime import datetime, timedelta
from PIL import Image
import os
from dataclasses import dataclass, fields

In [5]:
def get_cutoff_date(date_str: str) -> str:
    """Takes first matchdate datestring, returns next cutoff datestring for market value scraping"""
    date = datetime.strptime(date_str, "%b %d, %Y")
    while True:
        if date.day == 1 or date.day == 15:
            return date.strftime("%Y-%m-%d")
        else:
            date += timedelta(days=1)
            
def parse_market_value(market_value: str) -> int:
    """Takes market value string and converts it to numeric value"""
    if market_value == "-": return 0
    match = re.match(r"€(?P<value>[.0-9]+)(?P<multiplier>k|m|bn)", market_value)
    multiplier = match.group('multiplier')
    if multiplier == 'bn':
        return float(match.group('value')) * pow(10, 9)
    elif multiplier == 'm':
        return float(match.group('value')) * pow(10, 6)
    elif multiplier == 'k':
        return float(match.group('value')) * pow(10, 3)
    return 0

@dataclass
class MktVal:
    squad: str
    squad_size: int
    foreigners: int
    squad_value_avg: str
    squad_value: str
        
    def __post_init__(self):
        for field in fields(self):
            if field.type is str:
                value = getattr(self, field.name)
                if value:
                    setattr(self, field.name, value.strip())

COMP_LIST = [
    ('ENG1', "GB1", "premier-league"),
    ('ENG2', "GB2", "championship"),
    ('SPA1', "ES1", "laliga"),
    ('SPA2', "ES2", "laliga2"),
    ('ITA1', "IT1", "serie-a"),
    ('ITA2', "IT2", "serie-b"),
    ('GER1', "L1", "bundesliga"),
    ('GER2', "L2", "2-bundesliga"),
    ('FRA1', "FR1", "ligue-1"),
    ('FRA2', "FR2", "ligue-2"),
]

# Go to https://www.whatismybrowser.com/detect/what-http-headers-is-my-browser-sending
# Find "User-Agent"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}
club_icon_urls = {}

for country, comp_id, comp_name in COMP_LIST:
    for season_start in range(LAST_SEASON_START_YEAR, FIRST_SEASON_START_YEAR-1, -1):
        try:
#     #         Get earliest after matchday 1 (i.e. closest to end of summer transfer window)
#     #             Get site content (matchday=1)
#             response_season_start = requests.get(
#                 f"https://www.transfermarkt.com/{comp_name}/spieltagtabelle/wettbewerb/{comp_id}?saison_id={season_start}&spieltag=1",
#                 headers=headers
#             )
#     #             Scrape first matchday date
#     #             Then get first date after matchday 1 closest to one of the options (first, 15th, last of each month)
#             soup = BeautifulSoup(re.compile("<!--|-->").sub("", response_season_start.text),'lxml')
#             cutoff_date = get_cutoff_date(soup.find_all("a", text=re.compile("\w{3} \d{1,2}, \d{4}"))[0].string.strip())
            
#             Scrape squad value data
#             response_squad_value = requests.get(
#                 f"https://www.transfermarkt.com/{comp_name}/marktwerteverein/wettbewerb/{comp_id}/stichtag/{cutoff_date}/plus/1",
#                 headers=headers
#             )
            response_squad_value = requests.get(
                f"https://www.transfermarkt.com/{comp_name}/startseite/wettbewerb/{comp_id}/saison_id/{season_start}",
                headers=headers
            )
            soup = BeautifulSoup(response_squad_value.content, "html.parser")
            df_data = []
            for row in soup.find("table", class_="items").select("tr")[2:]:
    #                 Get selected rows
                entry = MktVal(*[x.text for idx, x in enumerate(row.find_all("td")) if not idx in [0,3]])
                df_data.append(entry)
            
#                 Download club logos (if not exists)
                if not entry.squad in club_icon_urls.keys():
                    try:
                        club_icon_urls[entry.squad] = "https://www.transfermarkt.com" + \
                            row.find('a', {"title": entry.squad})['href']
                    except Exception as e:
                        print(f"{entry.squad} page not found")

    #             Save dataframe as csv
            df = pd.DataFrame(df_data, columns=[field.name for field in fields(MktVal)])
            df[['squad_value_avg','squad_value']] = df[['squad_value_avg','squad_value']].applymap(parse_market_value)
            df.to_csv(
                f"../data/squad_values/raw/{country}{season_start}_squad_values_{season_start+1}.csv",
                index=False
            )
            print(f"Saved successfully: {country} {season_start}-{season_start+1}")
        except Exception as e:
            print(f"Unable to get data for {country} {season_start}-{season_start+1}: {e}")
            
# Download club logos (if not exists)
for club_name, club_page in club_icon_urls.items():
    image_path = f"../data/club_icons/{club_name}.png"
    if not os.path.exists(image_path):
        try:
            response_squad_value = requests.get(club_page, headers=headers)
#             Go to club's page
            soup = BeautifulSoup(response_squad_value.content, "html.parser")
#             Find club's logo and download it
            Image.open(requests.get(soup.find('img', {"title": club_name})['src'], stream=True).raw).save(image_path)
        except Exception as e:
            print(f"Unable to download image: {e}")
        

Saved successfully: ENG1 2023-2024
Saved successfully: ENG1 2022-2023
Saved successfully: ENG1 2021-2022
Saved successfully: ENG1 2020-2021
Saved successfully: ENG1 2019-2020
Saved successfully: ENG1 2018-2019
Saved successfully: ENG2 2023-2024
Saved successfully: ENG2 2022-2023
Saved successfully: ENG2 2021-2022
Saved successfully: ENG2 2020-2021
Saved successfully: ENG2 2019-2020
Saved successfully: ENG2 2018-2019
Saved successfully: SPA1 2023-2024
Saved successfully: SPA1 2022-2023
Saved successfully: SPA1 2021-2022
Saved successfully: SPA1 2020-2021
Saved successfully: SPA1 2019-2020
Saved successfully: SPA1 2018-2019
Saved successfully: SPA2 2023-2024
Saved successfully: SPA2 2022-2023
Saved successfully: SPA2 2021-2022
Saved successfully: SPA2 2020-2021
Saved successfully: SPA2 2019-2020
Saved successfully: SPA2 2018-2019
Saved successfully: ITA1 2023-2024
Saved successfully: ITA1 2022-2023
Saved successfully: ITA1 2021-2022
Saved successfully: ITA1 2020-2021
Saved successfully: 

In [4]:
# Download club logos (if not exists)
for club_name, club_page in club_icon_urls.items():
    image_path = f"../data/club_icons/{club_name}.png"
    if not os.path.exists(image_path):
        try:
            response_squad_value = requests.get(club_page, headers=headers)
#             Go to club's page
            soup = BeautifulSoup(response_squad_value.content, "html.parser")
#             Find club's logo and download it
            Image.open(requests.get(soup.find('img', {"title": club_name})['src'], stream=True).raw).save(image_path)
        except Exception as e:
            print(f"Unable to download image: {e}")

Unable to download image: 'NoneType' object is not subscriptable
Unable to download image: 'NoneType' object is not subscriptable
Unable to download image: 'NoneType' object is not subscriptable
Unable to download image: 'NoneType' object is not subscriptable
Unable to download image: 'NoneType' object is not subscriptable
Unable to download image: 'NoneType' object is not subscriptable
Unable to download image: 'NoneType' object is not subscriptable
Unable to download image: 'NoneType' object is not subscriptable
Unable to download image: 'NoneType' object is not subscriptable
Unable to download image: 'NoneType' object is not subscriptable
