In [None]:
from bs4 import BeautifulSoup
import requests
import re
import csv

# Get all teams from the top 7 leagues in Europe from 2010 to 2023

# Transfermarkt competition landing pages, one per league. get_teams appends a
# season query string to each of these URLs before requesting it.
leagues = [
    'https://www.transfermarkt.us/premier-league/startseite/wettbewerb/GB1',
    'https://www.transfermarkt.us/primera-division/startseite/wettbewerb/ES1',
    'https://www.transfermarkt.us/serie-a/startseite/wettbewerb/IT1',
    'https://www.transfermarkt.us/bundesliga/startseite/wettbewerb/L1',
    'https://www.transfermarkt.us/ligue-1/startseite/wettbewerb/FR1',
    'https://www.transfermarkt.us/eredivisie/startseite/wettbewerb/NL1',
    'https://www.transfermarkt.us/liga-nos/startseite/wettbewerb/PO1',
]

# HTTP headers sent with every request made by get_teams.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Team ids already written to the CSV. Shared across get_teams calls so a team
# that appears in several seasons or leagues is written only once.
team_ids = set()
# Path of the CSV that the driver loop below opens for writing.
teams_file = 'teams.csv'


def get_teams(league_url, year):
    """Scrape one league season page and record every club not seen before.

    Requests ``league_url`` for the season ``year``, finds every link whose
    href looks like ``/<slug>/startseite/verein/<id>`` and, for each club id
    not yet in the module-level ``team_ids`` set, writes a
    ``[slug, id, title]`` row through the module-level ``writer``.

    Args:
        league_url: Competition landing-page URL (no query string).
        year: Season identifier placed in the ``saison_id`` query parameter.

    Returns:
        The number of clubs newly written by this call. ``0`` is also
        returned, after printing a message, when the server answers with an
        error status.

    Raises:
        requests.exceptions.RequestException: On connection failures or when
            the request exceeds the timeout.
    """
    ext = f'/plus/?saison_id={year}'
    # Without a timeout requests waits indefinitely, so a single stalled
    # connection would hang the whole crawl.
    response = requests.get(league_url + ext, headers=headers, timeout=30)
    if not response.ok:
        # An error page would otherwise be parsed like a real one and be
        # indistinguishable from a season that simply has no new clubs.
        print(f'Request for {response.url} failed with status {response.status_code}')
        return 0

    soup = BeautifulSoup(response.text, 'html.parser')
    # One pattern both selects the links and extracts slug and id from them.
    team_link = re.compile(r'/(?P<name>[^/]+)/startseite/verein/(?P<id>\d+)')
    count = 0

    for tag in soup.find_all('a', href=team_link):
        match = team_link.search(tag.get('href'))
        if not match:
            continue
        team_id = match.group('id')
        if team_id in team_ids:
            continue
        team_ids.add(team_id)
        # The link's title attribute carries the display name; it is written
        # as-is (empty in the CSV when the attribute is absent).
        writer.writerow([match.group('name'), team_id, tag.get('title')])
        count += 1
    return count

# Crawl every league for every season and write the de-duplicated clubs to
# teams_file. `writer` is module-level on purpose: get_teams writes through it.
with open(teams_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['name', 'id', 'actual_name'])

    for league in leagues:
        # The slug depends only on the URL, so derive it once per league
        # rather than once per season.
        match = re.search(r'/([^/]+)/startseite', league)
        # Fall back to the URL itself so `name` is always bound and never
        # carries over the previous league's slug into the log output.
        name = match.group(1) if match else league
        for year in range(2010, 2024):
            print(f'Getting teams for {name} in {year}')
            print(f'Found {get_teams(league, year)} teams')

In [None]:
from bs4 import BeautifulSoup
import requests
import re
import csv

# Get all players from the top 7 leagues in Europe from 2010 to 2023

# Output CSV written by the driver loop below.
player_file = 'players.csv'
# Input CSV of clubs; the driver reads the first column as the URL slug and
# the second as the club id.
teams_file = 'teams.csv'
# Squad-page URL template. The driver loop fills in the 'team_name' and 'id'
# placeholders with str.replace before calling get_players.
team_url = "https://www.transfermarkt.us/team_name/kader/verein/id"
# Player ids already written to the CSV, shared across get_players calls so
# each player is written only once.
player_ids = set()

# HTTP headers sent with every request made by get_players.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

def get_players(team_url, year):
    """Scrape one squad page and record every player not seen before.

    Requests the squad page at ``team_url`` for the season ``year``, finds
    every link whose href looks like ``/<slug>/profil/spieler/<id>`` and, for
    each player id not yet in the module-level ``player_ids`` set, writes a
    ``[slug, id, link text]`` row through the module-level ``writer``.

    Args:
        team_url: Fully substituted squad-page URL (no query string).
        year: Season identifier placed in the ``saison_id`` query parameter.

    Returns:
        The number of players newly written by this call. ``0`` is also
        returned, after printing a message, when the server answers with an
        error status.

    Raises:
        requests.exceptions.RequestException: On connection failures or when
            the request exceeds the timeout.
    """
    ext = f'/plus/0/galerie/0?saison_id={year}'
    # Without a timeout requests waits indefinitely, so a single stalled
    # connection would hang the whole crawl.
    response = requests.get(team_url + ext, headers=headers, timeout=30)
    if not response.ok:
        # An error page would otherwise be parsed like a real one and be
        # indistinguishable from a squad with no new players.
        print(f'Request for {response.url} failed with status {response.status_code}')
        return 0

    soup = BeautifulSoup(response.text, 'html.parser')
    # One pattern both selects the links and extracts slug and id from them.
    player_link = re.compile(r'/(?P<name>[^/]+)/profil/spieler/(?P<id>\d+)')
    count = 0

    for tag in soup.find_all('a', href=player_link):
        match = player_link.search(tag.get('href'))
        if not match:
            continue
        player_id = match.group('id')
        if player_id in player_ids:
            continue
        player_ids.add(player_id)
        # The visible link text is used as the player's display name.
        writer.writerow([match.group('name'), player_id, tag.text.strip()])
        count += 1
    return count

# For every club in teams_file, crawl each season's squad page and write the
# de-duplicated players to player_file. `writer` is module-level on purpose:
# get_players writes through it.
with open(teams_file, mode='r', newline='', encoding='utf-8') as read_file:
    reader = csv.reader(read_file)
    next(reader)
    with open(player_file, mode='w', newline='', encoding='utf-8') as write_file:
        writer = csv.writer(write_file)
        writer.writerow(['name', 'id', 'actual_name'])
        for row in reader:
            team_name = row[0]
            team_id = row[1]
            # Substitute the id placeholder BEFORE the slug: replacing 'id'
            # afterwards would also rewrite any "id" inside the slug itself
            # (e.g. "real-madrid"), producing a corrupted URL.
            squad_url = team_url.replace('id', team_id).replace('team_name', team_name)
            for year in range(2010, 2024):
                print(f'Getting players for {team_name} in {year}')
                # Computed outside the f-string so the line does not rely on
                # nested same-type quotes, which only parse on Python 3.12+.
                found = get_players(squad_url, year)
                print(f'Found {found} players')



In [None]:
import csv
import requests
import re

# Get all player transfers from 2010 to 2023

# CSV of clubs: read by get_team_ids and appended to by get_player_transfers.
teams_file = 'teams.csv'
# CSV of players read by the driver loop below.
players_file = 'players.csv'
# Output CSV of transfers written by the driver loop below.
player_transfers_file = 'player_transfers.csv'
# HTTP headers sent with every request made by get_player_transfers.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
# Known clubs: maps the second CSV column (club id) to the first (name slug).
teams = {}

def get_team_ids():
    """Fill the module-level ``teams`` mapping from the teams CSV.

    Skips the header line, then stores each remaining row as
    ``teams[<second column>] = <first column>`` and prints how many entries
    the mapping holds afterwards.
    """
    with open(teams_file, mode='r', newline='', encoding='utf-8') as handle:
        rows = csv.reader(handle)
        next(rows)  # header line
        for record in rows:
            team_name, team_id = record[0], record[1]
            teams[team_id] = team_name
    print(f'Loaded {len(teams)} teams')

def _register_team(team_id, team_name):
    """Add a club to ``teams`` and append it to the teams CSV if it is new."""
    if team_id in teams:
        return
    print(f'Adding team {team_id} {team_name}')
    teams[team_id] = team_name
    # NOTE(review): appended rows have two columns (name, id); confirm that
    # every consumer of the teams CSV tolerates rows without a third column.
    with open(teams_file, mode='a', newline='', encoding='utf-8') as file:
        writer_teams = csv.writer(file)
        writer_teams.writerow([team_name, team_id])


def get_player_transfers(player_id):
    """Fetch a player's transfer history and write one CSV row per transfer.

    Each transfer's ``from`` and ``to`` club links are parsed for the club
    slug and id. Clubs missing from the module-level ``teams`` mapping are
    added to it and appended to the teams CSV. A
    ``[player_id, from_id, to_id, date]`` row is written through the
    module-level ``writer``.

    A transfer whose club link does not match the expected pattern is
    skipped with a printed message. The ids could otherwise be undefined on
    the first transfer or silently carried over from the previous one.

    Args:
        player_id: Player id used in the transfer-history endpoint URL.

    Returns:
        The number of transfer rows written.

    Raises:
        requests.exceptions.RequestException: On connection failures or when
            the request exceeds the timeout.
        ValueError: If the response body is not valid JSON.
        KeyError: If the response lacks one of the expected fields.
    """
    url = f'https://www.transfermarkt.com/ceapi/transferHistory/list/{player_id}'
    # Without a timeout requests waits indefinitely on a stalled connection.
    response = requests.get(url, headers=headers, timeout=30)
    transfers = response.json()['transfers']
    club_link = re.compile(r'/(?P<name>[^/]+)/transfers/verein/(?P<id>\d+)/saison_id/\d+')
    count = 0
    for transfer in transfers:
        from_match = club_link.search(transfer['from']['href'])
        to_match = club_link.search(transfer['to']['href'])
        if not (from_match and to_match):
            print(f'Skipping transfer for player {player_id}: unrecognised club link')
            continue
        from_id, from_name = from_match.group('id'), from_match.group('name')
        to_id, to_name = to_match.group('id'), to_match.group('name')
        date = transfer['dateUnformatted']
        _register_team(from_id, from_name)
        _register_team(to_id, to_name)
        writer.writerow([player_id, from_id, to_id, date])
        count += 1
    return count
        
    

# Load the known clubs, then fetch the transfer history of every player in
# players_file and write it to player_transfers_file. `writer` is module-level
# on purpose: get_player_transfers writes through it.
get_team_ids()
# players_file is produced as UTF-8, so it must be read as UTF-8 too; the
# platform default encoding can fail or mangle non-ASCII player names.
with open(players_file, mode='r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader)
    with open(player_transfers_file, mode='w', newline='', encoding='utf-8') as write_file:
        writer = csv.writer(write_file)
        writer.writerow(['player_id', 'from_id', 'to_id', 'date'])
        for row in reader:
            player_name = row[0]
            player_id = row[1]
            print(f'Found {get_player_transfers(player_id)} transfers for {player_name}')



In [None]:
import csv
import aiohttp
import asyncio

# Check if a player is a known player

# Minimum peak market value a player must have reached to count as an
# endpoint, keyed by the four-digit year (as a string) of that peak.
# is_endpoint compares these numbers against the value parsed from a string
# ending in 'm' (presumably millions; confirm the unit) and falls back to the
# '2010' entry for any year not listed here.
value_threshold = {
    '2010': 30, '2011': 30, '2012': 35, '2013': 35, '2014': 40, '2015': 40,
    '2016': 45, '2017': 45, '2018': 50, '2019': 50, '2020': 55, '2021': 60,
    '2022': 65, '2023': 70, '2024': 75
}

# Additional players (id -> display name) that
# load_players_and_check_endpoints checks after the players read from
# players.csv. Keys here are ints, whereas ids read from the CSV are strings.
extra_players = {
    79422: 'Keylor Navas',
    29260: 'Giorgio Chiellini'
}

async def is_endpoint(session, player_id, player_name):
    """Report whether a player's peak market value meets its year's threshold.

    Fetches the player's market-value data as JSON, reads the ``highest``
    value string and the ``highest_date`` string, and compares the numeric
    value with the ``value_threshold`` entry for the last four characters of
    the date (the ``'2010'`` entry when that year is not listed).

    Args:
        session: Object whose ``get(url)`` is used as an async context manager
            yielding a response with an awaitable ``json()``.
        player_id: Player id used in the endpoint URL.
        player_name: Name used only in the error message.

    Returns:
        ``True`` when the value meets or exceeds the threshold. ``False``
        when it does not, when the value string does not end in ``'m'``, or
        when any exception occurs (the error is printed).
    """
    url = f'https://www.transfermarkt.com/ceapi/marketValueDevelopment/graph/{player_id}'
    try:
        async with session.get(url) as response:
            payload = await response.json()
            peak_text = payload['highest']
            if peak_text[-1:] == 'm':
                # Drop the first character (presumably a currency symbol) and
                # the trailing 'm' to leave just the number.
                peak_value = float(peak_text[1:-1])
                peak_year = payload['highest_date'][-4:]
                required = value_threshold.get(peak_year, value_threshold['2010'])
                return required <= peak_value
            return False
    except Exception as e:
        # Best effort: any failure counts as "not an endpoint".
        print(f'Error processing {player_name} ({player_id}): {e}')
        return False

def chunk_list(data, size):
    """Yield consecutive slices of *data*, each at most *size* items long.

    Args:
        data: A sliceable sequence that supports ``len()``.
        size: Step between slice starts and maximum slice length.

    Yields:
        Slices of ``data`` in order; the last one may be shorter than ``size``.
    """
    offsets = range(0, len(data), size)
    yield from (data[start:start + size] for start in offsets)

async def load_players_and_check_endpoints():
    """Check every known player and write the qualifying ones to endpoints.csv.

    Reads ``(id, display name)`` pairs from the second and third columns of
    ``players.csv``, runs ``is_endpoint`` on them concurrently in batches of
    100, and writes a ``[name, id]`` row for each player that qualifies. The
    players in ``extra_players`` are checked afterwards. An extra player
    already written from the CSV pass is skipped, so it is neither written
    nor counted twice. Progress and totals are printed.
    """
    async with aiohttp.ClientSession() as session:
        with open('players.csv', mode='r', newline='', encoding='utf-8') as file:
            reader = csv.reader(file)
            next(reader)  # Skip header
            players_to_process = [(row[1], row[2]) for row in reader]

        with open('endpoints.csv', mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['name', 'id'])
            # Ids already written, kept as strings because CSV ids are
            # strings while extra_players keys are ints.
            written_ids = set()
            count = 0
            batch_count = 0
            for batch in chunk_list(players_to_process, 100):
                # One batch is awaited at a time, which caps the number of
                # requests in flight at the batch size.
                tasks = [
                    asyncio.create_task(is_endpoint(session, player_id, name))
                    for player_id, name in batch
                ]
                results = await asyncio.gather(*tasks)
                for result, (player_id, name) in zip(results, batch):
                    if result:
                        writer.writerow([name, player_id])
                        written_ids.add(str(player_id))
                        count += 1
                batch_count += len(batch)
                print(f'Processed {batch_count} players, found {count} endpoints')

            # Process extra players
            # NOTE(review): extra players go through the same is_endpoint
            # check as everyone else; confirm that is intended rather than
            # unconditional inclusion.
            for player_id, name in extra_players.items():
                if str(player_id) in written_ids:
                    continue
                if await is_endpoint(session, player_id, name):
                    writer.writerow([name, player_id])
                    written_ids.add(str(player_id))
                    count += 1
                    print(f'{name} {player_id} is an endpoint')

            print(f'Total endpoints found: {count}')
        print('Processing complete.')

# Start the endpoint check. asyncio.get_event_loop() is deprecated when no
# loop is running (and an error on the newest Python versions), so detect a
# running loop explicitly instead.
try:
    loop = asyncio.get_running_loop()
except RuntimeError:
    # Plain script: no loop is running, so let asyncio.run create one, run
    # the coroutine to completion and close the loop again.
    loop = None
    asyncio.run(load_players_and_check_endpoints())
else:
    # A loop is already running (as in a notebook kernel): schedule the
    # coroutine on it. Keep the reference in `task` so it is not
    # garbage-collected before it finishes.
    task = loop.create_task(load_players_and_check_endpoints())
