In [5]:
from bs4 import BeautifulSoup
import requests
import re
import csv

# Get all teams from the top 5 leagues and some extra leagues

def _league_url(slug, code):
    """Return the Transfermarkt league overview URL for a slug and competition code."""
    return f'https://www.transfermarkt.us/{slug}/startseite/wettbewerb/{code}'


# Leagues scraped for several seasons; their clubs end up in player_teams.csv.
player_leagues = [
    _league_url(slug, code)
    for slug, code in (
        ('premier-league', 'GB1'),
        ('primera-division', 'ES1'),
        ('serie-a', 'IT1'),
        ('bundesliga', 'L1'),
        ('ligue-1', 'FR1'),
    )
]

# Additional leagues whose clubs are only added to teams.csv.
extra_leagues = [
    _league_url(slug, code)
    for slug, code in (
        ('liga-nos', 'PO1'),
        ('super-lig', 'TR1'),
        ('eredivisie', 'NL1'),
        ('jupiler-pro-league', 'BE1'),
        ('major-league-soccer', 'MLS1'),
        ('campeonato-brasileiro-serie-a', 'BRA1'),
        ('superliga', 'AR1N'),
        ('saudi-professional-league', 'SA1'),
    )
]

# Request headers sent with every page fetch (browser-like User-Agent).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    )
}

# Output CSV paths.
player_teams_file, all_teams_file = 'player_teams.csv', 'teams.csv'

# Scrape state shared with get_teams: ids already recorded, and the recorded
# [name, id, actual_name] rows in discovery order.
team_ids = set()
teams = []


def get_teams(league_url, year):
    """Scrape one league season page and record every club not seen before.

    Each new club is added to the module-level `team_ids` set, appended to the
    module-level `teams` list and written as a row through the module-level
    `writer`, so a CSV writer must be bound to that name before calling.

    Args:
        league_url: League overview URL; the season query string is appended to it.
        year: Season id passed as `saison_id`.

    Returns:
        Number of clubs recorded by this call. Clubs already present in
        `team_ids` are not counted. Returns 0 when the server answers with an
        HTTP error status.

    Raises:
        requests.RequestException: On connection problems or when the request
            times out.
    """
    request_timeout = 30  # seconds; without a timeout a stalled connection hangs forever
    # One pattern both selects the anchors and extracts slug/id from their href.
    team_link = re.compile(r'/(?P<name>[^/]+)/startseite/verein/(?P<id>\d+)')

    ext = f'/plus/?saison_id={year}'
    response = requests.get(league_url + ext, headers=headers, timeout=request_timeout)
    if not response.ok:
        # Do not parse an error page as if it were an empty league.
        print(f'Skipping {response.url}: HTTP {response.status_code}')
        return 0

    soup = BeautifulSoup(response.text, 'html.parser')
    count = 0

    for tag in soup.find_all('a', href=team_link):
        match = team_link.search(tag.get('href'))
        if not match:
            continue
        team_id = match.group('id')
        if team_id in team_ids:
            continue  # the same club is linked several times and across seasons
        name = match.group('name')
        actual_name = tag.get('title')
        team_ids.add(team_id)
        teams.append([name, team_id, actual_name])
        writer.writerow([name, team_id, actual_name])
        count += 1
    return count

def league_slug(league_url):
    """Return the league's URL slug (e.g. 'premier-league').

    Falls back to the full URL when no slug can be extracted, so the progress
    message never relies on an unbound or stale name.
    """
    match = re.search(r'/([^/]+)/startseite', league_url)
    return match.group(1) if match else league_url


def scrape_leagues(leagues, years):
    """Call get_teams for every league/season pair, printing progress.

    get_teams writes through the module-level `writer`, so this must be called
    while that writer's file is still open.
    """
    for league in leagues:
        name = league_slug(league)  # does not depend on the season: resolve once
        for year in years:
            print(f'Getting teams for {name} in {year}')
            print(f'Found {get_teams(league, year)} teams')


with open(player_teams_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['name', 'id', 'actual_name'])

    scrape_leagues(player_leagues, range(2020, 2024))

print('Getting player teams done')

with open(all_teams_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['name', 'id', 'actual_name'])

    # Start with every club collected above, then add the extra leagues;
    # get_teams writes the newly found clubs to this file as it goes.
    writer.writerows(teams)

    scrape_leagues(extra_leagues, range(2023, 2024))

print('Getting all teams done')

Getting teams for premier-league in 2020
Found 20 teams
Getting teams for premier-league in 2021
Found 3 teams
Getting teams for premier-league in 2022
Found 2 teams
Getting teams for premier-league in 2023
Found 1 teams
Getting teams for primera-division in 2020
Found 20 teams
Getting teams for primera-division in 2021
Found 3 teams
Getting teams for primera-division in 2022
Found 2 teams
Getting teams for primera-division in 2023
Found 1 teams
Getting teams for serie-a in 2020
Found 20 teams
Getting teams for serie-a in 2021
Found 3 teams
Getting teams for serie-a in 2022
Found 3 teams
Getting teams for serie-a in 2023
Found 1 teams
Getting teams for bundesliga in 2020
Found 18 teams
Getting teams for bundesliga in 2021
Found 2 teams
Getting teams for bundesliga in 2022
Found 0 teams
Getting teams for bundesliga in 2023
Found 2 teams
Getting teams for ligue-1 in 2020
Found 20 teams
Getting teams for ligue-1 in 2021
Found 2 teams
Getting teams for ligue-1 in 2022
Found 3 teams
Getting

In [6]:
from bs4 import BeautifulSoup
import requests
import re
import csv

# Get all players from the top 5 league teams

# Input (clubs to visit) and output (players found) CSV paths.
teams_file = 'player_teams.csv'
player_file = 'players.csv'

# Squad page template; 'team_name' and 'id' are placeholders filled in per club.
team_url = "https://www.transfermarkt.us/team_name/kader/verein/id"

# Ids of players already written, shared with get_players.
player_ids = set()

# Request headers sent with every page fetch (browser-like User-Agent).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    )
}

def get_players(team_url, year):
    """Scrape one club's squad page for a season and record unseen players.

    Each new player is added to the module-level `player_ids` set and written
    as a `[name, id, actual_name]` row through the module-level `writer`, so a
    CSV writer must be bound to that name before calling.

    Args:
        team_url: Squad page URL of the club; the season suffix is appended to it.
        year: Season id passed as `saison_id`.

    Returns:
        Number of players recorded by this call. Players already present in
        `player_ids` are not counted. Returns 0 when the server answers with an
        HTTP error status.

    Raises:
        requests.RequestException: On connection problems or when the request
            times out.
    """
    request_timeout = 30  # seconds; without a timeout a stalled connection hangs forever
    # One pattern both selects the anchors and extracts slug/id from their href.
    player_link = re.compile(r'/(?P<name>[^/]+)/profil/spieler/(?P<id>\d+)')

    ext = f'/plus/0/galerie/0?saison_id={year}'
    response = requests.get(team_url + ext, headers=headers, timeout=request_timeout)
    if not response.ok:
        # Do not parse an error page as if it were an empty squad.
        print(f'Skipping {response.url}: HTTP {response.status_code}')
        return 0

    soup = BeautifulSoup(response.text, 'html.parser')
    count = 0

    for tag in soup.find_all('a', href=player_link):
        match = player_link.search(tag.get('href'))
        if not match:
            continue
        player_id = match.group('id')
        if player_id in player_ids:
            continue  # already recorded from another link, season or club
        player_ids.add(player_id)
        # The display name is the anchor's visible text.
        writer.writerow([match.group('name'), player_id, tag.text.strip()])
        count += 1
    return count

def build_team_url(team_name, team_id):
    """Fill the `team_url` template for one club.

    The id placeholder is replaced first and anchored on its path segment.
    A bare `.replace('id', ...)` applied after the name substitution would also
    rewrite every "id" inside the club slug (e.g. 'real-madrid').
    """
    return (
        team_url
        .replace('/verein/id', f'/verein/{team_id}')
        .replace('team_name', team_name)
    )


with open(teams_file, mode='r', newline='', encoding='utf-8') as read_file:
    reader = csv.reader(read_file)
    next(reader)  # skip the header row
    with open(player_file, mode='w', newline='', encoding='utf-8') as write_file:
        # get_players writes its rows through this module-level writer.
        writer = csv.writer(write_file)
        writer.writerow(['name', 'id', 'actual_name'])
        for row in reader:
            team_name = row[0]
            team_id = row[1]
            # Built outside the f-string: reusing the same quote character inside
            # an f-string expression is a syntax error before Python 3.12.
            squad_url = build_team_url(team_name, team_id)
            for year in range(2010, 2024):
                print(f'Getting players for {team_name} in {year}')
                print(f'Found {get_players(squad_url, year)} players')



Getting players for manchester-city in 2010
Found 45 players
Getting players for manchester-city in 2011
Found 12 players
Getting players for manchester-city in 2012
Found 11 players
Getting players for manchester-city in 2013
Found 6 players
Getting players for manchester-city in 2014
Found 13 players
Getting players for manchester-city in 2015
Found 16 players
Getting players for manchester-city in 2016
Found 10 players
Getting players for manchester-city in 2017
Found 10 players
Getting players for manchester-city in 2018
Found 18 players
Getting players for manchester-city in 2019
Found 8 players
Getting players for manchester-city in 2020
Found 7 players
Getting players for manchester-city in 2021
Found 12 players
Getting players for manchester-city in 2022
Found 11 players
Getting players for manchester-city in 2023
Found 5 players
Getting players for manchester-united in 2010
Found 42 players
Getting players for manchester-united in 2011
Found 11 players
Getting players for manc

In [None]:
import csv
import aiohttp
import asyncio

# Check if a player is a known player

# Minimum peak market value a player needs, keyed by the year (as a string) in
# which the peak was reached. Values are compared against the number parsed from
# strings ending in 'm' (presumably millions).
value_threshold = {
    str(year): minimum
    for year, minimum in zip(
        range(2010, 2025),
        (30, 30, 35, 35, 40, 40, 45, 45, 50, 50, 55, 60, 65, 70, 75),
    )
}

# Players that are always written to endpoints.csv, keyed by player id.
extra_players = dict([
    (79422, 'Keylor Navas'),
    (29260, 'Giorgio Chiellini'),
])

async def is_endpoint(session, player_id, player_name):
    """Decide whether a player's peak market value clears the yearly threshold.

    Fetches the player's market-value data and compares the reported highest
    value with the `value_threshold` entry for the year in which that peak was
    reached.

    Args:
        session: Client session; only `session.get(url)` used as an async
            context manager yielding an object with an awaitable `json()` is
            required.
        player_id: Player id interpolated into the request URL.
        player_name: Only used in the error message.

    Returns:
        True when the peak value is at least the threshold for its year.
        False otherwise, including when the peak string does not end in 'm'
        or when any error occurs (the error is printed, not raised).
    """
    url = f'https://www.transfermarkt.com/ceapi/marketValueDevelopment/graph/{player_id}'
    try:
        async with session.get(url) as response:
            data = await response.json()
            highest_market_value = data['highest']
            # Values not ending in 'm' can never reach the thresholds.
            if highest_market_value[-1:] != 'm':
                return False
            # Drop the first character (presumably a currency symbol) and the trailing 'm'.
            highest_market_value = float(highest_market_value[1:-1])
            highest_market_value_date = data['highest_date']
            year = highest_market_value_date[-4:]
            if year not in value_threshold:
                known_years = sorted(value_threshold, key=int)
                # A peak newer than the table must use the latest threshold;
                # falling back to the earliest would apply the lowest bar to the
                # most recent values. Older or unparsable years keep the earliest.
                is_newer = year.isdecimal() and int(year) > int(known_years[-1])
                year = known_years[-1] if is_newer else known_years[0]
            return value_threshold[year] <= highest_market_value
    except Exception as e:
        # Best effort: a failing lookup only excludes this player.
        print(f'Error processing {player_name} ({player_id}): {e}')
        return False

def chunk_list(data, size):
    """Yield consecutive slices of `data`, each at most `size` items long."""
    yield from (data[start:start + size] for start in range(0, len(data), size))

async def load_players_and_check_endpoints():
    """Check every player in players.csv and write the accepted ones to endpoints.csv.

    Players are checked with `is_endpoint` in batches of 100 concurrent
    requests. Every entry of `extra_players` is appended afterwards regardless
    of any check. Progress and totals are printed.
    """
    async with aiohttp.ClientSession() as session:
        with open('players.csv', mode='r', newline='', encoding='utf-8') as file:
            reader = csv.reader(file)
            next(reader)  # Skip header
            # (id, actual_name) pairs taken from columns 1 and 2.
            players_to_process = [(row[1], row[2]) for row in reader]

        with open('endpoints.csv', mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['name', 'id'])
            count = 0
            batch_count = 0
            for batch in chunk_list(players_to_process, 100):
                pending = [
                    asyncio.create_task(is_endpoint(session, player_id, player_name))
                    for player_id, player_name in batch
                ]
                verdicts = await asyncio.gather(*pending)
                # gather returns results in task order, so they line up with the batch.
                for (player_id, player_name), accepted in zip(batch, verdicts):
                    if not accepted:
                        continue
                    writer.writerow([player_name, player_id])
                    count += 1
                batch_count += len(batch)
                print(f'Processed {batch_count} players, found {count} endpoints')

            # Process extra players
            for extra_id, extra_name in extra_players.items():
                writer.writerow([extra_name, extra_id])
                count += 1
                print(f'{extra_name} {extra_id} is an endpoint')

            print(f'Total endpoints found: {count}')
        print('Processing complete.')

# asyncio.get_event_loop() is deprecated when no loop is running, so probe for
# a running loop explicitly instead.
try:
    loop = asyncio.get_running_loop()
except RuntimeError:
    loop = None

if loop is None:
    # No loop is running (plain script): drive the coroutine to completion.
    asyncio.run(load_players_and_check_endpoints())
else:
    # A loop is already running (e.g. inside a notebook kernel): schedule the
    # coroutine on it. This returns immediately; `task` keeps a reference so the
    # scheduled work is not garbage-collected before it finishes.
    task = loop.create_task(load_players_and_check_endpoints())


In [7]:
import csv
import aiohttp
import asyncio
import re

# Input (players to look up) and output (raw transfer rows) CSV paths.
players_file, player_transfers_file = 'players.csv', 'player_transfers.csv'

# Request headers sent with every API call (browser-like User-Agent).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    )
}

def chunk_list(data, size):
    """Split `data` into successive slices of at most `size` items, lazily."""
    offsets = range(0, len(data), size)
    for offset in offsets:
        end = offset + size
        yield data[offset:end]

async def get_player_transfers(session, player_id, writer):
    """Fetch one player's transfer history and write the club-to-club moves.

    A transfer is written as `[player_id, from_id, to_id, date]` only when both
    its 'from' and 'to' links match the club transfer URL pattern. Any error is
    printed and ends processing for this player.

    Args:
        session: Client session used for the request.
        player_id: Player id interpolated into the request URL.
        writer: Object with a `writerow` method receiving each row.

    Returns:
        Number of rows written for this player before completion or error.
    """
    club_href = r'/(?P<name>[^/]+)\/transfers/verein/(?P<id>\d+)/saison_id/\d+'
    endpoint = f'https://www.transfermarkt.com/ceapi/transferHistory/list/{player_id}'
    written = 0
    try:
        async with session.get(endpoint, headers=headers) as response:
            payload = await response.json()
            for entry in payload['transfers']:
                origin = re.search(club_href, entry['from']['href'])
                destination = re.search(club_href, entry['to']['href'])
                if not (origin and destination):
                    continue  # at least one side is not a regular club link
                writer.writerow([
                    player_id,
                    origin.group('id'),
                    destination.group('id'),
                    entry['dateUnformatted'],
                ])
                written += 1
    except Exception as e:
        print(f"Error fetching transfers for player {player_id}: {e}")
    return written


async def process_players():
    """Fetch the transfer history of every player and write it to one CSV.

    Reads `(id, actual_name)` pairs from `players_file`, then calls
    `get_player_transfers` for batches of 100 players concurrently; each call
    writes its rows to `player_transfers_file` through a shared CSV writer.
    Progress and totals are printed.
    """
    # Load the work list before opening the HTTP session; no network is needed yet.
    with open(players_file, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader)  # Skip header
        players_to_process = [(row[1], row[2]) for row in reader]

    async with aiohttp.ClientSession() as session:
        with open(player_transfers_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['player_id', 'from_id', 'to_id', 'date'])
            total_transfers = 0
            batch_count = 0
            for batch in chunk_list(players_to_process, 100):
                tasks = [get_player_transfers(session, player_id, writer) for player_id, _ in batch]
                results = await asyncio.gather(*tasks)
                batch_transfers = sum(results)
                print(f"Processed batch: {batch_transfers} transfers")
                total_transfers += batch_transfers
                # Count the real batch size: the last batch is usually shorter than 100.
                batch_count += len(batch)
                print(f'Processed {batch_count} players')

        print(f'Total transfers found: {total_transfers}')
        print('Processing complete.')

# asyncio.get_event_loop() is deprecated when no loop is running, so probe for
# a running loop explicitly instead.
try:
    loop = asyncio.get_running_loop()
except RuntimeError:
    loop = None

if loop is None:
    # No loop is running (plain script): drive the coroutine to completion.
    asyncio.run(process_players())
else:
    # A loop is already running (e.g. inside a notebook kernel): schedule the
    # coroutine on it. This returns immediately; `task` keeps a reference so the
    # scheduled work is not garbage-collected before it finishes.
    task = loop.create_task(process_players())


Processed batch: 1262 transfers
Processed 100 players
Processed batch: 888 transfers
Processed 200 players
Processed batch: 1148 transfers
Processed 300 players
Processed batch: 951 transfers
Processed 400 players
Processed batch: 1143 transfers
Processed 500 players
Processed batch: 1008 transfers
Processed 600 players
Processed batch: 1211 transfers
Processed 700 players
Processed batch: 1186 transfers
Processed 800 players
Processed batch: 905 transfers
Processed 900 players
Processed batch: 1200 transfers
Processed 1000 players
Processed batch: 911 transfers
Processed 1100 players
Processed batch: 1146 transfers
Processed 1200 players
Processed batch: 1510 transfers
Processed 1300 players
Processed batch: 994 transfers
Processed 1400 players
Processed batch: 1631 transfers
Processed 1500 players
Processed batch: 1012 transfers
Processed 1600 players
Processed batch: 1204 transfers
Processed 1700 players
Processed batch: 1028 transfers
Processed 1800 players
Processed batch: 1309 tr

In [8]:
import csv

# CSV of known clubs, and the set of their ids filled by read_teams().
teams_file = 'teams.csv'
teams = set()


def read_teams():
    """Add the id column (index 1) of every data row in `teams_file` to `teams`.

    The first row is treated as a header and skipped. Prints how many distinct
    ids the set holds afterwards.
    """
    with open(teams_file, mode='r', newline='', encoding='utf-8') as handle:
        rows = csv.reader(handle)
        next(rows)
        teams.update(row[1] for row in rows)
    print(f'Loaded {len(teams)} teams')


def remove_unwanted_transfers():
    """Copy player_transfers.csv to filtered_transfers.csv, keeping known clubs only.

    A row is kept when its from-club (column 1) or its to-club (column 2) is in
    the module-level `teams` collection. When only one side is known, the other
    side's id is replaced with the placeholder '123'. Rows where neither side is
    known are dropped. Progress is printed every 1000 input rows.
    """
    with open('player_transfers.csv', mode='r', newline='', encoding='utf-8') as read_file:
        reader = csv.reader(read_file)
        next(reader)
        with open('filtered_transfers.csv', mode='w', newline='', encoding='utf-8') as write_file:
            writer = csv.writer(write_file)
            writer.writerow(['player_id', 'from_id', 'to_id', 'date'])
            seen = 0
            kept = 0
            for row in reader:
                from_known = row[1] in teams
                to_known = row[2] in teams
                if from_known or to_known:
                    # Mask whichever side is not a known club.
                    if not to_known:
                        row[2] = '123'
                    elif not from_known:
                        row[1] = '123'
                    writer.writerow(row)
                    kept += 1
                seen += 1
                if seen % 1000 == 0:
                    print(f'Current progress: {kept} / {seen}')

            print(f'{kept} / {seen} transfers kept')
            print('Processing complete.')

# Order matters: read_teams() fills the module-level `teams` set that
# remove_unwanted_transfers() consults for every transfer row.
read_teams()
remove_unwanted_transfers()
            

Loaded 294 teams
Current progress: 649 / 1000
Current progress: 1209 / 2000
Current progress: 1806 / 3000
Current progress: 2414 / 4000
Current progress: 2945 / 5000
Current progress: 3468 / 6000
Current progress: 4088 / 7000
Current progress: 4708 / 8000
Current progress: 5313 / 9000
Current progress: 5849 / 10000
Current progress: 6397 / 11000
Current progress: 6876 / 12000
Current progress: 7368 / 13000
Current progress: 7881 / 14000
Current progress: 8315 / 15000
Current progress: 8849 / 16000
Current progress: 9311 / 17000
Current progress: 9789 / 18000
Current progress: 10346 / 19000
Current progress: 10774 / 20000
Current progress: 11252 / 21000
Current progress: 11769 / 22000
Current progress: 12178 / 23000
Current progress: 12633 / 24000
Current progress: 13186 / 25000
Current progress: 13658 / 26000
Current progress: 14237 / 27000
Current progress: 14755 / 28000
Current progress: 15232 / 29000
Current progress: 15633 / 30000
Current progress: 16108 / 31000
Current progress: 1

In [17]:
import csv

# Input: filtered transfer rows. Output: one row per (player, club) spell.
filtered_transfers_file = 'filtered_transfers.csv'
final_transfers_file = 'final_transfers.csv'
NULL = 'NULL'  # written as to_date for the first row seen for each player

with open(file=filtered_transfers_file, mode='r', newline='', encoding='utf-8') as read_file:
    reader = csv.reader(read_file)
    next(reader)
    with open(file=final_transfers_file, mode='w', newline='', encoding='utf-8') as write_file:
        writer = csv.writer(write_file)
        writer.writerow(['player_id', 'team_id', 'from_date', 'to_date'])
        current_player = None
        old_date = None
        for row in reader:
            player_id, to_team_id, date = row[0], row[2], row[3]
            if player_id == current_player:
                # Same player as the previous row: this spell ends at the
                # previous row's date.
                to_date = old_date
            else:
                current_player = player_id
                to_date = NULL
            # '123' is the placeholder for a club outside the known set; such
            # rows only contribute their date to the next row.
            if to_team_id != '123':
                writer.writerow([current_player, to_team_id, date, to_date])
            old_date = date

print('Processing complete')

            

Processing complete
