In [None]:
from bs4 import BeautifulSoup
import requests
import re
import csv

# Get all teams from the top 5 leagues and some extra leagues

# League overview pages scraped by the first driver loop below; the clubs
# found there are written to `player_teams_file`.
player_leagues = [
    'https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1',
    'https://www.transfermarkt.com/primera-division/startseite/wettbewerb/ES1',
    'https://www.transfermarkt.com/serie-a/startseite/wettbewerb/IT1',
    'https://www.transfermarkt.com/bundesliga/startseite/wettbewerb/L1',
    'https://www.transfermarkt.com/ligue-1/startseite/wettbewerb/FR1',
]

# Additional league overview pages scraped by the second driver loop below;
# their clubs only end up in `all_teams_file`.
extra_leagues = [
    'https://www.transfermarkt.com/liga-nos/startseite/wettbewerb/PO1',
    'https://www.transfermarkt.com/super-lig/startseite/wettbewerb/TR1',
    'https://www.transfermarkt.com/eredivisie/startseite/wettbewerb/NL1',
    'https://www.transfermarkt.com/jupiler-pro-league/startseite/wettbewerb/BE1',
    'https://www.transfermarkt.com/major-league-soccer/startseite/wettbewerb/MLS1',
    'https://www.transfermarkt.com/campeonato-brasileiro-serie-a/startseite/wettbewerb/BRA1',
    'https://www.transfermarkt.com/superliga/startseite/wettbewerb/AR1N',
    'https://www.transfermarkt.com/saudi-professional-league/startseite/wettbewerb/SA1'
]

# Browser-like User-Agent sent with every league-page request.
# NOTE(review): presumably needed because the site rejects the default
# client User-Agent -- confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Ids of clubs already recorded; used by get_teams() to skip duplicates.
team_ids = set()
# Output CSV holding only the clubs of `player_leagues`.
player_teams_file = 'player_teams.csv'
# Output CSV holding the clubs of `player_leagues` plus `extra_leagues`.
all_teams_file = 'teams.csv'
# [name, id, actual_name] rows collected by get_teams(); replayed into
# `all_teams_file` before the extra leagues are scraped.
teams = []


# One pattern serves both the anchor filter and the slug/id extraction.
_TEAM_HREF_RE = re.compile(r'/(?P<name>[^/]+)/startseite/verein/(?P<id>\d+)')


def get_teams(league_url, year):
    """Scrape one league season page and record every club not seen before.

    Each new club is added to the module-level ``team_ids`` set, appended to
    the module-level ``teams`` list and written as a row through the
    module-level ``writer`` (whichever CSV writer is bound when this runs).

    Args:
        league_url: League overview URL without the season suffix.
        year: Season id appended as the ``saison_id`` query parameter.

    Returns:
        The number of clubs newly recorded by this call.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            exceeds the timeout.
    """
    ext = f'/plus/?saison_id={year}'
    # Without a timeout a single stalled connection blocks the scrape forever.
    response = requests.get(league_url + ext, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    count = 0

    for tag in soup.find_all('a', href=_TEAM_HREF_RE):
        actual_name = tag.get('title')
        match = _TEAM_HREF_RE.search(tag.get('href'))
        if not match:
            continue
        name = match.group('name')
        team_id = match.group('id')
        # A page links each club several times; only the first link counts.
        if team_id in team_ids:
            continue
        team_ids.add(team_id)
        row = [name, team_id, actual_name]
        teams.append(row)
        # Write a separate list so the stored row is never shared with the writer.
        writer.writerow(list(row))
        count += 1
    return count

def _league_slug(league_url):
    """Return the league slug from a league URL, or the URL itself if none is found."""
    match = re.search(r'/([^/]+)/startseite', league_url)
    # Falling back to the URL avoids printing a stale or undefined name.
    return match.group(1) if match else league_url


def _scrape_leagues(leagues, years):
    """Call get_teams() for every league/season pair, printing progress."""
    for league in leagues:
        name = _league_slug(league)  # does not depend on the year
        for year in years:
            print(f'Getting teams for {name} in {year}')
            print(f'Found {get_teams(league, year)} teams')


# Pass 1: clubs of the player leagues, seasons 2020-2023.
with open(player_teams_file, mode='w', newline='', encoding='utf-8') as file:
    # get_teams() writes through this module-level `writer`.
    writer = csv.writer(file)
    writer.writerow(['name', 'id', 'actual_name'])
    _scrape_leagues(player_leagues, range(2020, 2024))

print('Getting player teams done')

# Pass 2: the full team list = clubs found in pass 1 + extra leagues (2023 only).
with open(all_teams_file, mode='w', newline='', encoding='utf-8') as file:
    # Rebinding `writer` redirects get_teams() output to this file.
    writer = csv.writer(file)
    writer.writerow(['name', 'id', 'actual_name'])

    for team in teams:
        writer.writerow(team)

    _scrape_leagues(extra_leagues, range(2023, 2024))

print('Getting all teams done')

In [None]:
from bs4 import BeautifulSoup
import requests
import re
import csv

# Get all players from the top 5 league teams

# Output CSV of scraped players (columns: name, id, actual_name).
player_file = 'players.csv'
# Input CSV of clubs to visit; column 0 is the slug and column 1 the id.
teams_file = 'player_teams.csv'
# Template for a club's squad page; the literal 'team_name' and 'id' segments
# are placeholders filled in per club by the driver loop below.
team_url = "https://www.transfermarkt.com/team_name/kader/verein/id"
# Ids of players already written; used by get_players() to skip duplicates.
player_ids = set()

# Browser-like User-Agent sent with every squad-page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# One pattern serves both the anchor filter and the slug/id extraction.
_PLAYER_HREF_RE = re.compile(r'/(?P<name>[^/]+)/profil/spieler/(?P<id>\d+)')


def get_players(team_url, year):
    """Scrape one club squad page and record every player not seen before.

    Each new player is added to the module-level ``player_ids`` set and
    written as a ``[name, id, actual_name]`` row through the module-level
    ``writer``.

    Args:
        team_url: Squad page URL without the season suffix.
        year: Season id appended as the ``saison_id`` query parameter.

    Returns:
        The number of players newly recorded by this call.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            exceeds the timeout.
    """
    ext = f'/plus/0/galerie/0?saison_id={year}'
    # Without a timeout a single stalled connection blocks the scrape forever.
    response = requests.get(team_url + ext, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    count = 0

    for tag in soup.find_all('a', href=_PLAYER_HREF_RE):
        match = _PLAYER_HREF_RE.search(tag.get('href'))
        actual_name = tag.text.strip()
        if not match:
            continue
        name = match.group('name')
        player_id = match.group('id')
        # Players appear in several seasons/clubs; keep the first sighting only.
        if player_id in player_ids:
            continue
        player_ids.add(player_id)
        writer.writerow([name, player_id, actual_name])
        count += 1
    return count

# Visit every club listed in `teams_file` for seasons 2010-2023 and write the
# players found to `player_file`.
with open(teams_file, mode='r', newline='', encoding='utf-8') as read_file:
    reader = csv.reader(read_file)
    next(reader)  # skip header
    with open(player_file, mode='w', newline='', encoding='utf-8') as write_file:
        # get_players() writes through this module-level `writer`.
        writer = csv.writer(write_file)
        writer.writerow(['name', 'id', 'actual_name'])
        for row in reader:
            team_name = row[0]
            team_id = row[1]
            # Fill the id placeholder first, anchored to '/verein/id'. A bare
            # .replace('id', ...) after the slug is inserted would also rewrite
            # "id" inside slugs such as "real-madrid".
            squad_url = (
                team_url
                .replace('/verein/id', f'/verein/{team_id}')
                .replace('team_name', team_name)
            )
            for year in range(2010, 2024):
                print(f'Getting players for {team_name} in {year}')
                # Computed outside the f-string: reusing the same quote type
                # inside an f-string is a SyntaxError before Python 3.12.
                found = get_players(squad_url, year)
                print(f'Found {found} players')



In [None]:
import csv
import aiohttp
import asyncio

# Check if a player is a known player

# Minimum peak market value, keyed by the year (as a string) in which the
# peak was reached. is_endpoint() compares the numeric part of an "...m"
# value against this and uses the '2010' entry for any year not listed.
# NOTE(review): values are presumably millions of the site's currency -- confirm.
value_threshold = {
    '2010': 30, '2011': 30, '2012': 35, '2013': 35, '2014': 40, '2015': 40,
    '2016': 45, '2017': 45, '2018': 50, '2019': 50, '2020': 55, '2021': 60,
    '2022': 65, '2023': 70, '2024': 75
}

# Player id -> name. These are always written to endpoints.csv by
# load_players_and_check_endpoints(), without a market-value check.
extra_players = {
    79422: 'Keylor Navas',
    29260: 'Giorgio Chiellini'
}

async def is_endpoint(session, player_id, player_name):
    """Tell whether a player's peak market value reaches the yearly threshold.

    Fetches the market-value JSON for ``player_id`` and reads its ``highest``
    and ``highest_date`` fields. A value that does not end in ``'m'`` is
    rejected outright. Otherwise the number between the first and last
    character is compared with ``value_threshold`` for the last four
    characters of the date, falling back to the ``'2010'`` entry.

    Any exception is printed together with ``player_name`` and treated as
    "not an endpoint".

    Returns:
        True if the peak value is at least the threshold, False otherwise.
    """
    graph_url = f'https://www.transfermarkt.com/ceapi/marketValueDevelopment/graph/{player_id}'
    try:
        async with session.get(graph_url) as response:
            payload = await response.json()
            peak_text = payload['highest']
            # Only values with an "m" suffix can qualify.
            if peak_text[-1:] != 'm':
                return False
            # Drop the leading symbol and the trailing "m".
            peak_value = float(peak_text[1:-1])
            peak_year = payload['highest_date'][-4:]
            threshold_key = peak_year if peak_year in value_threshold else '2010'
            return value_threshold[threshold_key] <= peak_value
    except Exception as e:
        print(f'Error processing {player_name} ({player_id}): {e}')
        return False

def chunk_list(data, size):
    """Yield consecutive slices of ``data`` holding at most ``size`` items each."""
    yield from (data[start:start + size] for start in range(0, len(data), size))

async def load_players_and_check_endpoints():
    """Write every player that passes is_endpoint() to endpoints.csv.

    Reads (id, actual_name) pairs from players.csv, checks them in batches
    of 100 concurrent requests, writes a ``[name, id]`` row for each player
    that qualifies, then appends every entry of ``extra_players``.
    Progress and totals are printed along the way.
    """
    async with aiohttp.ClientSession() as session:
        with open('players.csv', mode='r', newline='', encoding='utf-8') as file:
            reader = csv.reader(file)
            next(reader)  # Skip header
            players_to_process = [(row[1], row[2]) for row in reader]

        with open('endpoints.csv', mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['name', 'id'])
            count = 0
            batch_count = 0
            for batch in chunk_list(players_to_process, 100):
                # One task per player; the whole batch is awaited together.
                tasks = [
                    asyncio.create_task(is_endpoint(session, player_id, player_name))
                    for player_id, player_name in batch
                ]
                results = await asyncio.gather(*tasks)
                accepted = [
                    [player_name, player_id]
                    for passed, (player_id, player_name) in zip(results, batch)
                    if passed
                ]
                writer.writerows(accepted)
                count += len(accepted)
                batch_count += len(batch)
                print(f'Processed {batch_count} players, found {count} endpoints')

            # Hand-picked players are added unconditionally.
            for extra_id, extra_name in extra_players.items():
                writer.writerow([extra_name, extra_id])
                count += 1
                print(f'{extra_name} {extra_id} is an endpoint')

            print(f'Total endpoints found: {count}')
        print('Processing complete.')

# asyncio.get_event_loop() is deprecated when no loop is running, so detect a
# running loop explicitly instead.
try:
    loop = asyncio.get_running_loop()
except RuntimeError:
    loop = None

if loop is None:
    # Plain script: create a loop, run to completion, close it.
    asyncio.run(load_players_and_check_endpoints())
else:
    # Already inside a running loop (e.g. a notebook kernel): schedule the
    # coroutine there. `task` keeps a reference so it is not garbage-collected.
    task = loop.create_task(load_players_and_check_endpoints())


In [None]:
import csv
import aiohttp
import asyncio
import re

# CSV paths for this step. NOTE: process_players() below opens the same file
# names as string literals rather than through these variables.
players_file = 'players.csv'
player_transfers_file = 'player_transfers.csv'
# Browser-like User-Agent sent with every transfer-history request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

def chunk_list(data, size):
    """Split ``data`` into successive slices of length ``size`` (the last may be shorter)."""
    for offset in range(0, len(data), size):
        stop = offset + size
        yield data[offset:stop]

async def get_player_transfers(session, player_id, writer):
    """Fetch one player's transfer history and write the usable rows.

    For every entry under the ``transfers`` key of the JSON response, the
    club ids are extracted from the ``from`` and ``to`` hrefs. Entries where
    both hrefs match are written as ``[player_id, from_id, to_id, date]``.

    Any exception is printed and ends processing for this player; rows
    written before the failure stay written.

    Returns:
        The number of rows written for this player.
    """
    url = f'https://www.transfermarkt.com/ceapi/transferHistory/list/{player_id}'
    count = 0
    try:
        club_href = re.compile(r'/(?P<name>[^/]+)\/transfers/verein/(?P<id>\d+)/saison_id/\d+')
        async with session.get(url, headers=headers) as response:
            transfers = await response.json()
            for transfer in transfers['transfers']:
                from_match = club_href.search(transfer['from']['href'])
                to_match = club_href.search(transfer['to']['href'])
                # Skip entries where either side is not a club link.
                if not (from_match and to_match):
                    continue
                writer.writerow([
                    player_id,
                    from_match.group('id'),
                    to_match.group('id'),
                    transfer['date'],
                ])
                count += 1
    except Exception as e:
        print(f"Error fetching transfers for player {player_id}: {e}")
    return count


async def process_players():
    """Download the transfer history of every player in players.csv.

    Players are processed in batches of 100 concurrent requests; each
    request writes its rows to player_transfers.csv through the shared
    writer. Progress and the total number of transfers are printed.
    """
    async with aiohttp.ClientSession() as session:
        players_to_process = []
        with open('players.csv', mode='r', newline='', encoding='utf-8') as file:
            reader = csv.reader(file)
            next(reader)  # Skip header
            for row in reader:
                players_to_process.append((row[1], row[2]))

        with open('player_transfers.csv', mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['player_id', 'from_id', 'to_id', 'date'])
            total_transfers = 0
            batch_count = 0
            for batch in chunk_list(players_to_process, 100):
                tasks = [get_player_transfers(session, player_id, writer) for player_id, _ in batch]
                results = await asyncio.gather(*tasks)
                batch_transfers = sum(results)
                print(f"Processed batch: {batch_transfers} transfers")
                total_transfers += batch_transfers
                # Count the real batch size: the final batch is usually
                # shorter than 100, so adding a flat 100 over-reports.
                batch_count += len(batch)
                print(f'Processed {batch_count} players')

        print(f'Total transfers found: {total_transfers}')
        print('Processing complete.')

# asyncio.get_event_loop() is deprecated when no loop is running, so detect a
# running loop explicitly instead.
try:
    loop = asyncio.get_running_loop()
except RuntimeError:
    loop = None

if loop is None:
    # Plain script: create a loop, run to completion, close it.
    asyncio.run(process_players())
else:
    # Already inside a running loop (e.g. a notebook kernel): schedule the
    # coroutine there. `task` keeps a reference so it is not garbage-collected.
    task = loop.create_task(process_players())


In [11]:
import csv
from datetime import datetime

# CSV of all known clubs; read_teams() takes the id from column 1.
teams_file = 'teams.csv'
# Ids (as strings) of the known clubs, filled by read_teams().
teams = set()

def read_teams():
    """Add the id column of ``teams_file`` to the module-level ``teams`` set."""
    with open(teams_file, mode='r', newline='', encoding='utf-8') as file:
        rows = csv.reader(file)
        next(rows)  # skip header
        teams.update(row[1] for row in rows)
    print(f'Loaded {len(teams)} teams')


def remove_unwanted_transfers():
    """Copy transfers that touch at least one known club to filtered_transfers.csv.

    Rows with an empty date are skipped and not counted. A row is kept when
    its from-club or to-club id is in ``teams``; the side that is not known
    is overwritten with the placeholder id ``'123'``. Progress is printed
    every 1000 counted rows, followed by a final summary.
    """
    with open('player_transfers.csv', mode='r', newline='', encoding='utf-8') as read_file:
        reader = csv.reader(read_file)
        next(reader)  # skip header
        with open('filtered_transfers.csv', mode='w', newline='', encoding='utf-8') as write_file:
            writer = csv.writer(write_file)
            writer.writerow(['player_id', 'from_id', 'to_id', 'date'])
            total_transfers = 0
            filtered_transfers = 0
            for row in reader:
                if row[3] == '':
                    continue
                from_known = row[1] in teams
                to_known = row[2] in teams
                if from_known or to_known:
                    # At most one side is unknown here; mask it.
                    if not to_known:
                        row[2] = '123'
                    if not from_known:
                        row[1] = '123'
                    writer.writerow(row)
                    filtered_transfers += 1
                total_transfers += 1
                if total_transfers % 1000 == 0:
                    print(f'Current progress: {filtered_transfers} / {total_transfers}')

            print(f'{filtered_transfers} / {total_transfers} transfers kept')
            print('Processing complete.')

# Load the known club ids first: remove_unwanted_transfers() tests membership
# in the `teams` set that read_teams() fills.
read_teams()
remove_unwanted_transfers()
            

Loaded 294 teams
Current progress: 633 / 1000
Current progress: 1213 / 2000
Current progress: 1804 / 3000
Current progress: 2385 / 4000
Current progress: 2962 / 5000
Current progress: 3468 / 6000
Current progress: 4114 / 7000
Current progress: 4688 / 8000
Current progress: 5295 / 9000
Current progress: 5850 / 10000
Current progress: 6382 / 11000
Current progress: 6872 / 12000
Current progress: 7364 / 13000
Current progress: 7857 / 14000
Current progress: 8359 / 15000
Current progress: 8849 / 16000
Current progress: 9282 / 17000
Current progress: 9784 / 18000
Current progress: 10327 / 19000
Current progress: 10762 / 20000
Current progress: 11255 / 21000
Current progress: 11753 / 22000
Current progress: 12193 / 23000
Current progress: 12626 / 24000
Current progress: 13162 / 25000
Current progress: 13664 / 26000
Current progress: 14208 / 27000
Current progress: 14709 / 28000
Current progress: 15236 / 29000
Current progress: 15652 / 30000
Current progress: 16100 / 31000
Current progress: 1

In [16]:
import csv
from datetime import datetime

# Input: transfers restricted to known clubs (player_id, from_id, to_id, date).
filtered_transfers_file = 'filtered_transfers.csv'
# Output: one row per club stint (player_id, team_id, from_date, to_date).
final_transfers_file = 'final_transfers.csv'
# Marker written as to_date for the first row seen for each player.
NULL = 'NULL'
def fix_dates(date):
    """Snap any May-August date to 'Jul 1, <year>'; return other dates unchanged.

    The month is taken from the first three characters and the year from
    the last four characters of ``date``.
    """
    if date[:3] in ('May', 'Jun', 'Jul', 'Aug'):
        return f'Jul 1, {date[-4:]}'
    return date

# Turn the per-transfer rows into per-stint rows: each stint runs from the
# date of the transfer into a club up to the date of the previously read row.
# NOTE(review): this assumes each player's rows are contiguous and ordered
# newest-first -- confirm against the transfer export.
_date_format = '%b %d, %Y'
_cutoff = datetime.strptime('Jun 1, 2024', _date_format)

with open(file=filtered_transfers_file, mode='r', newline='', encoding='utf-8') as read_file:
    reader = csv.reader(read_file)
    next(reader)  # skip header
    with open(file=final_transfers_file, mode='w', newline='', encoding='utf-8') as write_file:
        writer = csv.writer(write_file)
        writer.writerow(['player_id', 'team_id', 'from_date', 'to_date'])
        current_player = None
        old_date = None
        for row in reader:
            player_id, to_team_id = row[0], row[2]
            date = fix_dates(row[3])
            # '123' is the placeholder for a club outside the known set.
            joins_known_team = to_team_id != '123'
            if player_id != current_player:
                # First row for this player: an open-ended stint, kept only
                # if it started before the cutoff.
                current_player = player_id
                if joins_known_team and datetime.strptime(date, _date_format) < _cutoff:
                    writer.writerow([current_player, to_team_id, date, NULL])
            elif joins_known_team and old_date != date:
                # Later rows: the stint ends at the previous row's date;
                # same-date rows are dropped.
                writer.writerow([current_player, to_team_id, date, old_date])
            old_date = date

print('Processing complete')

            

Processing complete


In [17]:
import csv

# Keep only the players that still have at least one row in the final
# transfers file, then swap the pruned list in place of players.csv.
old_players_file = 'players.csv'
new_players_file = 'new_players.csv'
transfers_file = 'final_transfers.csv'
player_ids = set()

with open(file=transfers_file, mode='r', newline='', encoding='utf-8') as read_file:
    reader = csv.reader(read_file)
    next(reader)  # skip header
    player_ids.update(row[0] for row in reader)

with open(file=old_players_file, mode='r', newline='', encoding='utf-8') as read_file:
    reader = csv.reader(read_file)
    next(reader)  # skip header
    with open(file=new_players_file, mode='w', newline='', encoding='utf-8') as write_file:
        writer = csv.writer(write_file)
        writer.writerow(['name', 'id', 'actual_name'])
        writer.writerows(row for row in reader if row[1] in player_ids)

# replace old players file with new players file
import os
# os.replace overwrites the destination in one step. A separate remove() then
# rename() would lose players.csv if the rename failed.
os.replace(new_players_file, old_players_file)

In [19]:
import networkx as nx
import csv
from datetime import datetime

# Input CSVs for the graph-building step.
player_file = 'players.csv'
transfers_file = 'final_transfers.csv'
# player id -> actual_name, loaded from `player_file`.
players = {}
# player id -> list of (club_id, from_date, to_date) tuples; to_date is the
# string 'NULL' when the transfers file has no end date for the stint.
transfers = {}
# club id -> list of player ids with a stint at that club.
clubs = {}

# Undirected graph whose nodes are player ids; find_edges() adds an edge
# between two players whose stints at the same club overlap.
G = nx.Graph()

def find_edges(player_id):
    """Connect ``player_id`` in ``G`` to every player whose club stint overlaps.

    For each stint of the player, every other player listed for the same
    club is checked. An edge is added as soon as one of that player's stints
    at the club overlaps in time, i.e. either stint starts while the other
    is running. An end date of ``'NULL'`` means the stint has no end.
    The number of connected players is printed.
    """
    def _starts_during(start, other_start, other_end):
        # True when `start` falls inside [other_start, other_end).
        return start >= other_start and (other_end == 'NULL' or start < other_end)

    linked = set()
    for club_id, from_date, to_date in transfers[player_id]:
        for teammate in clubs[club_id]:
            # Skip players already connected and the player itself.
            if teammate in linked or teammate == player_id:
                continue
            for other_club_id, other_from, other_to in transfers[teammate]:
                if other_club_id != club_id:
                    continue
                if (_starts_during(from_date, other_from, other_to)
                        or _starts_during(other_from, from_date, to_date)):
                    G.add_edge(player_id, teammate)
                    linked.add(teammate)
                    break
    print(f'Found {len(linked)} edges for player {player_id}')


DATE_FORMAT = "%b %d, %Y"


def _parse_date(text):
    """Parse a date string from the transfers file into a date object."""
    return datetime.strptime(text, DATE_FORMAT).date()


def _query_or_none(query, source, target):
    """Run a networkx shortest-path query on G; return None if no result exists.

    A missing path or a missing node is reported and skipped, so a single
    unreachable endpoint does not abort the whole all-pairs computation.
    """
    try:
        return query(G, source=source, target=target)
    except (nx.NetworkXNoPath, nx.NodeNotFound) as exc:
        print(f'No path between {source} and {target}: {exc}')
        return None


# Load players: one graph node per player id.
with open(player_file, mode='r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader)  # skip header
    for row in reader:
        players[row[1]] = row[2]
        G.add_node(row[1])

# Load stints, indexed both by player and by club.
with open(transfers_file, mode='r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader)  # skip header
    for row in reader:
        from_date = _parse_date(row[2])
        # 'NULL' is kept as a string; find_edges() treats it as "no end date".
        to_date = 'NULL' if row[3] == 'NULL' else _parse_date(row[3])
        transfers.setdefault(row[0], []).append((row[1], from_date, to_date))
        clubs.setdefault(row[1], []).append(row[0])

for player_id in players:
    find_edges(player_id)

print(f'Found {G.number_of_edges()} edges for {G.number_of_nodes()} nodes')

endpoints = []
with open('endpoints.csv', mode='r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader)  # skip header
    endpoints.extend(row[1] for row in reader)

# Pairwise separation between endpoints (0 on the diagonal, None if unreachable).
degrees = []

for endpoint in endpoints:
    degrees_for_endpoint = []
    for endpoint2 in endpoints:
        if endpoint == endpoint2:
            degrees_for_endpoint.append(0)
            continue
        print(f'Calculating shortest path between {endpoint} and {endpoint2}')
        degrees_for_endpoint.append(
            _query_or_none(nx.shortest_path_length, endpoint, endpoint2)
        )
    degrees.append(degrees_for_endpoint)

print(degrees)

# Pairwise shortest paths, expressed as player names (None if unreachable).
paths = []
for endpoint in endpoints:
    paths_for_endpoint = []
    for endpoint2 in endpoints:
        if endpoint == endpoint2:
            path = [endpoint]
        else:
            print(f'Calculating shortest path between {endpoint} and {endpoint2}')
            path = _query_or_none(nx.shortest_path, endpoint, endpoint2)
        if path is None:
            paths_for_endpoint.append(None)
        else:
            # Fall back to the raw id for an endpoint missing from players.csv.
            paths_for_endpoint.append([players.get(node, node) for node in path])
    paths.append(paths_for_endpoint)

print('Done')

for path in paths:
    print(path)






Found 238 edges for player 40204
Found 164 edges for player 3146
Found 193 edges for player 3190
Found 121 edges for player 52267
Found 151 edges for player 9594
Found 165 edges for player 3202
Found 219 edges for player 26485
Found 179 edges for player 4241
Found 81 edges for player 28810
Found 158 edges for player 88262
Found 84 edges for player 61632
Found 70 edges for player 114093
Found 229 edges for player 74810
Found 259 edges for player 46156
Found 193 edges for player 3682
Found 186 edges for player 20007
Found 211 edges for player 32617
Found 138 edges for player 58579
Found 58 edges for player 99452
Found 310 edges for player 111433
Found 64 edges for player 121416
Found 40 edges for player 121404
Found 235 edges for player 4672
Found 223 edges for player 3291
Found 114 edges for player 45274
Found 135 edges for player 3183
Found 149 edges for player 13091
Found 255 edges for player 3333
Found 65 edges for player 121406
Found 58 edges for player 121408
Found 121 edges for pl