In [1]:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import re

In [2]:
def get_teams(league_url, years, league_id, top_flight=1):
    '''Scrape the teams listed on a league overview page for the given seasons.

    Parameters
    ----------
    league_url : str
        Absolute URL of the league page. ``/plus/?saison_id=<year>`` is
        appended to it for every requested season.
    years : int, str or iterable
        A single season or an iterable of seasons. A lone int/str is wrapped
        in a list (a bare string would otherwise be iterated per character).
    league_id :
        Value written unchanged into the ``league_id`` column.
    top_flight : optional
        Value written into the ``top_flight`` column (defaults to 1, the
        value that used to be hard-coded).

    Returns
    -------
    pandas.DataFrame
        One row per team and season with the columns ``team_name``, ``href``,
        ``id``, ``year``, ``league_id`` and ``top_flight``. An empty frame is
        returned when no season produced any data.
    '''
    if isinstance(years, (int, str)):
        years = [years]

    season_frames = []
    for year in years:
        url = league_url + f"/plus/?saison_id={year}"

        r = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        # Close the HTTP response as soon as the document has been parsed.
        with urlopen(r) as html:
            bs = BeautifulSoup(html, 'html.parser')

        # Skip seasons without a team table instead of failing with an
        # AttributeError on None (mirrors the handling in get_players).
        table = bs.find('table', {'class': 'items'})
        if table is None:
            print(f"No data for  {url}")
            continue

        # Get the team names
        teams = {}
        for row in table.find_all('td', {"class": "hauptlink no-border-links"}):
            anchor = row.find('a')
            if anchor is None or not anchor.get('href'):
                continue  # cell without a team link
            team_name = row.text.strip().split(' \\')[0]
            team_href = anchor['href']
            team_id = team_href.split('/')[4]
            teams[team_name] = {'href': team_href, 'id': team_id}
        # TODO maybe also add the market value of the team

        # turn into df
        teams_df = pd.DataFrame.from_dict(teams, orient="index").reset_index(drop=False, names="team_name")
        teams_df["year"] = year
        teams_df["league_id"] = league_id
        teams_df["top_flight"] = top_flight
        season_frames.append(teams_df)

    if not season_frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(season_frames, axis=0)

In [7]:
def get_players(team_url):
    '''Scrape the squad table of a team page.

    Parameters
    ----------
    team_url : str
        Absolute URL of the team page.

    Returns
    -------
    pandas.DataFrame
        One row per player with ``player_id``, ``player_href`` and
        ``player_name``. ``Birthday`` and ``Number`` are added only when the
        number of scraped values equals the number of players, because they
        are matched to players purely by position. An empty frame is returned
        when the page has no squad table.
    '''
    r = Request(team_url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(r) as html:
        bs = BeautifulSoup(html, 'html.parser')

    table = bs.find('table', {'class': 'items'})
    if table is None:
        print(f"No data for  {team_url}")
        return pd.DataFrame()

    # Collect every link that points at a player profile; the trailing path
    # segment of such a link is the player id.
    profile_pattern = re.compile(r"/[^/]+/profil/spieler/\d+")
    players = {}
    for link in table.find_all('a', href=True):
        player_href = link["href"]
        if profile_pattern.search(player_href):
            player_id = player_href.split("/")[-1]
            players[player_id] = {"player_href": player_href, "player_name": link.text}

    # Centered cells carry both birthdays and shirt numbers; they are told
    # apart by text length only (three or more characters -> birthday).
    player_dates, player_numbers = [], []
    for cell in table.find_all('td', {"class": "zentriert"}):
        text = cell.get_text()
        if text == '':
            continue
        if len(text) >= 3:
            player_dates.append(text)
        else:
            player_numbers.append(text)

    # not ideal but add the dates and numbers based on their index position
    dates_match = len(players) == len(player_dates)
    numbers_match = len(players) == len(player_numbers)
    if dates_match:
        for info, birthday in zip(players.values(), player_dates):
            info["Birthday"] = birthday
    if numbers_match:
        for info, number in zip(players.values(), player_numbers):
            info["Number"] = number

    if not dates_match and not numbers_match:
        print(f"Not matching dates and numbers {team_url}")
    elif not numbers_match:
        print(f"Not matching numbers {team_url}")
    elif not dates_match:
        print(f"Not matching dates {team_url}")

    # Create a DataFrame from the dictionary
    return pd.DataFrame.from_dict(players, orient='index').reset_index(drop=False, names="player_id")

def get_player_info(url):
    '''Scrape detail information from a single player's profile page.

    Parameters
    ----------
    url : str
        Absolute URL of the player profile; its last path segment is stored
        as ``player_id``.

    Returns
    -------
    pandas.DataFrame
        A single-row frame. Always present: ``player_id``,
        ``transfer_hrefs``, ``transfer_years``, ``current_club``,
        ``transfer_club_ids`` and ``nationality`` (all but ``player_id`` are
        lists). ``current_mv``, ``max_mv``, ``main_position`` and
        ``other_positions`` are only present when found on the page.
    '''
    r = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(r) as html:
        bs = BeautifulSoup(html, 'html.parser')

    player_info = {"player_id": url.split("/")[-1]}

    grid = bs.find_all("div", {"class": "tm-player-transfer-history-grid"})

    # transfer history: one entry per club the player left
    hrefs, transfer_years, club_ids = [], [], []
    for entry in grid:
        old_club = entry.find("div", {"class": "tm-player-transfer-history-grid__old-club"})
        if old_club is None or "grid__heading" in old_club["class"]:
            continue  # no match, or the header row
        link = old_club.find("a", {"class": "tm-player-transfer-history-grid__club-link"})
        if link is None:
            link = old_club.find("a")  # fall back to any link in the cell
        href = link.get("href") if link is not None else None
        if not href:
            continue  # nothing to parse; skip instead of raising
        parts = href.split("/")
        if len(parts) < 3:
            continue  # href too short to contain a club id
        hrefs.append(href)
        transfer_years.append(parts[-1])
        club_ids.append(parts[-3])
    player_info["transfer_hrefs"] = hrefs
    player_info["transfer_years"] = transfer_years

    # current club id: taken from the first usable "new club" cell
    current_club_id = []
    for entry in grid:
        new_club = entry.find("div", {"class": "tm-player-transfer-history-grid__new-club"})
        if new_club is None or "grid__heading" in new_club["class"]:
            continue
        link = new_club.find("a")
        href = link.get("href") if link is not None else None
        if not href:
            continue
        parts = href.split("/")
        if len(parts) < 3:
            continue
        current_club_id.append(parts[-3])
        break  # stop after the first found element
    player_info["current_club"] = current_club_id
    player_info["transfer_club_ids"] = club_ids

    # current and maximum market value; if several elements match, the last
    # one wins, and the key stays absent when nothing matches
    for key, css_class in (
        ("current_mv", "tm-player-market-value-development__current-value"),
        ("max_mv", "tm-player-market-value-development__max-value"),
    ):
        matches = bs.find_all("div", {"class": css_class})
        if matches:
            player_info[key] = matches[-1].get_text()

    # position: the first entry is the main position, the rest are alternatives
    player_positions = [
        position.get_text()
        for position in bs.find_all("dd", {"class": "detail-position__position"})
    ]
    if len(player_positions) > 0:
        player_info["main_position"] = [player_positions[0]]
    if len(player_positions) > 1:
        player_info["other_positions"] = player_positions[1:]

    # nationality: flag images inside the info table, skipping images that
    # carry the "lazy" class and images without a title
    info_table = bs.find('div', {'class': 'info-table'})
    flags = info_table.find_all("img", {"class": "flaggenrahmen"}) if info_table is not None else []
    player_info["nationality"] = [
        flag["title"]
        for flag in flags
        if "lazy" not in flag["class"] and flag.get("title") is not None
    ]

    # Wrap every value in a list so list-valued fields land in a single cell.
    return pd.DataFrame({key: [value] for key, value in player_info.items()})

In [12]:
from tqdm import tqdm
def load_players_info_for_team(team_url, base_url):
    '''Combine the squad list of a team with each player's profile details.

    Parameters
    ----------
    team_url : str
        Absolute URL of the team page, passed to ``get_players``.
    base_url : str
        Prefix put in front of every relative ``player_href`` before it is
        passed to ``get_player_info``.

    Returns
    -------
    pandas.DataFrame
        The frame from ``get_players`` left-joined on ``player_id`` with the
        rows from ``get_player_info``. When ``get_players`` found no players
        its (empty) result is returned unchanged.
    '''
    df = get_players(team_url)
    # get_players returns an empty frame when the page has no squad table;
    # indexing "player_href" on it would raise a KeyError.
    if df.empty or "player_href" not in df.columns:
        return df

    # Fetch all profiles first and concatenate once instead of growing a
    # frame inside the loop.
    detail_frames = [get_player_info(base_url + href) for href in df["player_href"]]
    additional_df = pd.concat(detail_frames, axis=0)
    return pd.merge(df, additional_df, on="player_id", how="left")

def get_players_for_all_teams(df, base_url="https://www.transfermarkt.com"):
    '''Scrape the players of every team in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Team table with an ``href`` column holding relative team URLs.
    base_url : str, optional
        Prefix put in front of the relative team and player URLs. Defaults to
        the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        All players of all teams with a fresh index, keeping only the first
        row per ``player_id``. List-valued columns are left as lists.
    '''
    team_frames = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        # maybe dont include the additional info for players but only scrape the team site
        team_frames.append(load_players_info_for_team(base_url + row["href"], base_url))

    if not team_frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    players_df = pd.concat(team_frames, axis=0)
    if "Unnamed: 0" in players_df.columns:
        players_df = players_df.drop("Unnamed: 0", axis=1)

    # Nothing to deduplicate on when no team yielded any player.
    if "player_id" not in players_df.columns:
        return players_df.reset_index(drop=True)
    return players_df.drop_duplicates(subset="player_id").reset_index(drop=True)

In [34]:
# Access the teams for given leagues and years
# leagues = ["Premier League", 'Jupiler Pro League', "Bundesliga",'3. Liga', "Serie A", "La Liga", "Ligue 1", "Eredivisie",
#             'Championship', 'La Liga 2', 'Serie B', '2. Bundesliga', 'Ligue 2', 'Liga Portugal', 'Super Lig', 'Major League Soccer',
#             "Pro League", "Super League"]

# league_urls = {"/bundesliga/startseite/wettbewerb/L1":1, "/premier-league/startseite/wettbewerb/GB1":2,
#                 "/primera-division/startseite/wettbewerb/ES1":3, "/serie-a/startseite/wettbewerb/IT1":4,
#                 "/ligue-1/startseite/wettbewerb/FR1":5 }


# /liga-portugal/startseite/wettbewerb/PO1
# /super-lig/startseite/wettbewerb/TR1
# /jupiler-pro-league/startseite/wettbewerb/BE1
# /eredivisie/startseite/wettbewerb/NL1

# league_urls = {"/bundesliga/startseite/wettbewerb/L2":6, "/premier-league/startseite/wettbewerb/GB2":7,
#                "/primera-division/startseite/wettbewerb/ES2":8, "/serie-a/startseite/wettbewerb/IT2":9,
#                "/ligue-1/startseite/wettbewerb/FR2":10}

BASE_URL = "https://www.transfermarkt.com"
SEASON = 2023

# Relative league overview URL -> league id stored with every team row.
league_urls = {"/liga-portugal/startseite/wettbewerb/PO1": 11, "/super-lig/startseite/wettbewerb/TR1": 12,
               "/jupiler-pro-league/startseite/wettbewerb/BE1": 13, "/eredivisie/startseite/wettbewerb/NL1": 14,
               "/major-league-soccer/startseite/wettbewerb/MLS1":15, "/3-liga/startseite/wettbewerb/L3": 16,
               "/saudi-professional-league/startseite/wettbewerb/SA1": 17, "/chinese-super-league/startseite/wettbewerb/CSL": 18}

# Scrape the teams of every league, logging progress with a timestamp.
league_frames = []
n_leagues = len(league_urls)
for position, (league_url, league_id) in enumerate(league_urls.items(), start=1):
    print(f"Starting {position}/{n_leagues} - {datetime.datetime.now()}")
    league_frames.append(get_teams(BASE_URL + league_url, [SEASON], league_id))
    print(f"Finished {position}/{n_leagues} - {datetime.datetime.now()}")

# Each league used to be prepended to the running frame, so the last league
# comes first. Reversing keeps that row order, which decides which duplicate
# player row survives drop_duplicates further down the pipeline.
combined_df = pd.concat(league_frames[::-1], axis=0)

# access the players for given teams
df = combined_df.copy()
players_df = get_players_for_all_teams(df)

# Strip spaces and newlines from the market values. Columns are only cleaned
# when present, because the scraper omits values it could not find.
for mv_column in ("current_mv", "max_mv"):
    if mv_column in players_df.columns:
        players_df[mv_column] = players_df[mv_column].apply(lambda x: str(x).replace(" ", "").replace("\n", ""))

# Pull the "<Month> <day>, <year>" part out of the birthday text and parse it.
# The pattern is a raw string so \w and \d are not treated as string escapes.
if "Birthday" in players_df.columns:
    players_df["Birthday"] = pd.to_datetime(players_df["Birthday"].str.extract(r'(\w+ \d{1,2}, \d{4})')[0])

players_df.to_csv("players_df_others.csv", index=False)


Starting 1/8 - 2023-10-03 21:35:37.795657
Finished 1/8 - 2023-10-03 21:35:38.733653
Starting 2/8 - 2023-10-03 21:35:38.733653
Finished 2/8 - 2023-10-03 21:35:40.079178
Starting 3/8 - 2023-10-03 21:35:40.079178
Finished 3/8 - 2023-10-03 21:35:40.770520
Starting 4/8 - 2023-10-03 21:35:40.770520
Finished 4/8 - 2023-10-03 21:35:41.562399
Starting 5/8 - 2023-10-03 21:35:41.562399
Finished 5/8 - 2023-10-03 21:35:42.447632
Starting 6/8 - 2023-10-03 21:35:42.447632
Finished 6/8 - 2023-10-03 21:35:43.271527
Starting 7/8 - 2023-10-03 21:35:43.271527
Finished 7/8 - 2023-10-03 21:35:45.156582
Starting 8/8 - 2023-10-03 21:35:45.156582
Finished 8/8 - 2023-10-03 21:36:20.575016


  1%|▏         | 2/155 [04:08<5:16:39, 124.18s/it]


KeyboardInterrupt: 