In [88]:
import re
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Module-level constants shared by the helper functions and the scraping loop.
# Captures the digits that follow "/verein/" in a link that continues with "/saison_id/<digits>".
team_id_pattern = r"\/verein\/(\d+)\/saison_id\/\d+"
# Captures the digits that follow "/profil/spieler/" in a player profile link.
player_id_pattern = r"\/profil\/spieler\/(\d+)"
# Request headers sent with every HTTP call made by the scraping loop.
HEADERS = {'User-Agent': 'Mozilla/5.0'}

def cleaner(string:str)->str:
    """Normalise whitespace in a scraped text fragment.

    Non-breaking spaces and newlines are turned into plain spaces, every
    pair of consecutive spaces is then removed outright (not collapsed to
    one), and finally leading/trailing whitespace is stripped.
    """
    substitutions = (("\xa0", " "), ("\n", " "), ("  ", ""))
    for old, new in substitutions:
        string = string.replace(old, new)
    return string.strip()

def team_id_extractor(url_inp:str)->str:
    """Return the team id captured by `team_id_pattern` in `url_inp`.

    The id is returned as a string of digits; '0' is returned when the
    pattern is not found anywhere in the input.
    """
    found = re.search(team_id_pattern, url_inp)
    return found.group(1) if found else '0'

def player_id_extractor(url_inp:str)->str:
    """Return the player id captured by `player_id_pattern` in `url_inp`.

    The id is returned as a string of digits; '0' is returned when the
    pattern is not found anywhere in the input.
    """
    found = re.search(player_id_pattern, url_inp)
    if found is None:
        return '0'
    return found.group(1)

# Load the player profile URLs to scrape, one per line.
# Blank lines are skipped: an empty URL can never be fetched, so it would only
# burn through every retry and then be processed as a bogus player with id '0'.
with open('links.txt', 'r') as file:
    urls = [line.strip() for line in file if line.strip()]
    

# --- Outfield players (everyone who is not a goalkeeper) -------------------
# Columns of one scraped season/competition row, without the player id.
_outfield_fields = [
    'Season', 'Competition', 'Squad', 'Apperance', 'PPG', 'Goals', 'Assists',
    'Own_goal', 'Substitutions_on', 'Substitutions_off', 'Yellow_card',
    'Second_yellow_card', 'Red_card', 'Penalty_goals', 'Minutes_per_goal',
    'Minutes_played', 'Club', 'Team_id',
]
# Accumulated per-season rows; the player id is stored as the leading column.
Players_ex_gk_detailed_table = pd.DataFrame(columns=['Player_id', *_outfield_fields])
# Accumulated career totals: only the statistic columns ('Squad' .. 'Minutes_played').
total_apperance_info_players = pd.DataFrame(columns=_outfield_fields[2:-2])
# Layout used when building a frame from the scraped rows. Scraping may yield
# two empty cells per row (positions 1 and 3); they are named "temp1"/"temp2"
# here so the frame can be built, and are dropped again right afterwards.
columns_players_ex_gk_detailed = [
    _outfield_fields[0], 'temp1', _outfield_fields[1], 'temp2', *_outfield_fields[2:]
]

# --- Goalkeepers -----------------------------------------------------------
# Same structure as above, with the goalkeeper-specific statistic columns.
_gk_fields = [
    'Season', 'Competition', 'Squad', 'Apperance', 'PPG', 'Goals', 'Own_goal',
    'Substitutions_on', 'Substitutions_off', 'Yellow_card',
    'Second_yellow_card', 'Red_card', 'Goals_conceded', 'Clean_sheets',
    'Minutes_played', 'Club', 'Team_id',
]
Gk_detailed_table = pd.DataFrame(columns=['Player_id', *_gk_fields])
total_apperance_info_gks = pd.DataFrame(columns=_gk_fields[2:-2])
# Scrape-time layout with the two placeholder columns, as for outfield players.
columns_gk_detailed = [
    _gk_fields[0], 'temp1', _gk_fields[1], 'temp2', *_gk_fields[2:]
]

# Maximum number of attempts for each HTTP request.
max_retries = 5


# ---------------------------------------------------------------------------
# Scrape the detailed performance page of every player in `urls` and collect
# (a) the per-season rows and (b) the career-total row, kept separately for
# goalkeepers and for all other players.
# ---------------------------------------------------------------------------
_REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks forever
_RETRY_DELAY = 5       # seconds to wait between attempts (matches the printed message)


def _fetch_html(url):
    """Download `url` and return its text, or None once `max_retries` attempts failed.

    Connection errors, timeouts and HTTP error statuses all count as failed
    attempts, so an error page is never handed to the parser as if it were
    a player page.
    """
    for _ in range(max_retries):
        try:
            response = requests.get(url, headers=HEADERS, timeout=_REQUEST_TIMEOUT)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException:
            print("Error occurred while making a request. Retrying in 5 seconds...")
            time.sleep(_RETRY_DELAY)
    return None


def _player_position(soup):
    """Return the cleaned text of the 'Position:' label, or 'Non-GK' if there is none."""
    position = 'Non-GK'
    info_box = soup.find_all('div', class_='data-header__info-box')[0]
    for li in info_box.find_all('li', class_='data-header__label'):
        if 'Position:' in li.text:
            position = cleaner(li.find('span').text)
    return position


def _table_rows(table):
    """Turn every <tr> of `table` into a list: cell texts, then club name, then team id.

    Rows without a club link get the placeholders 0 (name) and 0 (id).
    """
    rows = []
    for tr in table.find_all("tr"):
        try:
            club_anchor = tr.find_all("td", class_="hauptlink no-border-rechts zentriert")[0].find("a")
            team_name = club_anchor.get('title')
            # A link without href is treated like a missing link.
            team_link = club_anchor.get('href') or '0'
        except (IndexError, AttributeError):
            # IndexError: the row has no club cell; AttributeError: the cell has no <a>.
            team_name = 0
            team_link = '0'

        match = re.search(team_id_pattern, team_link)
        team_id = match.group(1) if match else 0

        cells = [td.text.strip() for td in tr.find_all("td")]
        cells.append(team_name)
        cells.append(team_id)
        rows.append(cells)
    return rows


def _append_all(table, pieces):
    """Return `table` followed by all non-empty frames in `pieces`, concatenated once."""
    pieces = [piece for piece in pieces if not piece.empty]
    if not pieces:
        return table
    frames = pieces if table.empty else [table, *pieces]
    return pd.concat(frames, ignore_index=True)


# Results are gathered in plain lists and concatenated once after the loop:
# DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
gk_frames, gk_totals = [], []
outfield_frames, outfield_totals = [], []

for count, url in enumerate(urls):
    print(f"{count+1}/{len(urls)}", end="\r")
    player_id = player_id_extractor(url)

    # Build the URL of the detailed performance page from the profile URL.
    url = url.replace("profil", "leistungsdatendetails")+ "/saison//verein/0/liga/0/wettbewerb//pos/0/trainer_id/0/plus/1"

    html_content = _fetch_html(url)
    if html_content is None:
        # Never fall through with the previous player's HTML (or with none at all).
        print(f"Skipping player {player_id}: request failed after {max_retries} attempts.")
        continue

    soup = BeautifulSoup(html_content, "html.parser")
    is_goalkeeper = _player_position(soup) == 'Goalkeeper'

    # The performance data is read from the second <table> of the page.
    rows = _table_rows(soup.find_all("table")[1])
    layout = columns_gk_detailed if is_goalkeeper else columns_players_ex_gk_detailed
    df = pd.DataFrame(rows, columns=layout).drop(columns=['temp1', 'temp2'])

    # Row 0 is discarded as useless, row 1 carries the career totals and the
    # remaining rows are the per-season / per-competition records.
    total_record = df.iloc[1, 2:-2].to_dict()
    total_record['player_id'] = player_id

    season_rows = df.iloc[2:].copy()
    season_rows.insert(0, 'Player_id', player_id)

    if is_goalkeeper:
        gk_frames.append(season_rows)
        gk_totals.append(total_record)
    else:
        outfield_frames.append(season_rows)
        outfield_totals.append(total_record)

Gk_detailed_table = _append_all(Gk_detailed_table, gk_frames)
Players_ex_gk_detailed_table = _append_all(Players_ex_gk_detailed_table, outfield_frames)
if gk_totals:
    total_apperance_info_gks = _append_all(total_apperance_info_gks, [pd.DataFrame(gk_totals)])
if outfield_totals:
    total_apperance_info_players = _append_all(total_apperance_info_players, [pd.DataFrame(outfield_totals)])

2197/2197

In [89]:
# Write each scraped table to its own JSON file (a list of row objects,
# pretty-printed with a four-space indent).
_exports = (
    ('total_apperance_info_players.json', total_apperance_info_players),
    ('total_apperance_info_gks.json', total_apperance_info_gks),
    ('Gk_detailed_table.json', Gk_detailed_table),
    ('Players_ex_gk_detailed_table.json', Players_ex_gk_detailed_table),
)
for _path, _frame in _exports:
    _frame.to_json(_path, orient='records', indent=4)


In [90]:
# Gk_detailed_table
# Players_ex_gk_detailed_table
# total_apperance_info_gks
# total_apperance_info_players
# type(total_inf)

Unnamed: 0,Player_id,Season,Competition,Squad,Apperance,PPG,Goals,Assists,Own_goal,Substitutions_on,Substitutions_off,Yellow_card,Second_yellow_card,Red_card,Penalty_goals,Minutes_per_goal,Minutes_played,Club,Team_id
0,434207,22/23,Championship,5,5,1.60,2,-,-,1,3,1,-,-,-,170',340',Hull City,3008
1,434207,22/23,FA Cup,1,1,000,-,-,-,1,-,-,-,-,-,-,29',Hull City,3008
2,434207,22/23,Serie B,5,5,0.80,-,-,-,3,2,1,-,-,-,-,187',Venezia FC,607
3,434207,21/22,Championship,21,19,1.47,2,-,-,6,8,2,-,-,-,593',1.185',Middlesbrough FC,641
4,434207,21/22,FA Cup,2,2,1.50,-,-,-,1,1,-,-,-,-,-,88',Middlesbrough FC,641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68851,3455,02/03,Johan Cruijff Schaal,1,1,3.00,-,-,-,1,-,-,-,-,-,-,13',Ajax Amsterdam,610
68852,3455,01/02,Amstel Cup,3,3,3.00,1,1,-,1,2,-,-,-,-,153',153',Ajax Amsterdam,610
68853,3455,01/02,KPN Eredivisie,24,24,2.25,6,4,-,12,7,2,-,-,-,213',1.280',Ajax Amsterdam,610
68854,3455,01/02,UEFA Cup,4,4,1.75,2,-,-,1,2,-,-,-,-,136',272',Ajax Amsterdam,610
