In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re


# Initialize: URL regex patterns and HTTP headers used by the helpers and the scraping loop below.
# Captures the digits that follow "/verein/" in a path that continues with "/saison_id/<digits>".
team_id_pattern = r"\/verein\/(\d+)\/saison_id\/\d+"
# Captures the digits that follow "/profil/spieler/".
player_id_pattern = r"\/profil\/spieler\/(\d+)"
# Headers passed to requests.get in the scraping loop.
# NOTE(review): presumably a browser-like User-Agent is required for the site to respond - confirm.
HEADERS = {'User-Agent': 'Mozilla/5.0'}

def cleaner(string:str)->str:
    """Tidy up a scraped text fragment.

    Values that are not plain ``str`` objects are handed back untouched.
    For strings, the substitutions below are applied one after another, in
    the listed order (the order matters: characters removed before the
    double-space step can create a double space that is then deleted, while
    characters removed after it cannot), and the result is stripped of
    leading/trailing whitespace.
    """
    if type(string) is not str:
        return string

    substitutions = (
        ("\xa0", " "),     # non-breaking space -> regular space
        ('\u00e9', ''),    # drop e-acute
        ('\u011f', ''),    # drop g-breve
        ("\n", " "),       # newline -> space
        ("  ", ""),        # delete runs of two spaces outright
        ('\u00e1', ''),    # drop a-acute
        (":", ""),         # drop colons
        ('\u20ac', ''),    # drop the euro sign
    )
    cleaned = string
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned.strip()

def team_id_extractor(url_inp:str)->str:
    """Pull the club id out of a URL.

    Searches ``url_inp`` with the module-level ``team_id_pattern`` and
    returns its first capture group; returns the string '0' when the
    pattern does not occur.
    """
    found = re.search(team_id_pattern, url_inp)
    return found.group(1) if found else '0'

def player_id_extractor(url_inp:str)->str:
    """Pull the player id out of a profile URL.

    Searches ``url_inp`` with the module-level ``player_id_pattern`` and
    returns its first capture group; returns the string '0' when the
    pattern does not occur.
    """
    found = re.search(player_id_pattern, url_inp)
    if found is None:
        return '0'
    return found.group(1)

# Player profile URLs to scrape, one per line. Blank lines are skipped so an
# empty string is never passed to requests.get.
with open('links.txt', 'r') as file:
    urls = [line.strip() for line in file if line.strip()]

# Column order of the output; each scraped row is built in exactly this order.
titles = ['Player_id', 'Season','Date','Left','Joined','MV','Fee','origin_id','destination_id']

# CSS class strings of the six text cells in one transfer-history grid row,
# in the same order as the 'Season'..'Fee' columns of `titles`.
class_list = ["grid__cell grid__cell--center tm-player-transfer-history-grid__season",
              'grid__cell grid__cell--center tm-player-transfer-history-grid__date',
              "grid__cell grid__cell--center tm-player-transfer-history-grid__old-club",
              "grid__cell grid__cell--center tm-player-transfer-history-grid__new-club",
              "grid__cell grid__cell--center tm-player-transfer-history-grid__market-value",
              "grid__cell grid__cell--center tm-player-transfer-history-grid__fee"]

CLUB_LINK_CLASS = 'tm-player-transfer-history-grid__club-link'
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks the whole run


def _parse_transfer_rows(soup, player_id):
    """Extract the transfer rows of one player from a parsed transfers page.

    Returns a list of rows (each a list ordered like `titles`), or None when
    the page does not contain the "box viewport-tracking" container at all.
    A text cell that is missing from a row is stored as None; a missing or
    non-matching club link is stored as the id '0'.
    """
    table = soup.find('div', class_="box viewport-tracking")
    if table is None:
        return None

    rows = []
    for grid_row in table.find_all('div', class_="grid tm-player-transfer-history-grid"):
        cells = [player_id]
        for cell_class in class_list:
            cell = grid_row.find('div', class_=cell_class)
            cells.append(cleaner(cell.text) if cell is not None else None)

        # First link is the club left (origin), second the club joined (destination).
        club_links = grid_row.find_all('a', class_=CLUB_LINK_CLASS)
        for position in (0, 1):
            href = club_links[position].get('href') if position < len(club_links) else None
            cells.append(team_id_extractor(href) if href else '0')
        rows.append(cells)
    return rows


# Rows are accumulated in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and growing a frame inside a
# loop is quadratic.
all_rows = []
skipped_urls = []  # (url, reason) pairs for pages that could not be scraped
for count, url in enumerate(urls):
    print(f"{count+1}/{len(urls)}", end="\r")
    player_id = player_id_extractor(url)
    url = url.replace("profil", "transfers")
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Record the failure and move on, so one bad page does not discard
        # everything collected so far.
        skipped_urls.append((url, repr(exc)))
        continue

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")
    player_rows = _parse_transfer_rows(soup, player_id)
    if player_rows is None:
        skipped_urls.append((url, "transfer history box not found"))
        continue
    all_rows.extend(player_rows)

players_transfers = pd.DataFrame(all_rows, columns=titles)
players_transfers.to_json('players_transfers.json', orient='records', indent=4)

if skipped_urls:
    # Leading newline: the progress line above ends with "\r", not "\n".
    print(f"\nSkipped {len(skipped_urls)} of {len(urls)} URLs:")
    for skipped_url, reason in skipped_urls:
        print(f"  {skipped_url}: {reason}")

2197/2197

In [5]:
# Reload the scraped records from disk.
data = pd.read_json('players_transfers.json')
df = data  # read_json already returns a DataFrame; `df` is kept as a name for the same frame

# Clean every cell. Series.map per column is used instead of DataFrame.applymap,
# which is deprecated since pandas 2.1; cleaner() leaves non-string values as they are.
players_transfers_final = df.apply(lambda column: column.map(cleaner))

# Keep only rows whose Season is among the 2nd..8th distinct Season values,
# counted in order of first appearance in the data.
# NOTE(review): this selection is positional, not by explicit season label -
# confirm which seasons are intended.
kept_seasons = players_transfers_final['Season'].unique()[1:8]
players_transfers_final = players_transfers_final[players_transfers_final['Season'].isin(kept_seasons)]

# NOTE(review): this overwrites the file read at the top of the cell, so running
# the cell a second time applies the positional season filter to already-filtered
# data. Re-run the scraping cell first if this cell needs to be repeated.
players_transfers_final.to_json('players_transfers.json', orient='records', indent=4)

In [6]:
# Last expression of the cell: the filtered transfers frame is shown as the cell output.
players_transfers_final

Unnamed: 0,Player_id,Season,Date,Left,Joined,MV,Fee,origin_id,destination_id
1,427605,21/22,"Aug 26, 2021",Hatayspor,Al-Arabi SC,8.50m,4.25m,7775,1230
2,427605,20/21,"Aug 5, 2020",G. Bordeaux,Hatayspor,200k,free transfer,40,7775
3,427605,19/20,"Mar 11, 2020",Feirense,G. Bordeaux,200k,End of loan,3349,40
4,427605,19/20,"Jul 6, 2019",G. Bordeaux,Feirense,150k,loan transfer,40,3349
5,427605,18/19,"Jun 30, 2019",Tours FC,G. Bordeaux,150k,End of loan,3161,40
...,...,...,...,...,...,...,...,...,...
88279,199268,15/16,"Jan 28, 2016",RB Leipzig,FSV Frankfurt,600k,loan transfer,23826,293
88286,663519,20/21,"Sep 1, 2020",Juventus U17,Juventus U19,-,-,49567,11008
88287,663519,19/20,"Jul 1, 2019",Juventus Youth,Juventus U17,-,-,5232,49567
88288,663519,18/19,"Mar 28, 2019",Halads Yth.,Juventus Youth,-,free transfer,55730,5232
