# Data collection of soccer players

## Libraries

In [14]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
#from tqdm import tqdm
from tqdm.notebook import trange, tqdm
import time 
import csv
import re

##Functions

### id

In [15]:
def player_id(link):
  try:
    player_id = (str(link).split('/'))[-1]
    return player_id
  except:
    print(link+" player id not found!")
    return np.nan

### name and lastname

In [16]:
def player_name (page_content,link):
  try:
    #get the name tag
    full_name = page_content.find('h1',class_='data-header__headline-wrapper').get_text()
    #Remove extra space
    full_name = (" ".join(full_name.strip().split()))
    #Separating name and last name
    full_name = re.sub(r'#\d+','', full_name).strip()
    name = full_name.split(' ')[0]
    last_name = full_name.split(' ')[-1]

    return full_name
    #return name, last_name
    #print(full_name,'\n'+name ,'\n'+last_name)
  
  except:
    print(link+" player name not found!")
    return np.nan

### Place of birth

In [17]:
def place_of_birth(page_content, link):
  try:
    place_of_birth = page_content.find(['span'], string=['Place of birth:']).find_next('img')['title']
    return place_of_birth

  except:
    print(link+" player place of birth not found!")
    return np.nan

### Date of birth (age)

In [18]:
# date of birth
def date_of_birth(page_content, link):
  try:
    date_of_birth = page_content.find(['span'], string=['Date of birth:']).find_next('span').text
    date_of_birth = date_of_birth.strip().replace(',','').split(' ')
    month = date_of_birth[0]
    day = date_of_birth[1]
    year = date_of_birth[2]
    #print(year, '\n'+month, '\n'+day)
    return date_of_birth
  
  except:
    print(link+" player date of birth not found!")
    return np.nan

In [19]:
# age
def player_age(page_content, link):
  try:
    age = page_content.find(['span'], string=['Age:']).find_next('span').text
    #print(age)
    return age
  
  except:
    print(link+" player age of birth not found!")
    return np.nan

### height

In [20]:
def player_height (page_content, link):
  try:
    height = page_content.find(['span'], string=['Height:']).find_next('span').text
    #cleaning the text and cast it to int
    height = int(height.replace(',','').replace('m','').strip())
    #print(height)
    return height
  
  except:
    print(link+" player height not found!")
    return np.nan

### citizenship

In [21]:
def player_citizenship (page_content, link):
  try:
    citizenship = page_content.find(['span'], string=['Citizenship:']).find_next('span').text #all citizenships
    #print(citizanship.strip())
    citizenship = citizenship.strip().split('\xa0')
    return citizenship[0].strip() # first citizenship
  
  except:
    print(link+" player citizenship not found!")
    return np.nan

### position

In [22]:
def player_position (page_content, link):
  try:
    position = page_content.find(['span'], string=['Position:']).find_next('span').text
    #cleaning the text 
    position = position.replace('-',' ')
    position = (" ".join(position.strip().split()))
    #print(position)
    return position
  
  except:
    print(link+" player position not found!")
    return np.nan

### foot

In [23]:
def player_foot (page_content, link):
  try:
    foot = page_content.find(['span'], string=['Foot:']).find_next('span').text
    #print(foot.strip())
    return foot
  
  except:
    print(link+" player foot not found!")
    return np.nan

### player agent

In [24]:
def player_agent (page_content, link):
  try:
    player_agent = page_content.find(['span'], string=['Player agent:']).find_next('span').text
    #print(player_agent.strip())
    return player_agent.strip()
  
  except:
    print(link+" player agent not found!")
    return np.nan

### current club

In [25]:
def current_club(page_content, link):
  try:
    club_name = page_content.find('div', class_='data-header__club-info').find('span', class_='data-header__club').text
    #club_name = page.find('div', class_='data-header__club-info').find('span', class_='data-header__club').find('a')['title'] full name of club
    return club_name.strip()
  except:
    print(link+" current club not found!")
    return np.nan

### Detailed performance data

#### cast the table in website to pandas data frame to get more information

In [26]:
def detail_page_request(player_id):
  detail_link_format = 'https://www.transfermarkt.com/ruben-dias/leistungsdatendetails/spieler/{}/saison//verein/0/liga/0/wettbewerb//pos/0/trainer_id/0/plus/1'
  #print (detail_link_format.format(id))
  detail_page=''
  try:
    detail_page = requests.get(detail_link_format.format(player_id), headers=headers)
    detail_page = BeautifulSoup(detail_page.content, 'html.parser') 
    return detail_page 
  except:
    print(detail_link_format.format(player_id)+" player detailed performance link not found!")
    return np.nan



In [11]:
def create_player_detial_dataframe (id): # id is for returning an eeror 
  detail_page = detail_page_request(id)
  detail_link_format = 'https://www.transfermarkt.com/ruben-dias/leistungsdatendetails/spieler/{}/saison//verein/0/liga/0/wettbewerb//pos/0/trainer_id/0/plus/1'
  # find table in html
  try:
    table = detail_page.find('table', class_='items')
  except:
    print(detail_link_format.format(id)+" detail table not found!")

  # find column names
  column_values=['player_id']
  try:
    header = detail_page.find('table', class_='items').find('thead').find('tr').find_all('th')
    for th in header:
      if th['id'] == 'yw1_c2':
        continue
      try:
        column_values.append(th.find('a').find('span')['title'])
      except:
        column_values.append(th.text)
    #print(column_values)
  except:
    print(detail_link_format.format(id)+" table column_value found!")

  # creat data frame 
  detail_table= pd.DataFrame(columns = column_values)

  # find table rows and add to dataframe 
  try:
    rows = detail_page.find('table', class_='items').find('tbody').find_all('tr')
    for row in rows:
      table_row = []
      table_row.append(id)
      for section in row.find_all('td'):
        if section.text !='':
          table_row.append(section.text)
        else:
          try:
            table_row.append(section.a['title'])
          except:
            continue
      detail_table.loc[len(detail_table.index)]= table_row  
  except:
    print(detail_link_format.format(id)+" table row not found!")
  
  seasons = ['21/22', '20/21', '19/20', '18/19', '17/18', '16/17', '15/16']
  detail_2015_2021 = detail_table[detail_table['Season'].isin(seasons)] 

  return detail_2015_2021

  #find total row and add to dataframe
  '''
  total = []
  try:
    total_row = detail_page.find('table', class_='items').find('tfoot').find('tr').find_all('td')
    for row in total_row:
      try: 
        row['class']
        total.append(row.text)
      except:
        continue
  except:
    print(detail_link_format.format(id)+" total row not found!")
  detail_table.loc[len(detail_table.index)]= total
  '''
  #return detail_table

In [27]:
def detial_total_row(id):
  detail_page = detail_page_request(id)
  detail_link_format = 'https://www.transfermarkt.com/ruben-dias/leistungsdatendetails/spieler/{}/saison//verein/0/liga/0/wettbewerb//pos/0/trainer_id/0/plus/1'
  total = []
  try:
    total_row = detail_page.find('table', class_='items').find('tfoot').find('tr').find_all('td')
    for row in total_row:
      try: 
        row['class']
        total.append(row.text)
      except:
        continue
  except:
    print(detail_link_format.format(id)+" total row not found!")
  return total


In [21]:
print(detial_total_row('17965'))

https://www.transfermarkt.com/ruben-dias/leistungsdatendetails/spieler/17965/saison//verein/0/liga/0/wettbewerb//pos/0/trainer_id/0/plus/1 player detailed performance link not found!
https://www.transfermarkt.com/ruben-dias/leistungsdatendetails/spieler/17965/saison//verein/0/liga/0/wettbewerb//pos/0/trainer_id/0/plus/1 total row not found!
[]


#### goals scored

In [28]:
def goals_scored (id):
  detail_link_format = 'https://www.transfermarkt.com/ruben-dias/leistungsdatendetails/spieler/{}/saison//verein/0/liga/0/wettbewerb//pos/0/trainer_id/0/plus/1'
  try:
    return detial_total_row(id)[6]
  except:
    print(detail_link_format.format(id)+" goals scored not found!")
    return 'nan'


#### goals assisted

In [29]:
def goals_assisted (id):
  detail_link_format = 'https://www.transfermarkt.com/ruben-dias/leistungsdatendetails/spieler/{}/saison//verein/0/liga/0/wettbewerb//pos/0/trainer_id/0/plus/1'
  try:
    return detial_total_row(id)[7]
  except:
    print(detail_link_format.format(id)+" goals assisted not found!")
    return 'nan'

#### total appearance

In [30]:
def total_appearance (id):
  detail_link_format = 'https://www.transfermarkt.com/ruben-dias/leistungsdatendetails/spieler/{}/saison//verein/0/liga/0/wettbewerb//pos/0/trainer_id/0/plus/1'
  try:
    return detial_total_row(id)[4]
  except:
    print(detail_link_format.format(id)+" total appearance not found!")
    return 'nan'

#### goals_conceded

In [31]:
def goals_conceded (id):
  detail_link_format = 'https://www.transfermarkt.com/ruben-dias/leistungsdatendetails/spieler/{}/saison//verein/0/liga/0/wettbewerb//pos/0/trainer_id/0/plus/1'
  try:
    return detial_total_row(id)[13]
  except:
    print(detail_link_format.format(id)+" goals conceded not found!")
    return 'nan'

#### clean_sheets

In [32]:
def clean_sheets (id):
  detail_link_format = 'https://www.transfermarkt.com/ruben-dias/leistungsdatendetails/spieler/{}/saison//verein/0/liga/0/wettbewerb//pos/0/trainer_id/0/plus/1'
  try:
    return detial_total_row(id)[14]
  except:
    print(detail_link_format.format(id)+" clean sheats not found!")
    return 'nan'

#### yello cards

In [None]:
#print(total[11])

#### second yellow cards

In [None]:
#print (total[12])

#### red cards

In [None]:
#print (total[13])

### transfer history

In [44]:
# creating a dataframe with the transfer history table 
def player_transfer_history_dataframe(page_content, link):
  try:
    html_transfer_table = page_content.find('div', class_='grid tm-player-transfer-history-grid tm-player-transfer-history-grid tm-player-transfer-history-grid--heading')
  except:
    print(link+" transfer table not found!")

  # create columns
  transfer_table_column= []
  transfer_table_column = html_transfer_table.text.split()
  transfer_table_column.append('player_id')
  #print(transfer_table_column)

  #create rows
  transfer_table_rows = []
  try:
    html_transfer_table_rows = page_content.find_all('div', class_='grid tm-player-transfer-history-grid')
    for row in html_transfer_table_rows:
      row= row.text
      #delete extra spaces
      row = re.sub(' +', '', row)
      #divide columns by \n
      row = row.split('\n')
      #delete empty strings from list
      row = list(filter(None, row))
      row.append(player_id(link))
      
      #
      transfer_table_rows.append(row)
  except:
    print(link+" transfer table rows not found!")
  
  transfer_table = pd.DataFrame(transfer_table_rows, columns =transfer_table_column)
  seasons = ['21/22', '20/21', '19/20', '18/19', '17/18', '16/17', '15/16']
  transfer_2015_2021 = transfer_table[transfer_table['Season'].isin(seasons)] 
  return transfer_2015_2021

##Crawler header

In [33]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (HTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Accept-Language' : 'en-US,en;q=0.9'
}

## Player information

In [34]:
def player_info(page,link):
  info = []
  if player_position(page, link) == 'Goalkeeper':
    info.append(player_id(link))
    info.append(player_name(page, link))
    info.append(date_of_birth(page, link))
    info.append(player_height(page, link))
    info.append(player_citizenship(page, link))
    info.append(player_position(page, link))
    info.append(player_foot(page, link))
    info.append(current_club(page, link))
    info.append(goals_scored(player_id(link)))
    info.append(goals_assisted(player_id(link)))
    info.append(goals_conceded(player_id(link)))
    info.append(clean_sheets(player_id(link)))
    info.append(total_appearance(player_id(link)))
    info.append(player_agent(page, link))
  
  else: 
    info.append(player_id(link))
    info.append(player_name(page, link))
    info.append(date_of_birth(page, link))
    info.append(player_height(page, link))
    info.append(player_citizenship(page, link))
    info.append(player_position(page, link))
    info.append(player_foot(page, link))
    info.append(current_club(page, link))
    info.append(goals_scored(player_id(link)))
    info.append(goals_assisted(player_id(link)))
    info.append('nan')
    info.append('nan')
    info.append(total_appearance(player_id(link)))
    info.append(player_agent(page, link))
    
  return info

###Creating tables related to players

In [35]:
# player table
#player, transfer
def find_player_info(link):
  page = requests.get(link, headers=headers)
  time.sleep(2)
  page = BeautifulSoup(page.content, 'html.parser')
  #print(page.prettify)
  player = []
  player_club = []
  player_statistics= pd.DataFrame()
  transfer = pd.DataFrame()

  #player table
  player = player_info(page, link)
  
  #transfer table
  

  return player, player_transfer_history_dataframe(page, link) , create_player_detial_dataframe(player_id(link))


##The process of finding the link of the players' page

###1.Finding leagues of each season

In [36]:
# finding leagues of each season
def league_links (link):
  page = requests.get(link, headers=headers)
  page = BeautifulSoup(page.content, 'html.parser')
  #page.prettify
  leagues_link = []
  #filter leagues and cups that we want
  #leagues={'First Tier', 'Second Tier', 'Third Tier', 'Fourth Tier', 'Fifth Tier'}
  leagues={'First Tier'}
  tr_tags= page.find('table', class_= 'items').find('tbody').find_all('tr')
  flag = 2
  #for tr in tr_tags:

  for i, tr in enumerate(tr_tags):  
    #if i >1: 
      #break
    if(tr.find('td',class_='extrarow bg_blau_20 hauptlink') and tr.find('td',class_='extrarow bg_blau_20 hauptlink').text not in leagues):
      #print(tr.find('td',class_='extrarow bg_blau_20 hauptlink').text)
      flag =0
    elif (tr.find('td',class_='extrarow bg_blau_20 hauptlink') and tr.find('td',class_='extrarow bg_blau_20 hauptlink').text in leagues):
      flag = 1
    #if(tr.find('td', class_='hauptlink').find('tbody')):
    if(flag):
      try:
        lg_link = tr.find('td', class_='hauptlink').find('a')['href']
        #print(lg_link)
        leagues_link.append('https://www.transfermarkt.com'+lg_link)
      except:
        continue
  return leagues_link

###2.Finding teams of each league

In [37]:
#find teams of each league 
def team_links(link): #link of the league teams page
  page = requests.get(link, headers=headers)
  page = BeautifulSoup(page.content, 'html.parser')
  try:
    team_tag = page.find('table', class_='items').find('tbody').find_all('tr')
  except:
    print(link+ ' team table not found!')
  team_link =[]
  for i, tr in enumerate(team_tag):
    #if i>2:
     #break
    try:
      team_link.append('https://www.transfermarkt.com'+tr.find('td').find_next('td').find('a')['href'])
    except:
      print(link+ ' team link not found')
      team_link.append(np.nan)
  
  return team_link

###3.finding players of each team

In [38]:
# find players link by team link
def player_links(link): 
  page = requests.get(link, headers=headers)
  time.sleep(2)
  page = BeautifulSoup(page.content, 'html.parser')

  players_link=[]
  player_tag = page.find('table', class_='items').find('tbody').find_all('tr')
  for tr in player_tag:
    #if len(players_link)>9:
      #break
    try:
      player_link= tr.find('table', class_='inline-table').find('td', class_='hauptlink').find('div', class_='di nowrap').find('a')['href']
      players_link.append('https://www.transfermarkt.com'+ player_link)
    except:
      #print('player link not found')
      continue
    #print(player_name)
  
  return players_link


##Crawling main code

In [None]:
country_season_link_format = 'https://www.transfermarkt.com/wettbewerbe/national/wettbewerbe/{}/saison_id/{}/plus/1'
country_id = {'189', '50', '157', '40', '75'} #England, France, Spain, Germany, Italy
#country_id = {'189'} #England, France
season = {'2021', '2020', '2019', '2018', '2017', '2016', '2015'}

#link variables
all_season_leagues=[]
all_league_teams={}
all_team_players={}

#test variables
player_ids=[]
player_count=0

#final tables culomns & Each player's table
player_table_culomns = ['player_id', 'name', 'birth_date', 'height', 'current_international', 'main_position', 'foot', 'current_club', 'goals_scored', 'goals_assisted', 'goals_conceded', 'clean_sheets', 'total_appearence', 'agent']
transfer_table_culomns = ['player_id', 'Season', 'Date', 'Left', 'Joined', 'MV', 'Fee']

player = pd.DataFrame(columns = player_table_culomns)
transfer = pd.DataFrame(columns = transfer_table_culomns)
statistics = pd.DataFrame()

#final tables (all datas from players)
player_table = pd.DataFrame(columns = player_table_culomns)
transfer_table = pd.DataFrame(columns = transfer_table_culomns)
statistics_table = pd.DataFrame()


for c_id in tqdm(country_id, desc ="countries"):
  
  for i, s_id in enumerate(season):
    link = country_season_link_format.format(c_id, s_id)
    
    all_season_leagues=(league_links(link))
    #print('----- leagues links',all_season_leagues)
    
    for league_link in tqdm(all_season_leagues, desc='leagues'):
      #print(league_link)
      all_league_teams=(team_links(league_link))
      
      for team_link in tqdm(all_league_teams, desc='teams'):  
       
        all_team_players=(player_links(team_link))
        for player_link in (all_team_players):
        
          try:
            if player_id(player_link) not in player_ids:
              player_count+=1
              #print ('new id added','   ',player_count,'  ', player_id(player_link))
              player_ids.append(player_id(player_link))
              #player, transfer = find_player_info(player_link)
              player, transfer, statistics = find_player_info(player_link)
              
              player_table.loc[len(player_table.index)]= player
              transfer_table = pd.concat([transfer_table, transfer], axis=0,  ignore_index = True)
              statistics_table = pd.concat([statistics_table, statistics], axis=0,  ignore_index = True)

            else:
              print(player_link + ' duplicate id found and skiped')
            continue
          except:
            print(player_link + ' something wrong and skiped')           
            continue
          


player_table.to_csv('player_table.csv', encoding='utf-8')
transfer_table.to_csv('trasnfer_table.csv', encoding='utf-8')
statistics_table.to_csv('statistics_table.csv', encoding='utf-8')
#df.to_csv(file_name, sep='\t', encoding='utf-8')


countries:   0%|          | 0/5 [00:00<?, ?it/s]

leagues:   0%|          | 0/1 [00:00<?, ?it/s]

teams:   0%|          | 0/20 [00:00<?, ?it/s]

https://www.transfermarkt.com/andrea-barzagli/profil/spieler/7109 player agent not found!
https://www.transfermarkt.com/alex-sandro/profil/spieler/79960 player agent not found!
https://www.transfermarkt.com/patrice-evra/profil/spieler/5285 player agent not found!
https://www.transfermarkt.com/paolo-de-ceglie/profil/spieler/44709 player agent not found!
https://www.transfermarkt.com/stefano-sturaro/profil/spieler/167859 player agent not found!
https://www.transfermarkt.com/ruben-dias/leistungsdatendetails/spieler/93128/saison//verein/0/liga/0/wettbewerb//pos/0/trainer_id/0/plus/1 total row not found!
https://www.transfermarkt.com/ruben-dias/leistungsdatendetails/spieler/93128/saison//verein/0/liga/0/wettbewerb//pos/0/trainer_id/0/plus/1 goals assisted not found!
https://www.transfermarkt.com/ruben-dias/leistungsdatendetails/spieler/93128/saison//verein/0/liga/0/wettbewerb//pos/0/trainer_id/0/plus/1 total row not found!
https://www.transfermarkt.com/ruben-dias/leistungsdatendetails/spiel

leagues:   0%|          | 0/1 [00:00<?, ?it/s]

teams:   0%|          | 0/20 [00:00<?, ?it/s]

https://www.transfermarkt.com/wojciech-szczesny/profil/spieler/44058 duplicate id found and skiped
https://www.transfermarkt.com/emil-audero/profil/spieler/256339 duplicate id found and skiped
https://www.transfermarkt.com/mattia-perin/profil/spieler/110923 duplicate id found and skiped
https://www.transfermarkt.com/mattia-del-favero/profil/spieler/265079 duplicate id found and skiped
https://www.transfermarkt.com/leonardo-bonucci/profil/spieler/39983 duplicate id found and skiped
https://www.transfermarkt.com/daniele-rugani/profil/spieler/162959 duplicate id found and skiped
https://www.transfermarkt.com/medhi-benatia/profil/spieler/45124 duplicate id found and skiped
https://www.transfermarkt.com/giorgio-chiellini/profil/spieler/29260 duplicate id found and skiped
https://www.transfermarkt.com/andrea-barzagli/profil/spieler/7109 duplicate id found and skiped
https://www.transfermarkt.com/luca-marrone/profil/spieler/124769 duplicate id found and skiped
https://www.transfermarkt.com/lu

# Test area

In [39]:
def test (id): # id is for returning an eeror 
  detail_page = detail_page_request(id)
  detail_link_format = 'https://www.transfermarkt.com/ruben-dias/leistungsdatendetails/spieler/{}/saison//verein/0/liga/0/wettbewerb//pos/0/trainer_id/0/plus/1'
  # find table in html
  try:
    table = detail_page.find('table', class_='items')
  except:
    print(detail_link_format.format(id)+" detail table not found!")

  # find column names
  column_values=['player_id']
  try:
    header = detail_page.find('table', class_='items').find('thead').find('tr').find_all('th')
    for th in header:
      if th['id'] == 'yw1_c2':
        continue
      try:
        column_values.append(th.find('a').find('span')['title'])
      except:
        column_values.append(th.text)
    #print(column_values)
  except:
    print(detail_link_format.format(id)+" table column_value found!")

  # creat data frame 
  detail_table= pd.DataFrame(columns = column_values)

  # find table rows and add to dataframe 
  try:
    rows = detail_page.find('table', class_='items').find('tbody').find_all('tr')
    for row in rows:
      table_row = []
      table_row.append(id)
      for section in row.find_all('td'):
        if section.text !='':
          table_row.append(section.text)
        else:
          try:
            table_row.append(section.a['title'])
          except:
            continue
      detail_table.loc[len(detail_table.index)]= table_row  
  except:
    print(detail_link_format.format(id)+" table row not found!")

  return detail_table

In [61]:
id = ['357885', '120629']
table = pd.DataFrame()
for i in id:
  table = pd.concat([table, test(i)], axis=0)
table.head
table.to_csv('stat_test1.csv', encoding='utf-8')

## Creting Test .csv files 

In [40]:
# find the first two league of each season for test
def first_two_league (link):
  page = requests.get(link, headers=headers)
  page = BeautifulSoup(page.content, 'html.parser')
  #page.prettify
  leagues_link = []
  leagues={'First Tier'}
  tr_tags= page.find('table', class_= 'items').find('tbody').find_all('tr')
  flag = 2
  #for tr in tr_tags:

  for i, tr in enumerate(tr_tags):  
    #if i >1: 
      #break
    if(tr.find('td',class_='extrarow bg_blau_20 hauptlink') and tr.find('td',class_='extrarow bg_blau_20 hauptlink').text not in leagues):
      #print(tr.find('td',class_='extrarow bg_blau_20 hauptlink').text)
      flag =0
    elif (tr.find('td',class_='extrarow bg_blau_20 hauptlink') and tr.find('td',class_='extrarow bg_blau_20 hauptlink').text in leagues):
      flag = 1
    #if(tr.find('td', class_='hauptlink').find('tbody')):
    if(flag):
      try:
        lg_link = tr.find('td', class_='hauptlink').find('a')['href']
        #print(lg_link)
        leagues_link.append('https://www.transfermarkt.com'+lg_link)
      except:
        continue
  return leagues_link

In [41]:
#find the first three team of each league for test
def first_three_team(link): #link of the league teams page
  page = requests.get(link, headers=headers)
  page = BeautifulSoup(page.content, 'html.parser')
  try:
    team_tag = page.find('table', class_='items').find('tbody').find_all('tr')
  except:
    print(link+ ' team table not found!')
  team_link =[]
  for i, tr in enumerate(team_tag):
    if i>2:
      break

    try:
      team_link.append('https://www.transfermarkt.com'+tr.find('td').find_next('td').find('a')['href'])
    except:
      print(link+ ' team link not found')
      team_link.append(np.nan)
  
  return team_link

In [42]:
# find first 10 players link by team link
def first_ten_players(link): 
  page = requests.get(link, headers=headers)
  time.sleep(2)
  page = BeautifulSoup(page.content, 'html.parser')

  players_link=[]
  player_tag = page.find('table', class_='items').find('tbody').find_all('tr')
  for tr in player_tag:
    if len(players_link)>9:
      break
    try:
      player_link= tr.find('table', class_='inline-table').find('td', class_='hauptlink').find('div', class_='di nowrap').find('a')['href']
      players_link.append('https://www.transfermarkt.com'+ player_link)
    except:
      #print('player link not found')
      continue
    #print(player_name)
  
  return players_link


In [43]:
country_season_link_format = 'https://www.transfermarkt.com/wettbewerbe/national/wettbewerbe/{}/saison_id/{}/plus/1'
#country_id = {'189', '50', '157', '40', '75'} #England, France, Spain, Germany, Italy
country_id = {'189'} #England, France
season = {'2021'}

#link variables
all_season_leagues=[]
all_league_teams={}
all_team_players={}

#test variables
player_ids=[]
player_count=0

#final tables culomns & Each player's table
player_table_culomns = ['player_id', 'name', 'birth_date', 'height', 'current_international', 'main_position', 'foot', 'current_club', 'goals_scored', 'goals_assisted', 'goals_conceded', 'clean_sheets', 'total_appearence', 'agent']
transfer_table_culomns = ['player_id', 'Season', 'Date', 'Left', 'Joined', 'MV', 'Fee']

player = pd.DataFrame(columns = player_table_culomns)
transfer = pd.DataFrame(columns = transfer_table_culomns)
statistics= pd.DataFrame()

#final tables (all datas from players)
player_table = pd.DataFrame(columns = player_table_culomns)
transfer_table = pd.DataFrame(columns = transfer_table_culomns)
statistics_table= pd.DataFrame()


for c_id in tqdm(country_id, desc ="countries"):
  
  for i, s_id in enumerate(season):
    link = country_season_link_format.format(c_id, s_id)
    
    all_season_leagues=(first_two_league(link))
    #print('----- leagues links',all_season_leagues)
    
    for league_link in tqdm(all_season_leagues, desc='leagues'):
      #print(league_link)
      all_league_teams=(first_three_team(league_link))
      
      for team_link in tqdm(all_league_teams, desc='teams'):  
       
        all_team_players=(first_ten_players(team_link))
        for player_link in (all_team_players):
        
          try:
            if player_id(player_link) not in player_ids:
              player_count+=1
              #print ('new id added','   ',player_count,'  ', player_id(player_link))
              player_ids.append(player_id(player_link))
              player, transfer, statistics = find_player_info(player_link)
              
              player_table.loc[len(player_table.index)]= player
              transfer_table = pd.concat([transfer_table, transfer], axis=0)
              statistics_table= pd.concat([statistics_table, statistics], axis=0)
            else:
              print(player_link + ' duplicate id found and skiped')
            continue
          except:
            print(player_link + ' something wrong and skiped')           
            continue
          


player_table.to_csv('player_table.csv', encoding='utf-8')
transfer_table.to_csv('trasnfer_table.csv', encoding='utf-8')
statistics_table.to_csv('statistics_table V2.csv', encoding='utf-8')
#df.to_csv(file_name, sep='\t', encoding='utf-8')


countries:   0%|          | 0/1 [00:00<?, ?it/s]

leagues:   0%|          | 0/1 [00:00<?, ?it/s]

teams:   0%|          | 0/3 [00:00<?, ?it/s]

https://www.transfermarkt.com/cieran-slicker/profil/spieler/621997 player height not found!
https://www.transfermarkt.com/cieran-slicker/profil/spieler/621997 player foot not found!
https://www.transfermarkt.com/aymeric-laporte/profil/spieler/176553 player agent not found!
https://www.transfermarkt.com/cj-egan-riley/profil/spieler/581669 player agent not found!
https://www.transfermarkt.com/marcelo-pitaluga/profil/spieler/662334 player agent not found!
https://www.transfermarkt.com/ruben-dias/leistungsdatendetails/spieler/494284/saison//verein/0/liga/0/wettbewerb//pos/0/trainer_id/0/plus/1 table row not found!
https://www.transfermarkt.com/kurt-zouma/profil/spieler/157509 player agent not found!


## test area for test tables