In [13]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

In [3]:
# Per the original note, transfermarkt can only be accessed with these headers,
# so every request in this notebook sends a browser-like User-Agent.
BROWSER_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
headers = {'User-Agent': BROWSER_USER_AGENT}

### Helper function

In [7]:
def get_7th_cell_value(cell):
    """Return the status text stored in the status cell of a match row.

    According to the original author's notes the cell comes in at least three
    layouts, which are tried in this order:

    1. Normal game (played): the cell contains an ``<a>`` element and the
       string of that link is returned.
    2. Injury: the cell contains a ``<span>`` element and its ``title``
       attribute is returned.
    3. Normal game (benched) or suspension: the cell contains plain text,
       which is returned as-is.

    Parameters
    ----------
    cell : bs4.element.Tag
        The table cell to inspect. Any object exposing ``find(name)`` and
        ``string`` works, as long as the objects returned by ``find`` expose
        ``string`` and ``get(key)``.

    Returns
    -------
    str or None
        The status text, or ``None`` when none of the layouts matches.
    """
    # Look each element up once instead of searching the cell twice.
    link = cell.find('a')
    if link is not None:
        return link.string

    span = cell.find('span')
    if span is not None:
        # .get() instead of ['title']: a span without a title attribute must not
        # raise KeyError; fall through to the plain-text case instead.
        title = span.get('title')
        if title is not None:
            return title

    # Plain text, or None when the cell has no single string.
    return cell.string

# Workflow

Three types of transfermarkt pages are used: <br>
1. The league table for a season, which lists all the teams. https://www.transfermarkt.com/serie-a/startseite/wettbewerb/IT1/plus/?saison_id=2018
2. A team's page for a given season, which lists all of its players. https://www.transfermarkt.com/juventus-turin/startseite/verein/506/saison_id/2018
3. A player's performance page for Serie A in a given season, which has all the details. https://www.transfermarkt.com/dupa/leistungsdatendetails/spieler/5023/saison/2017/verein/0/liga/0/wettbewerb/IT1/pos/0/trainer_id/0/plus/1

In [None]:
start = time.time()

# Root of every transfermarkt URL; hrefs found in the pages are relative to it.
BASE_URL = 'https://www.transfermarkt.com'
# Seconds to wait on a single HTTP request. Without a timeout requests.get()
# can block forever on a stalled connection and freeze the whole scrape.
REQUEST_TIMEOUT = 30

# Error bookkeeping
error_no_tables = 0   # player pages without the expected performance table
error_extraction = 0  # single match rows that could not be parsed
error_page = 0        # player pages that could not be downloaded / had no name header
error_table = []      # one [player, team, season, error_type] entry per error

# One entry per successfully parsed match row
master_table = []


def _fetch_soup(url):
    """Download `url` with the notebook-level `headers` and return it as BeautifulSoup."""
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.content, 'html.parser')


#LOOP FOR SEASON -> get teams
for season in ['2015','2016','2017','2018','2019','2020']:
    print('SEASON: ', season)
    season_url = f'{BASE_URL}/serie-a/startseite/wettbewerb/IT1/plus/?saison_id={season}'
    soup = _fetch_soup(season_url)

    #Get link to team page for each team in Serie A in this season
    team_name_cells = soup.find_all('td', attrs={'class':'hauptlink no-border-links hide-for-small hide-for-pad'})

    #href only stores the path relative to the main url, so the base is added manually
    team_links = [BASE_URL + cell.find('a')['href'] for cell in team_name_cells]

    #LOOP FOR TEAM -> get players
    for team_url in team_links:
        soup = _fetch_soup(team_url)

        #Get team name
        team_name = soup.find('h1', attrs={'itemprop':'name'}).find('span').string

        #Entire table -> body (no header) -> list of players (still as html)
        main_table = soup.find('table', attrs={'class':'items'})
        body = main_table.find('tbody')
        player_name_cell = body.find_all('table', attrs={'class':'inline-table'})

        #Extract link for each player from html
        player_links = [
            player.find('td', attrs={'class':'hauptlink'}).find('a')['href']
            for player in player_name_cell
        ]

        #Extract player id from link (we need the id to access their individual page)
        player_ids = [pl.split('/')[-1] for pl in player_links]

        #LOOP FOR PLAYERS -> get end info (injuries, teams, etc.)
        for player_id in player_ids:
            player_url = f'{BASE_URL}/placeholder/leistungsdatendetails/spieler/{player_id}/saison/{season}/verein/0/liga/0/wettbewerb/IT1/pos/0/trainer_id/0/plus/1'

            try:
                soup = _fetch_soup(player_url)
                #player name
                player_name = soup.find('h1', attrs={'itemprop':'name'}).text
            except (requests.RequestException, AttributeError):
                #Network failure or a page without the name header: one bad page must not
                #abort the whole run. The name is unknown here, so the id is logged instead.
                print('^^^^^^^^ Page could not be loaded ^^^^^^^^')
                error_page += 1
                error_table.append([player_id, team_name, season, 'page_error'])
                continue
            print(player_name)

            try:
                #All tables on page -> main table -> body -> list of rows (with all info, still as html)
                tables = soup.find_all('div', attrs={'class':'responsive-table'})
                main_table = tables[1]
                body = main_table.find('tbody')
                rows = body.find_all('tr')
            except Exception:
                #`except Exception` (not a bare except) so that interrupting the kernel
                #still stops the scrape instead of being counted as a missing table
                print('^^^^^^^^ Guy has no tables ^^^^^^^^')
                error_no_tables += 1
                error_table.append([player_name, team_name, season, 'no_tables'])
                continue

            game_info = []
            for row in rows:
                try:
                    cells = row.find_all('td') #splits row into cells
                    match_day = cells[0].find('a').string
                    date      = cells[1].string
                    home_team = cells[3].find('a').string
                    away_team = cells[5].find('a').string
                    score     = cells[6].string.strip()
                    status    = get_7th_cell_value(cells[7]) #this cell holds injuries, suspensions, benched etc.
                                                             #but also position if the player played in game
                    game_info.append([player_name, team_name, match_day, date, home_team, away_team,score, status, season])
                except Exception:
                    #Best effort: skip rows that do not have the expected layout, but log them
                    error_extraction += 1
                    error_table.append([player_name, team_name, season, 'extraction_error'])
                    print('^^^^^^^^ Extraction error ^^^^^^^^')

            master_table.extend(game_info)


end = time.time()
print('\nProcessing time: ', end - start)
print('Missing table errors: ', error_no_tables)
print('Extraction errors: ', error_extraction)
print('Page errors: ', error_page)


# Export the data

In [55]:
# Turn the scraped match rows into a table; the column order mirrors the order
# in which the values were appended to each row of master_table.
df = pd.DataFrame(
    data=master_table,
    columns=[
        'player',
        'team',
        'matchweek',
        'match_date',
        'home_team',
        'away_team',
        'score',
        'status',
        'season',
    ],
)

# Same for the log of everything that could not be scraped.
errors = pd.DataFrame(
    data=error_table,
    columns=['player', 'team', 'season', 'error_type'],
)

In [65]:
# Show the first two rows as a quick sanity check of the columns
df.head(2)

Unnamed: 0,player,team,matchweek,match_date,home_team,away_team,score,status,season
0,Neto,Juventus FC,1,"Aug 23, 2015",Juventus,Udinese Calcio,0:1,on the bench,2015
1,Neto,Juventus FC,2,"Aug 30, 2015",AS Roma,Juventus,2:1,on the bench,2015


In [77]:
# Write both tables to CSV files in the working directory, without the index column.
for frame, filename in ((df, 'transfermarkt_data.csv'), (errors, 'errors_table.csv')):
    frame.to_csv(filename, index=False)