# Crawling Whoscored Web Site
- https://www.whoscored.com/

In [1]:
# import package
import pandas as pd
import time
from selenium import webdriver

In [2]:
api_delay_term = 5

In [3]:
def str_to_zero(point):
    return 0 if point == "-" else point

In [4]:
# cawling league & team id
# parameter : team id
# return : league teams & team id
def crawling_league_teams(team_id):
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    driver =  webdriver.PhantomJS()
    driver.get(url)
    time.sleep(api_delay_term)
    team_names = driver.find_elements_by_css_selector("#teams option")
    
    df = pd.DataFrame(columns=["team_id","name"])
    
    for team_name in team_names:
        name = team_name.text 
        team_id = team_name.get_attribute("value").split("/")[2]
        tmp_dict = {"team_id":team_id, "name":name }
        df.loc[len(df)] = tmp_dict
    return df

# function test code
league_teams_id = crawling_league_teams(52)
league_teams_id.head()

# 13 : Arsenal : Premier League
# 87 : Juventus : Serie A
# 52 : Real Madrid : La Liga

Unnamed: 0,team_id,name
0,53,Athletic Bilbao
1,63,Atletico Madrid
2,65,Barcelona
3,62,Celta Vigo
4,59,Deportivo La Coruna


In [33]:
# team players summary list crawling
# parameter : team id
# return : pandas DataFrame : player summary features
def crawling_players_summary(team_id):
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    driver =  webdriver.PhantomJS()
    driver.get(url)
    time.sleep(api_delay_term)
    elements = driver.find_elements_by_css_selector("#player-table-statistics-body tr")
    
    df = pd.DataFrame(columns=[
            'player_number', 'flag', 'name', 'age', 'position', 'tall', 'weight', 'full_time', 'half_time', 'mins', 
            'goals', 'asists', 'yel', 'red', 'spg', 'ps', 'motm', 'aw', 'rating',
        ])
    
    for element in elements:

        games = element.find_elements_by_css_selector("td")[5].text
        games = games.split("(")
        full_time, half_time = games[0], 0
        if len(games) > 1 :
            half_time = games[1].replace(")","")
        else :
            half_time = 0

        tmp_dict = { 
            "player_number": element.find_elements_by_css_selector("td")[2].find_elements_by_css_selector("a")[0].get_attribute("href").split("/")[4], 
            "flag": element.find_elements_by_css_selector("td")[1].find_elements_by_css_selector("span")[0].get_attribute("class").split("-")[2],
            "name": element.find_elements_by_css_selector("td")[2].find_elements_by_css_selector("a")[0].text, 
            "age": element.find_elements_by_css_selector("td")[2].find_elements_by_css_selector("span")[0].text, 
            "position": element.find_elements_by_css_selector("td")[2].find_elements_by_css_selector("span")[1].text[1:], 
            "tall": element.find_elements_by_css_selector("td")[3].text,
            "weight": element.find_elements_by_css_selector("td")[4].text, 
            "full_time": full_time,
            "half_time": half_time,
            "mins": str_to_zero(element.find_elements_by_css_selector("td")[6].text),
            "goals": str_to_zero(element.find_elements_by_css_selector("td")[7].text),
            "asists": str_to_zero(element.find_elements_by_css_selector("td")[8].text),
            "yel": str_to_zero(element.find_elements_by_css_selector("td")[9].text),
            "red": str_to_zero(element.find_elements_by_css_selector("td")[10].text),
            "spg": str_to_zero(element.find_elements_by_css_selector("td")[11].text),
            "ps": str_to_zero(element.find_elements_by_css_selector("td")[12].text),
            "aw": str_to_zero(element.find_elements_by_css_selector("td")[13].text),
            "motm": str_to_zero(element.find_elements_by_css_selector("td")[14].text),
            "rating": str_to_zero(element.find_elements_by_css_selector("td")[15].text),
        }
        df.loc[len(df)] = tmp_dict
    return df

# function test code
# 52 : Real Madrid
players_summary_df = crawling_players_summary(52)
players_summary_df.head()

Unnamed: 0,player_number,flag,name,age,position,tall,weight,full_time,half_time,mins,goals,asists,yel,red,spg,ps,motm,aw,rating
0,13812,gb,Gareth Bale,26,"D(C),M(CLR),FW",183,74,21,2,1741,19,10,2,0,3.5,79.9,5,0.9,8.12
1,5583,pt,Cristiano Ronaldo,31,"M(CL),FW",185,80,36,0,3185,35,11,3,0,6.3,79.4,8,1.6,7.99
2,14296,fr,Karim Benzema,28,"M(CLR),FW",187,79,26,1,1994,24,7,1,0,3.6,81.5,3,0.4,7.65
3,20241,br,Marcelo,28,"D(CL),M(C)",174,75,28,2,2474,2,3,2,0,0.6,84.3,2,0.2,7.44
4,20874,hr,Luka Modric,30,"M(CR),FW",174,65,31,1,2629,2,4,5,0,1.1,90.9,6,0.3,7.37


In [34]:
# team players defensive list crawling
# parameter : team id
# return : pandas DataFrame : player defensive features
def crawling_player_defensive(team_id):
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    driver = webdriver.PhantomJS()
    driver.get(url)
    time.sleep(api_delay_term)
    driver.find_elements_by_css_selector("#team-squad-stats-options .in-squad-detailed-view")[0].find_element_by_css_selector("a").click()
    time.sleep(api_delay_term)
    elements = driver.find_elements_by_css_selector("#team-squad-stats-defensive #player-table-statistics-body tr")

    df = pd.DataFrame(columns=[
            "player_number", "tackles", "inter", "fouls", "offsides", "clear", "drb", "blocks", "owng"
        ])

    for element in elements:
       
        tmp_dict = {
            "player_number": element.find_elements_by_css_selector("td")[2].find_elements_by_css_selector("a")[0].get_attribute("href").split("/")[4], 
            "tackles": str_to_zero(element.find_elements_by_css_selector("td")[7].text), 
            "inter": str_to_zero(element.find_elements_by_css_selector("td")[8].text), 
            "fouls": str_to_zero(element.find_elements_by_css_selector("td")[9].text),
            "offsides": str_to_zero(element.find_elements_by_css_selector("td")[10].text),
            "clear": str_to_zero(element.find_elements_by_css_selector("td")[11].text),
            "drb": str_to_zero(element.find_elements_by_css_selector("td")[12].text),
            "blocks": str_to_zero(element.find_elements_by_css_selector("td")[13].text),
            "owng": str_to_zero(element.find_elements_by_css_selector("td")[14].text),
        }
        
        df.loc[len(df)] = tmp_dict
    return df

# function test code
# 52 : Real Madrid
player_defensive_df = crawling_player_defensive(52)
player_defensive_df.head()

Unnamed: 0,player_number,tackles,inter,fouls,offsides,clear,drb,blocks,owng
0,88526,3.1,2.0,1.7,0.0,1.4,1.6,0.4,0.0
1,88300,2.9,2.3,1.5,0.2,1.7,0.7,0.1,0.0
2,20241,2.5,1.8,0.5,0.2,0.9,1.1,0.2,0.0
3,144511,2.1,1.0,0.6,0.0,0.3,0.6,0.0,0.0
4,11104,2.0,1.5,1.0,0.2,1.3,0.5,0.2,0.0


In [35]:
# team players offensive list crawling
# parameter : team id
# return : pandas DataFrame : player offensive features
def crawling_player_offensive(team_id):
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    driver = webdriver.PhantomJS()
    driver.get(url)
    time.sleep(api_delay_term)
    driver.find_elements_by_css_selector("#team-squad-stats-options .in-squad-detailed-view")[1].find_element_by_css_selector("a").click()
    time.sleep(api_delay_term)
    elements = driver.find_elements_by_css_selector("#statistics-table-offensive #player-table-statistics-body tr")

    df = pd.DataFrame(columns=[
            "player_number", "keyp", "fouled", "off", "disp", "unstch"
        ])

    for element in elements:
       
        tmp_dict = {
            "player_number": element.find_elements_by_css_selector("td")[2].find_elements_by_css_selector("a")[0].get_attribute("href").split("/")[4], 
            "keyp": str_to_zero(element.find_elements_by_css_selector("td")[10].text), 
            "fouled": str_to_zero(element.find_elements_by_css_selector("td")[12].text), 
            "off": str_to_zero(element.find_elements_by_css_selector("td")[13].text),
            "disp": str_to_zero(element.find_elements_by_css_selector("td")[14].text),
            "unstch": str_to_zero(element.find_elements_by_css_selector("td")[15].text),
        }
        
        df.loc[len(df)] = tmp_dict
    return df

# function test code
# 52 : Real Madrid
player_offensive_df = crawling_player_offensive(52)
player_offensive_df.head()

Unnamed: 0,player_number,keyp,fouled,off,disp,unstch
0,5583,1.4,1.1,1.4,1.2,1.2
1,14296,1.6,0.8,0.7,1.3,1.3
2,13812,2.2,1.2,0.7,0.8,1.3
3,71182,2.3,0.7,0.2,0.9,1.0
4,106028,0.8,0.4,0.0,0.4,0.5


In [36]:
# team players passing list crawling
# parameter : team id
# return : pandas DataFrame : player passing features
def crawling_player_passing(team_id):
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    driver = webdriver.PhantomJS()
    driver.get(url)
    time.sleep(api_delay_term)
    driver.find_elements_by_css_selector("#team-squad-stats-options .in-squad-detailed-view")[2].find_element_by_css_selector("a").click()
    time.sleep(api_delay_term)
    elements = driver.find_elements_by_css_selector("#statistics-table-passing #player-table-statistics-body tr")

    df = pd.DataFrame(columns=[
            "player_number", "avgp", "ps", "crosses", "longb", "thrb"
        ])

    for element in elements:
       
        tmp_dict = {
            "player_number": element.find_elements_by_css_selector("td")[2].find_elements_by_css_selector("a")[0].get_attribute("href").split("/")[4], 
            "avgp": str_to_zero(element.find_elements_by_css_selector("td")[8].text), 
            "ps": str_to_zero(element.find_elements_by_css_selector("td")[9].text), 
            "crosses": str_to_zero(element.find_elements_by_css_selector("td")[10].text),
            "longb": str_to_zero(element.find_elements_by_css_selector("td")[11].text),
            "thrb": str_to_zero(element.find_elements_by_css_selector("td")[12].text),
        }
        
        df.loc[len(df)] = tmp_dict
    return df

# function test code
# 52 : Real Madrid
player_passing_df = crawling_player_passing(52)
player_passing_df.head()

Unnamed: 0,player_number,avgp,ps,crosses,longb,thrb
0,5583,1.4,29.7,79.4,0.3,0.3
1,13812,2.2,30.3,79.9,0.8,1.3
2,31772,1.8,75.8,93.9,1.2,7.3
3,71182,2.3,41.4,86.9,1.5,2.0
4,82989,1.4,38.1,89.2,0.7,1.2
