# Crawling Whoscored Web Site
- https://www.whoscored.com/

In [1]:
# import package
import pandas as pd
import time
from selenium import webdriver

In [2]:
# Seconds to sleep after each page load or tab click before reading the DOM
# (passed to time.sleep by every crawling_* function below).
api_delay_term = 5

In [3]:
def str_to_zero(point):
    """Map the "-" placeholder to the integer 0; pass any other value through.

    The value is returned unchanged (no numeric conversion) unless it
    compares equal to the string "-".
    """
    if point == "-":
        return 0
    return point

In [4]:
def crawling_league_teams(team_id):
    """Crawl the team list exposed on a WhoScored team page.

    Loads ``https://www.whoscored.com/Teams/<team_id>`` in a PhantomJS
    browser, sleeps ``api_delay_term`` seconds, and reads every ``<option>``
    under the ``#teams`` element (presumably the drop-down listing the other
    teams of the same league -- confirm against the live page).

    Parameters
    ----------
    team_id : int or str
        Id of any team; it is appended to the team URL.

    Returns
    -------
    pandas.DataFrame
        Columns ``team_id`` and ``team_name``, one row per ``<option>``.
        ``team_id`` is the third "/"-separated piece of the option's
        ``value`` attribute; ``team_name`` is the option's visible text.

    Raises
    ------
    IndexError
        If an option's ``value`` has fewer than three "/"-separated pieces.
    """
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        time.sleep(api_delay_term)
        options = driver.find_elements_by_css_selector("#teams option")
        records = [
            {
                "team_id": option.get_attribute("value").split("/")[2],
                "team_name": option.text,
            }
            for option in options
        ]
    finally:
        # quit() ends the whole browser session; running it in `finally`
        # keeps a failed crawl from leaving the browser behind.
        driver.quit()
    # Build the frame once instead of growing it one row at a time.
    return pd.DataFrame(records, columns=["team_id", "team_name"])

In [5]:
def crawling_player_summary(team_id):
    """Crawl the player summary table of a WhoScored team page.

    Loads ``https://www.whoscored.com/Teams/<team_id>`` in a PhantomJS
    browser, sleeps ``api_delay_term`` seconds, and reads every ``<tr>``
    under ``#player-table-statistics-body``.

    Parameters
    ----------
    team_id : int or str
        Team id appended to the team URL.

    Returns
    -------
    pandas.DataFrame
        One row per table row with the columns listed in ``columns`` below.
        Values are the raw cell texts (strings); for the statistic columns a
        "-" cell is replaced by the integer 0 via ``str_to_zero``.
        ``half_time`` is the integer 0 when the appearances cell has no
        parenthesised part.

    Raises
    ------
    IndexError
        If a row has fewer cells, links or spans than the fixed positions
        read below.
    """
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    columns = [
        'player_number', 'flag', 'name', 'age', 'position', 'tall', 'weight',
        'full_time', 'half_time', 'mins', 'goals', 'asists', 'yel', 'red',
        'spg', 'ps', 'motm', 'aw', 'rating',
    ]
    # (column name, <td> position) for the cells that go through str_to_zero.
    stat_cells = (
        ("mins", 6), ("goals", 7), ("asists", 8), ("yel", 9), ("red", 10),
        ("spg", 11), ("ps", 12), ("aw", 13), ("motm", 14), ("rating", 15),
    )

    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        time.sleep(api_delay_term)
        table_rows = driver.find_elements_by_css_selector(
            "#player-table-statistics-body tr"
        )

        records = []
        for table_row in table_rows:
            # Each find_elements call is a round trip to the browser, so the
            # cells are fetched once per row and reused.
            cells = table_row.find_elements_by_css_selector("td")
            player_link = cells[2].find_elements_by_css_selector("a")[0]
            player_spans = cells[2].find_elements_by_css_selector("span")
            flag_span = cells[1].find_elements_by_css_selector("span")[0]

            # The appearances cell is split on "(": the first piece goes to
            # full_time and the parenthesised piece, if any, to half_time
            # (presumably starts vs. substitute appearances -- confirm).
            apps = cells[5].text.split("(")

            record = {
                # Fifth "/"-separated piece of the player link; assumes an
                # absolute href like https://host/Players/<id>/... -- confirm.
                "player_number": player_link.get_attribute("href").split("/")[4],
                # Third "-"-separated piece of the span's class attribute.
                "flag": flag_span.get_attribute("class").split("-")[2],
                "name": player_link.text,
                "age": player_spans[0].text,
                # The first character of the span text is dropped
                # (presumably a separator -- confirm).
                "position": player_spans[1].text[1:],
                "tall": cells[3].text,
                "weight": cells[4].text,
                "full_time": apps[0],
                "half_time": apps[1].replace(")", "") if len(apps) > 1 else 0,
            }
            for column, position in stat_cells:
                record[column] = str_to_zero(cells[position].text)
            records.append(record)
    finally:
        # quit() ends the whole browser session; running it in `finally`
        # keeps a failed crawl from leaving the browser behind.
        driver.quit()
    # Build the frame once instead of growing it one row at a time.
    return pd.DataFrame(records, columns=columns)

In [6]:
def crawling_player_defensive(team_id):
    """Crawl the player defensive table of a WhoScored team page.

    Loads ``https://www.whoscored.com/Teams/<team_id>`` in a PhantomJS
    browser, clicks the link inside the first
    ``#team-squad-stats-options .in-squad-detailed-view`` element, and reads
    every ``<tr>`` under
    ``#team-squad-stats-defensive #player-table-statistics-body``.
    ``api_delay_term`` seconds are slept after the page load and again after
    the click.

    Parameters
    ----------
    team_id : int or str
        Team id appended to the team URL.

    Returns
    -------
    pandas.DataFrame
        Columns ``player_number`` plus the statistic columns named in
        ``stat_cells`` below. Statistic values are raw cell texts, with "-"
        replaced by the integer 0 via ``str_to_zero``.

    Raises
    ------
    IndexError
        If the tab element is missing or a row has fewer cells or links than
        the fixed positions read below.
    """
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    # (column name, <td> position) for the cells that go through str_to_zero.
    stat_cells = (
        ("tackles", 7), ("inter", 8), ("fouls", 9), ("offsides", 10),
        ("clear", 11), ("drb", 12), ("blocks", 13), ("owng", 14),
    )
    columns = ["player_number"] + [column for column, _ in stat_cells]

    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        time.sleep(api_delay_term)
        tabs = driver.find_elements_by_css_selector(
            "#team-squad-stats-options .in-squad-detailed-view"
        )
        tabs[0].find_element_by_css_selector("a").click()
        time.sleep(api_delay_term)
        table_rows = driver.find_elements_by_css_selector(
            "#team-squad-stats-defensive #player-table-statistics-body tr"
        )

        records = []
        for table_row in table_rows:
            # Fetch the cells once per row; every lookup is a browser round trip.
            cells = table_row.find_elements_by_css_selector("td")
            player_link = cells[2].find_elements_by_css_selector("a")[0]
            # Fifth "/"-separated piece of the player link; assumes an
            # absolute href like https://host/Players/<id>/... -- confirm.
            record = {
                "player_number": player_link.get_attribute("href").split("/")[4],
            }
            for column, position in stat_cells:
                record[column] = str_to_zero(cells[position].text)
            records.append(record)
    finally:
        # quit() ends the whole browser session; running it in `finally`
        # keeps a failed crawl from leaving the browser behind.
        driver.quit()
    # Build the frame once instead of growing it one row at a time.
    return pd.DataFrame(records, columns=columns)

In [7]:
def crawling_player_offensive(team_id):
    """Crawl the player offensive table of a WhoScored team page.

    Loads ``https://www.whoscored.com/Teams/<team_id>`` in a PhantomJS
    browser, clicks the link inside the second
    ``#team-squad-stats-options .in-squad-detailed-view`` element, and reads
    every ``<tr>`` under
    ``#statistics-table-offensive #player-table-statistics-body``.
    ``api_delay_term`` seconds are slept after the page load and again after
    the click.

    Parameters
    ----------
    team_id : int or str
        Team id appended to the team URL.

    Returns
    -------
    pandas.DataFrame
        Columns ``player_number`` plus the statistic columns named in
        ``stat_cells`` below. Statistic values are raw cell texts, with "-"
        replaced by the integer 0 via ``str_to_zero``.

    Raises
    ------
    IndexError
        If the tab element is missing or a row has fewer cells or links than
        the fixed positions read below.
    """
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    # (column name, <td> position); positions 11 is intentionally not read,
    # matching the fixed layout this crawler was written against.
    stat_cells = (
        ("keyp", 10), ("fouled", 12), ("off", 13), ("disp", 14), ("unstch", 15),
    )
    columns = ["player_number"] + [column for column, _ in stat_cells]

    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        time.sleep(api_delay_term)
        tabs = driver.find_elements_by_css_selector(
            "#team-squad-stats-options .in-squad-detailed-view"
        )
        tabs[1].find_element_by_css_selector("a").click()
        time.sleep(api_delay_term)
        table_rows = driver.find_elements_by_css_selector(
            "#statistics-table-offensive #player-table-statistics-body tr"
        )

        records = []
        for table_row in table_rows:
            # Fetch the cells once per row; every lookup is a browser round trip.
            cells = table_row.find_elements_by_css_selector("td")
            player_link = cells[2].find_elements_by_css_selector("a")[0]
            # Fifth "/"-separated piece of the player link; assumes an
            # absolute href like https://host/Players/<id>/... -- confirm.
            record = {
                "player_number": player_link.get_attribute("href").split("/")[4],
            }
            for column, position in stat_cells:
                record[column] = str_to_zero(cells[position].text)
            records.append(record)
    finally:
        # quit() ends the whole browser session; running it in `finally`
        # keeps a failed crawl from leaving the browser behind.
        driver.quit()
    # Build the frame once instead of growing it one row at a time.
    return pd.DataFrame(records, columns=columns)

In [8]:
def crawling_player_passing(team_id):
    """Crawl the player passing table of a WhoScored team page.

    Loads ``https://www.whoscored.com/Teams/<team_id>`` in a PhantomJS
    browser, clicks the link inside the third
    ``#team-squad-stats-options .in-squad-detailed-view`` element, and reads
    every ``<tr>`` under
    ``#statistics-table-passing #player-table-statistics-body``.
    ``api_delay_term`` seconds are slept after the page load and again after
    the click.

    Parameters
    ----------
    team_id : int or str
        Team id appended to the team URL.

    Returns
    -------
    pandas.DataFrame
        Columns ``player_number`` plus the statistic columns named in
        ``stat_cells`` below. Statistic values are raw cell texts, with "-"
        replaced by the integer 0 via ``str_to_zero``.

    Raises
    ------
    IndexError
        If the tab element is missing or a row has fewer cells or links than
        the fixed positions read below.
    """
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    # (column name, <td> position) for the cells that go through str_to_zero.
    stat_cells = (
        ("avgp", 8), ("ps", 9), ("crosses", 10), ("longb", 11), ("thrb", 12),
    )
    columns = ["player_number"] + [column for column, _ in stat_cells]

    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        time.sleep(api_delay_term)
        tabs = driver.find_elements_by_css_selector(
            "#team-squad-stats-options .in-squad-detailed-view"
        )
        tabs[2].find_element_by_css_selector("a").click()
        time.sleep(api_delay_term)
        table_rows = driver.find_elements_by_css_selector(
            "#statistics-table-passing #player-table-statistics-body tr"
        )

        records = []
        for table_row in table_rows:
            # Fetch the cells once per row; every lookup is a browser round trip.
            cells = table_row.find_elements_by_css_selector("td")
            player_link = cells[2].find_elements_by_css_selector("a")[0]
            # Fifth "/"-separated piece of the player link; assumes an
            # absolute href like https://host/Players/<id>/... -- confirm.
            record = {
                "player_number": player_link.get_attribute("href").split("/")[4],
            }
            for column, position in stat_cells:
                record[column] = str_to_zero(cells[position].text)
            records.append(record)
    finally:
        # quit() ends the whole browser session; running it in `finally`
        # keeps a failed crawl from leaving the browser behind.
        driver.quit()
    # Build the frame once instead of growing it one row at a time.
    return pd.DataFrame(records, columns=columns)

In [9]:
# 13 : Arsenal : Premier League
# 87 : Juventus : Serie A
# 52 : Real Madrid : La Liga
# 37 : Bayern Munich : Bundesliga

# crawling league team data

# LaLiga = crawling_league_teams(52)
# SerieA = crawling_league_teams(87)
# PremierLeague = crawling_league_teams(13)
# Bundesliga = crawling_league_teams(37)

# LaLiga.to_csv("./league/LaLiga.csv", index=False)
# SerieA.to_csv("./league/SerieA.csv", index=False)
# PremierLeague.to_csv("./league/PremierLeague.csv", index=False)
# Bundesliga.to_csv("./league/Bundesliga.csv", index=False)

In [10]:
def make_players_info(team_id, team_name):
    """Crawl a team's four player tables and merge them into one DataFrame.

    Runs the summary, defensive, offensive and passing crawlers in that
    order, joins the results on ``player_number`` with pandas' default
    merge settings, and adds a ``team_name`` column. Progress is printed
    along the way: the inputs, a completion message, and the merged row
    count.

    Parameters
    ----------
    team_id : int or str
        Team id passed to every crawler.
    team_name : str
        Value written to the ``team_name`` column of every row.

    Returns
    -------
    pandas.DataFrame
        The merged player table.
    """
    print(team_id, team_name)
    crawlers = (
        crawling_player_summary,
        crawling_player_defensive,
        crawling_player_offensive,
        crawling_player_passing,
    )
    tables = [crawl(team_id) for crawl in crawlers]
    print("player crawling DONE")

    # Fold the remaining tables into the summary table one merge at a time.
    merged = tables[0]
    for extra in tables[1:]:
        merged = merged.merge(extra, on="player_number")

    merged["team_name"] = team_name

    print(len(merged))

    return merged

In [11]:
def save_player_info(league):
    """Crawl every team listed for a league and save one CSV per team.

    Reads ``./league/<league>.csv``, and for each row crawls the team's
    players with ``make_players_info`` and writes the result to
    ``./player/<league>/<team_name>.csv``. The CSV is written with pandas'
    default options, so the DataFrame index is included. Each team name is
    printed after its file is written, followed by a final message.

    Parameters
    ----------
    league : str
        League name used to build both the input and output paths.
    """
    teams = pd.read_csv("./league/" + league + ".csv")
    for _, team in teams.iterrows():
        squad = make_players_info(team.team_id, team.team_name)
        target = "./player/" + league + "/" + team.team_name + ".csv"
        squad.to_csv(target)
        print(team.team_name)
    print(league + "Done")

In [12]:
# Save per team: choose a league and load its team list from ./league/.

# League names used in this notebook: PremierLeague, SerieA, LaLiga, Bundesliga
league = "Bundesliga"
df = pd.read_csv("./league/" + league + ".csv")
df  # bare expression: the notebook displays the loaded team table

Unnamed: 0,team_id,team_name
0,1730,Augsburg
1,36,Bayer Leverkusen
2,37,Bayern Munich
3,44,Borussia Dortmund
4,134,Borussia M.Gladbach
5,1147,Darmstadt
6,45,Eintracht Frankfurt
7,282,FC Cologne
8,38,Hamburger SV
9,110,Hannover 96


In [17]:
# 42 Werder Bremen  (presumably a team-id note kept from an earlier run -- confirm)
# Process only the rows of the team table from position 17 onward, printing
# each team before crawling and saving it with tmp_def.
for idx, row in df[17:].iterrows():
    print(league, row.team_id, row.team_name)
    tmp_def(league, row.team_id, row.team_name)

Bundesliga 33 Wolfsburg
33 Wolfsburg
player crawling DONE
29
excute_time : 84.28630304336548


In [13]:
def tmp_def(league, team_id, team_name):
    """Crawl one team, tag it with its league, and save it as a CSV.

    Times the ``make_players_info`` call and prints the elapsed seconds,
    adds a ``league`` column, writes the table without its index to
    ``./player/<league>/<team_name>.csv``, and finally reads that file back.
    Nothing is returned.

    Parameters
    ----------
    league : str
        League name; used for the ``league`` column and the output folder.
    team_id : int or str
        Team id passed to ``make_players_info``.
    team_name : str
        Team name; used for the ``team_name`` column and the file name.
    """
    started_at = time.time()
    squad = make_players_info(team_id, team_name)
    elapsed = time.time() - started_at
    print("excute_time : {time}".format(time=elapsed))
    squad["league"] = league

    target = "./player/" + league + "/" + team_name + ".csv"
    squad.to_csv(target, index=False)

    # Read the file back; the result is not used, but an unreadable file
    # raises here.
    pd.read_csv(target)

In [None]:
# Manual single-team run.
# League names used in this notebook: PremierLeague, SerieA, LaLiga, Bundesliga
league = "PremierLeague"
team_id = 14
team_name = "Leicester"

# Crawl and merge the team's player tables, printing the elapsed seconds.
start_time = time.time()
player_df = make_players_info(team_id, team_name)
print("excute_time : {time}".format(time=(time.time() - start_time)))
# Tag every player row with the league name.
player_df["league"] = league

In [None]:
# Write the crawled table to ./player/<league>/<team_name>.csv without the index.
path = "./player/" + league + "/" + team_name + ".csv"
player_df.to_csv(path, index=False)

In [None]:
# Read the saved CSV back and display its first rows as a quick check.
tmp_df = pd.read_csv(path)
tmp_df.head()