In [1]:
from bs4 import BeautifulSoup
from itertools import chain
from selenium import webdriver  
from selenium.common.exceptions import NoSuchElementException  
from selenium.webdriver.common.keys import Keys  
from selenium.webdriver.common.keys import Keys

import os
import time
import pandas as pd
import pickle
import re

# Location of the ChromeDriver executable that is passed to webdriver.Chrome().
# Set the CHROMEDRIVER_PATH environment variable to point at a different
# binary without editing the notebook; the default is the original path.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
# Also publish the chosen path under the environment key the notebook has always set.
os.environ["webdriver.chrome.driver"] = chromedriver

# Generate a List of Links to Scrape Individual Matches

Fixture List Scraping Function

In [2]:
def _extract_match_links(html_source):
    """Return the href of every 'result-1 rc' element found in the given page HTML."""
    page = BeautifulSoup(html_source, 'html.parser')
    return [anchor['href'] for anchor in page.find_all(class_='result-1 rc')]


def get_match_links(fixtures_url):
    """Collect the links to the individual matches of one season's fixture list.

    Opens ``fixtures_url`` in a new Chrome session, reads the match links on
    the page that is shown first, then clicks the "View previous month" link
    nine times and reads the links shown after each click.

    Args:
        fixtures_url: URL of a single-season fixture list page.

    Returns:
        list of str: the distinct match hrefs, sorted ascending.
    """
    browser = webdriver.Chrome(chromedriver)
    try:
        browser.get(fixtures_url)
        match_links = _extract_match_links(browser.page_source)

        # NOTE(review): presumably gives the page time to settle before the
        # first click on the month navigation -- confirm.
        time.sleep(15)

        for _ in range(9):
            # NOTE(review): find_element_by_xpath is the Selenium 3 style API;
            # kept as-is for the Selenium version this notebook was written for.
            browser.find_element_by_xpath('//a[@title="View previous month"]').click()
            time.sleep(5)
            match_links.extend(_extract_match_links(browser.page_source))
    finally:
        # Always close the browser, even when a click or page load fails.
        browser.quit()

    # The previous version computed list(set(...)) but threw the result away,
    # so duplicates were never removed; sorted(set(...)) actually drops them.
    return sorted(set(match_links))

**Generate Fixture Lists for England, Spain, Germany 2015/16**

In [3]:
# 2015/16 season fixture pages on whoscored.com, one per league.
england_fixtures_url = "https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/5826/Stages/12496/Fixtures/England-Premier-League-2015-2016"
spain_fixtures_url = "https://www.whoscored.com/Regions/206/Tournaments/4/Seasons/5933/Stages/12647/Fixtures/Spain-La-Liga-2015-2016"
germany_fixtures_url = "https://www.whoscored.com/Regions/81/Tournaments/3/Seasons/5870/Stages/12559/Fixtures/Germany-Bundesliga-2015-2016"

# Scrape the leagues one after another: England, then Spain, then Germany.
links_england = get_match_links(england_fixtures_url)
links_spain = get_match_links(spain_fixtures_url)
links_germany = get_match_links(germany_fixtures_url)

**Pickle Fixture Lists**

In [4]:
# Write each league's list of match links to its own pickle file.
for pickle_path, league_links in (
    ('links_england.pkl', links_england),
    ('links_spain.pkl', links_spain),
    ('links_germany.pkl', links_germany),
):
    with open(pickle_path, 'wb') as links_file:
        pickle.dump(league_links, links_file)

# Create Match DataFrame / Scrape Individual Matches

Create DataFrame with Features to Consider

In [5]:
# Every statistic gets a "<name>_for" and a "<name>_against" column, in that
# order; the frame starts empty with 'link' as its first column.
stat_names = (
    'goals', 'shots', 'post', 'on_target', 'off_target', 'blocked',
    'possession%', 'touches', 'pass_success%', 'total_passes',
    'completed_passes', 'key_passes', 'dribbles_won', 'dribbles_attempted',
    'dribbled_past', 'dribble_success%', 'aerials_won', 'aerials_won%',
    'offensive_aerials', 'defensive_aerials', 'successful_tackles',
    'tackles_attempted', 'was_dribbled', 'tackle_success%', 'clearances',
    'interceptions', 'corners', 'corner_accuracy%', 'dispossessed', 'errors',
    'fouls', 'total_saves', 'collected', 'parried_save', 'parried_danger',
)
match_columns = ['link']
for stat_name in stat_names:
    match_columns.append(stat_name + '_for')
    match_columns.append(stat_name + '_against')

df = pd.DataFrame(columns=tuple(match_columns))

**Match Scraping Function**

In [6]:
def get_match_stats():
    """Return the score and match-centre statistics of the current match page.

    Reads two names from the notebook namespace, which the scraping loop must
    assign before every call: ``soup`` (the parsed match page) and ``stats``
    (the list of 'match-centre-stat match-centre-sub-stat' elements on it).

    Returns:
        list of str: home goals, away goals, then the (home, away) values of
        stat rows 0-29 and 62-65 -- 70 values in total.

    Raises:
        ValueError: if the score text does not contain two numbers.
    """
    score_text = soup.find(class_="result").get_text()
    # Extract the numbers instead of indexing fixed character positions, so
    # double-digit scores and surrounding whitespace no longer break parsing.
    goals = re.findall(r'\d+', score_text)
    if len(goals) < 2:
        raise ValueError("could not parse a score from %r" % score_text)

    match = [goals[0], goals[1]]  # home goals, away goals
    for s in chain(range(30), range(62, 66)):
        # Look the value elements up once per stat row: [0] is home, [1] is away.
        values = stats[s].find_all(class_='match-centre-stat-value')
        match.append(values[0].get_text())
        match.append(values[1].get_text())
    return match

In [7]:
# Empty list in the notebook namespace; no visible cell reads it
# (get_match_stats builds its own local `match`).
match = list()

**Scrape Entire Fixture List for Match Stats**

England

In [8]:
# Scrape every England match page and store one row per match in `df`.
browser = webdriver.Chrome(chromedriver)
try:
    for i, link in enumerate(links_england):
        browser.get("https://www.whoscored.com/" + link)

        # get_match_stats() reads `soup` and `stats` from the notebook
        # namespace, so both are reassigned for every page before the call.
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        stats = soup.find_all(class_='match-centre-stat match-centre-sub-stat')

        # `df` has a leading 'link' column that get_match_stats() does not
        # fill (70 values for 71 columns); prepend the link so the row fits.
        df.loc[i] = [link] + get_match_stats()
finally:
    # Close the browser even if a page fails to load or parse.
    browser.quit()

In [9]:
# Save the England match frame to disk.
with open('match_frame_england.pkl', 'wb') as frame_file:
    pickle.dump(df, frame_file)

Spain

In [10]:
# `df2` was never created before being written to, which raised a NameError;
# start from an empty frame with the same columns as `df`.
df2 = pd.DataFrame(columns=df.columns)

# Scrape every Spain match page and store one row per match in `df2`.
browser = webdriver.Chrome(chromedriver)
try:
    for i, link in enumerate(links_spain):
        browser.get("https://www.whoscored.com/" + link)

        # get_match_stats() reads `soup` and `stats` from the notebook
        # namespace, so both are reassigned for every page before the call.
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        stats = soup.find_all(class_='match-centre-stat match-centre-sub-stat')

        # The frame has a leading 'link' column that get_match_stats() does
        # not fill (70 values for 71 columns); prepend the link so the row fits.
        df2.loc[i] = [link] + get_match_stats()
finally:
    # Close the browser even if a page fails to load or parse.
    browser.quit()

In [11]:
# Save the Spain match frame to disk.
with open('match_frame_spain.pkl', 'wb') as frame_file:
    pickle.dump(df2, frame_file)

Germany

In [12]:
# `df3` was never created before being written to, which raised a NameError;
# start from an empty frame with the same columns as `df`.
df3 = pd.DataFrame(columns=df.columns)

# Scrape every Germany match page and store one row per match in `df3`.
browser = webdriver.Chrome(chromedriver)
try:
    for i, link in enumerate(links_germany):
        browser.get("https://www.whoscored.com/" + link)

        # get_match_stats() reads `soup` and `stats` from the notebook
        # namespace, so both are reassigned for every page before the call.
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        stats = soup.find_all(class_='match-centre-stat match-centre-sub-stat')

        # The frame has a leading 'link' column that get_match_stats() does
        # not fill (70 values for 71 columns); prepend the link so the row fits.
        df3.loc[i] = [link] + get_match_stats()
finally:
    # Close the browser even if a page fails to load or parse.
    browser.quit()

In [13]:
# Save the Germany match frame to disk.
with open('match_frame_germany.pkl', 'wb') as frame_file:
    pickle.dump(df3, frame_file)