In [1]:
import re
import timeit
import urllib.parse
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer

In [2]:
# Scrape the overview and stats pages of every player linked from the locally
# saved EPL season player-list pages, checkpointing the results to a CSV file.
#
# urlopen fails on URLs that contain non-ASCII characters (such as accented
# letters), so every URL is first passed through urllib.parse.quote(), which
# UTF-8 encodes those characters and replaces them with %HH escapes. The 'safe'
# characters (':' and '/') are left unescaped so the URL structure survives.

PLAYERS_CSV = './data/epl/players.csv'
SAVE_EVERY = 100  # write the scraped players to disk after this many URLs


def _fetch_html(url):
    """Download `url` and return the raw response body.

    Non-ASCII characters in the URL are percent-encoded first, and the HTTP
    response is closed as soon as it has been read.
    """
    quoted_url = urllib.parse.quote(url, safe=':/', encoding='utf-8')
    with urllib.request.urlopen(quoted_url) as response:
        return response.read()


def _scrape_player(overview_url):
    """Scrape one player's overview and stats pages.

    Returns a `(name, player_info)` tuple, or None when either page lacks one
    of the expected elements (some players don't have a page).
    """
    f_overview = _fetch_html(overview_url)
    f_stats = _fetch_html(overview_url.replace('overview', 'stats'))
    stats_page = BeautifulSoup(f_stats, 'html.parser')
    overview_page = BeautifulSoup(f_overview, 'html.parser')
    player_info = {}
    try:
        name = stats_page.find('div', {'class': 'name'}).string
        stats_info = stats_page.findAll('span', {'class': 'allStatContainer'})
        overview_info = overview_page.find('div', {'class': 'personalLists'}).findAll('li')
        for stat in stats_info:
            try:
                stat_value = stat.text.split()[0]
                # the second CSS class names the stat, prefixed with 'stat'
                player_info[stat['class'][1].replace('stat', '')] = stat_value
            except (AttributeError, IndexError):
                # IndexError: the stat has no text or no second CSS class
                continue
        for li in overview_info:
            label = li.find('div', {'class': 'label'}).string
            country = li.find('span', {'class': 'playerCountry'})
            if country is not None:  # the nationality entry
                player_info['Nation'] = country.string
            else:  # player info that isn't nationality
                player_info[label] = li.find('div', {'class': 'info'}).string
        position = overview_page.find('div', {'class': 'label'}, text='Position').next_sibling.next_sibling.string
        player_info['Position'] = position
    except AttributeError:  # an expected element is missing from the page
        return None
    return name, player_info


def _append_to_csv(batch, csv_path):
    """Merge `batch` (player name -> info dict) into the CSV at `csv_path`.

    Rows already stored in the file are kept below the new ones. Nothing is
    written when `batch` is empty.
    """
    if not batch:
        return
    try:
        prev_data = pd.read_csv(csv_path, index_col=0)
    except OSError:  # e.g. the file does not exist yet on the first write
        prev_data = pd.DataFrame()
    current_data = pd.DataFrame(batch).transpose()
    all_data = pd.concat([current_data, prev_data])
    all_data.to_csv(csv_path, encoding='utf-8')


# we have the htmls for seasons 0809 - 1617, but range(14, 17) below only loads
# the 1415, 1516 and 1617 player lists
player_lists_filepaths = ['./data/epl/epl_{:0>2}{:0>2}_players.html'.format(i, i+1) for i in range(14, 17)]
overview_urls = []
for list_path in player_lists_filepaths:
    # the context manager closes each saved page once it has been parsed
    with open(list_path, encoding='utf-8') as f:
        players_page = BeautifulSoup(f.read(), 'html.parser')
    player_tags = players_page.findAll('a', {'class': 'playerName'})
    overview_urls.extend('https:' + tag['href'] for tag in player_tags)

# de-duplicate players that appear in several seasons; sorting keeps the order
# (and therefore list indices) identical from one run to the next
overview_urls = sorted(set(overview_urls))
players = {}

for j, url in enumerate(overview_urls, start=1):
    scraped = _scrape_player(url)
    if scraped is not None:
        name, player_info = scraped
        players[name] = player_info
    if j % SAVE_EVERY == 0:
        print(j)
        _append_to_csv(players, PLAYERS_CSV)
        players = {}

# save the players scraped after the last full batch, which would otherwise
# never reach the CSV
_append_to_csv(players, PLAYERS_CSV)
players = {}

100
200
300
400
500
600
700
800
900
1000
1100
1200


In [106]:
# Run the per-player scraping steps for a single URL (index 2400 of
# overview_urls) so the intermediate values can be inspected.
# Relies on `overview_urls` and `players` already existing in the kernel.
url = overview_urls[2400]
player_info = {}

# quote() percent-encodes non-ASCII characters that urlopen cannot handle;
# the `with` blocks close each HTTP response once it has been read
with urllib.request.urlopen(urllib.parse.quote(url, safe=':/', encoding='utf-8')) as response:
    f_overview = response.read()
with urllib.request.urlopen(urllib.parse.quote(url.replace('overview', 'stats'), safe=':/', encoding='utf-8')) as response:
    f_stats = response.read()

stats_page = BeautifulSoup(f_stats, 'html.parser')
overview_page = BeautifulSoup(f_overview, 'html.parser')
try:
    name = stats_page.find('div', {'class': 'name'}).string
    stats_info = stats_page.findAll('span', {'class': 'allStatContainer'})
    overview_info = overview_page.find('div', {'class': 'personalLists'}).findAll('li')
    for stat in stats_info:
        try:
            stat_value = stat.text.split()[0]
            # the second CSS class names the stat, prefixed with 'stat'
            player_info[stat['class'][1].replace('stat', '')] = stat_value
        except (AttributeError, IndexError):
            # IndexError: the stat has no text or no second CSS class
            pass
    for li in overview_info:
        label = li.find('div', {'class': 'label'}).string
        country = li.find('span', {'class': 'playerCountry'})
        if country is not None:  # the nationality entry
            player_info['Nation'] = country.string
        else:  # player info that isn't nationality
            player_info[label] = li.find('div', {'class': 'info'}).string
    position = overview_page.find('div', {'class': 'label'}, text='Position').next_sibling.next_sibling.string
    player_info['Position'] = position
    players[name] = player_info
except AttributeError:  # some players don't have a page :o
    pass

In [None]:
# Retrieve the stats of all players on the EPL website by walking numeric
# player ids (takes about 5 hours to run!). `elapsed` ends up holding the
# duration of the crawl in seconds.

start_time = timeit.default_timer()

fp = 'https://www.premierleague.com/players/'
players = {}
player_strainer = SoupStrainer('div', {'class':'name'})
stat_strainer = SoupStrainer('span', {'class':'stat'})
# Raw string, so that \d and \. are regex escapes rather than (invalid) Python
# string escapes.
# NOTE(review): a value with a thousands separator such as '1,234' would be
# read as 1.0 by this pattern -- confirm how the site formats large numbers.
number_pattern = re.compile(r'[-+]?\d*\.\d+|\d+')

for i in range(1, 100):
    with urllib.request.urlopen(fp+str(i)) as response:
        old_page = BeautifulSoup(response.read(), 'lxml', parse_only=player_strainer)
    player_name = old_page.text
    if not player_name.strip():
        # no name element was found for this id (presumably no such player),
        # so there is no stats URL to build and nothing to store
        continue
    # percent-encode the URL: urlopen cannot handle the non-ASCII characters
    # (such as accented letters) that occur in player names
    player_fp = urllib.parse.quote(fp+str(i)+'/'+player_name.replace(' ', '-')+'/stats', safe=':/', encoding='utf-8')
    with urllib.request.urlopen(player_fp) as response:
        stats_page = BeautifulSoup(response.read(), 'lxml', parse_only=stat_strainer)
    player_stats = {}
    for stat in stats_page.findAll('span', {'class':'stat'}):
        value_span = stat.span
        if value_span is None or value_span.string is None:
            continue  # no inner span, or its content is not a single string
        number = number_pattern.search(value_span.string)
        css_classes = value_span.get('class', [])
        if number is None or len(css_classes) < 2:
            continue  # no numeric value, or no second CSS class naming the stat
        # the second CSS class names the stat, prefixed with 'stat'
        player_stats[css_classes[1].replace('stat','')] = float(number.group())
    players[player_name] = player_stats

elapsed = timeit.default_timer() - start_time