In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import json
from pandas.io.json import json_normalize

In [2]:
def get_team_soup(team):
    # grabs the url for a particular team and returns a Beautifulsoup object containing player data for a that team
    xg_url = 'https://understat.com/team/' + team
    
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    driver = webdriver.Chrome(options=options)
    driver.get(xg_url)
    team_soup = BeautifulSoup(driver.page_source, 'lxml')
    
    return team_soup

In [3]:
def get_EPL_soup():
    # grabs the url for the EPL and returns a Beautifulsoup object for each team(not player data) in the league
    xg_url = 'https://understat.com/league/EPL'
    
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    driver = webdriver.Chrome(options=options)
    driver.get(xg_url)
    EPL_soup = BeautifulSoup(driver.page_source, 'lxml')
    
    return EPL_soup

In [4]:
def xG_EPL_Scraper(soup):
    # Returns a dataframe containing team xg data for all teams in the EPL
    
    headers = soup.find('div', attrs={'class':'chemp margin-top jTable'}).find('table').find_all('th',attrs={'class':'sort'})
    headers_list = []
    
    for header in headers:
        headers_list.append(header.get_text(strip=True))
    
    body = soup.find('div', attrs={'class':'chemp margin-top jTable'}).table.tbody
    all_rows_list = []

    for tr in body.find_all('tr'):
        row = tr.find_all('td')
        current_row = []
        for item in row:
            current_row.append(item.get_text(strip=True))
        all_rows_list.append(current_row)

    xg_df = pd.DataFrame(all_rows_list, columns = headers_list)    
    
    return xg_df

In [5]:
def xG_Team_Scraper(soup):
    # Returns a dataframe containing a particlar team's players' xg data
   
    headers = soup.find('div', attrs={'class':'players margin-top jTable'}).find('table').find_all('th',attrs={'class':'sort'})
    header_list = []

    for header in headers:
        header_list.append(header.get_text(strip = True))

    header_list = ['№', 'Player','Pos','Apps', 'Min', 'G', 'A', 'Sh90', 'KP90', 'xG', 'xA', 'xG90', 'xA90']
    
    body = soup.find('div', attrs={'class':'players margin-top jTable'}).find('table').find('tbody')
    all_rows_list = []

    for tr in body.find_all('tr'):
        row = tr.find_all('td')
        current_row = []
        for item in row:
            current_row.append(item.get_text(strip = True))
        all_rows_list.append(current_row)
    
    team_xg = pd.DataFrame(all_rows_list, columns = header_list)
    
   
    return team_xg

In [6]:
def get_all_player_xg(teams):
    #Returns a dataframe of all player stats in EPL
    
    df = pd.DataFrame()
    
    for team in teams:
        team_soup = get_team_soup(team)
        team_df = xG_Team_Scraper(team_soup)

        team_df.insert(1,column = 'Team', value = team)

        df = pd.concat([df, team_df], ignore_index=True)

        # Use timer to avoid requesting data from understat too frequently
        time.sleep(3)
            
        print(f"{team}: Done")
    
    return df

In [7]:
def get_json(file_path):
    # Get json file from fanatsy.premierleague
    r = requests.get('https://fantasy.premierleague.com/drf/bootstrap')
    jsonResponse = r.json()
    with open(file_path, 'w') as outfile:
        json.dump(jsonResponse, outfile)

In [8]:
def get_fixtures_EPL(epl_players):
    # Updates the EPL player dataframe with upcoming fixture information
    
    get_json(r"C:\Users\TinyPoots\OneDrive\Python\FPL\fpl.json")
    
    with open(r"C:\Users\TinyPoots\OneDrive\Python\FPL\fpl.json") as json_data:
        d = json.load(json_data)
    
    # normalize json in to flat table
    df = json_normalize(d['teams'])
    id_df = df[['id','name']]
    
    # Several of the names on from Fantasy.premierleague are different from the name used on Understat
    id_df['name'].replace({'Wolves':'Wolverhampton Wanderers','Newcastle':'Newcastle United','Man City':'Manchester City', 
                                   'Man Utd':'Manchester United', 'Spurs':'Tottenham'}, inplace = True)
    
    team_id = dict(zip(list(id_df['id']),list(id_df['name'])))
    
    df = json_normalize(d['next_event_fixtures'])
    fixtures_df = df[['event','team_h','team_a']]
    
    # Change team names to the ones used in Understat data
    fixtures_df['team_a'] = fixtures_df['team_a'].map(team_id)
    fixtures_df['team_h'] = fixtures_df['team_h'].map(team_id)
    
    home_team = list(fixtures_df['team_h'])
    away_team = list(fixtures_df['team_a'])
    fixture_dict = dict(zip(home_team, away_team))
    fixture_dict.update(dict(zip(away_team, home_team)))
    
    # Add upcoming fixture data to the players dataframe
    epl_players['Next Opp'] = epl_players['Team'].map(fixture_dict)
    epl_players['Home'] = epl_players['Next Opp'].apply(lambda x: True if (x in away_team) else False)
    
    return epl_players

**Use functions to get team and player data**

Get team data

In [None]:
# Get xG data for every team in the EPL
epl_df = xG_EPL_Scraper(get_EPL_soup())
# Save for later use
epl_df.to_csv('xg_EPL.csv')

Get player data (take about 5 minutes)

In [10]:
# Get xG data for every player in the EPL
team_names = list(epl_df['Team'])
player_df = get_all_player_xg(team_names)

Liverpool: Done
Manchester City: Done
Tottenham: Done
Arsenal: Done
Manchester United: Done
Chelsea: Done
Wolverhampton Wanderers: Done
Watford: Done
West Ham: Done
Leicester: Done
Everton: Done
Bournemouth: Done
Newcastle United: Done
Crystal Palace: Done
Brighton: Done
Southampton: Done
Burnley: Done
Cardiff: Done
Fulham: Done
Huddersfield: Done


In [11]:
# Add upcoming fixture information
players_df_fix = get_fixtures_EPL(player_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [12]:
players_df_fix.to_csv('xg_Players.csv')