In [171]:
# -*- coding: utf-8 -*-
import urllib.request
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup, SoupStrainer
import re
%matplotlib inline

In [246]:
# Load the scraped player statistics; the first CSV column becomes the row index.
# Note: some position information is missing in this file!
players_df = pd.read_csv(
    './data/epl/players.csv',
    index_col=0,
)

In [212]:
# Clean `players_df` in place. NOTE: this cell mutates the frame (column
# overwrite + drop(inplace=True)), so reload the CSV before re-running it.

# Step 1: zero out `goals` for rows where goals == appearances and either
# goals > 1 or appearances == 1 (presumably rows where the appearances value
# leaked into the goals column -- TODO confirm against the scraper).
# The parentheses around `appearances == 1` are required: `|` binds tighter
# than `==`, so without them the expression is evaluated as
# `((goals > 1) | appearances) == 1`.
goals_match_appearances = players_df.goals == players_df.appearances
suspect_goal_rows = (players_df.goals > 1) | (players_df.appearances == 1)
app_goals_players = players_df.loc[suspect_goal_rows & goals_match_appearances].index
players_df.loc[app_goals_players, 'goals'] = 0

# Step 2: any remaining goals value >= 1 is divided by appearances, turning
# it into a goals-per-appearance figure.
total_goals_players = players_df.loc[players_df.goals >= 1].index
players_df.loc[total_goals_players, 'goals'] = (
    players_df.loc[total_goals_players, 'goals']
    / players_df.loc[total_goals_players, 'appearances']
)

# Step 3: collect the index labels of rows to discard.
# - corrupt: total_pass below 1
# - empty:   fewer than 40 appearances
# - wrong:   goals_conceded + clean_sheet is smaller than appearances
# Comparisons against NaN are False, so rows missing these stats are kept.
corrupt_players = list(players_df.loc[players_df.total_pass < 1].index.values)
empty_players = list(players_df.loc[players_df.appearances < 40].index.values)
wrong_players = list(
    players_df.loc[
        (players_df.goals_conceded + players_df.clean_sheet) < players_df.appearances
    ].index.values
)
players_df.drop(corrupt_players + empty_players + wrong_players, inplace=True)

*Import and clean players dataset*

*Verify dataset integrity*

*Feature exploration and extraction*

Traits of an ideal feature:
   - Is unique to a player: it would be the same regardless of the team they played for, or the teammates they played with.
   - Is normalised.
   - Is representative of a player's contribution to goals scored or conceded.
   - A larger value indicates a better player.

Rating metrics: Goalkeeper
- Saves / (Saves + goals conceded) -> Fraction of shots saved
- Clean sheets / Appearances -> Success rate of goalkeeper

Rating metrics: Defense
- won tackles (%)
- blocked shots per appearance
- interceptions per appearance
- clean sheets per appearance
- clearances per appearance
- duels won / total duels
- battles won / battles lost
- goals conceded per appearance

Rating metrics: Offense
- Goals per appearance
- Assists per appearance
- Accurate crosses per appearance = (cross_accuracy x crosses)/appearances
- Accurate long balls per appearance

In [244]:
# Bare last expression: renders the players table with the notebook's rich DataFrame display.
players_df

Unnamed: 0,Age,Date of Birth,Height,Nation,Position,Weight,accurate_cross,accurate_long_balls,aerial_lost,aerial_won,...,total_keeper_sweeper,total_offside,total_pass,total_scoring_att,total_tackle,total_through_ball,wins,won_contest,won_tackle,yellow_card
Abdoulaye Faye,,26/02/1978,,Senegal,,,36.0,237.0,125.0,284.0,...,,10.0,17.06,,248.0,2.0,56.0,21.0,81.0,32.0
Andrew Johnson,,10/02/1981,,England,,,,,,,...,,123.0,11.90,269.0,98.0,,65.0,,,14.0
Arouna Koné,33.0,11/11/1983,181cm,Cote D'Ivoire,Forward,78kg,,,,,...,,34.0,22.38,148.0,54.0,,26.0,,,3.0
Ashley Westwood,,01/04/1990,,England,Midfielder,,33.0,488.0,71.0,43.0,...,,0.0,46.81,92.0,203.0,13.0,29.0,18.0,74.0,19.0
Cameron Jerome,,14/08/1986,,England,Forward,,,,,,...,,128.0,15.10,327.0,138.0,,52.0,,,19.0
Chris Eagles,,19/11/1985,,England,Midfielder,,24.0,30.0,39.0,16.0,...,,4.0,23.01,162.0,92.0,32.0,17.0,81.0,82.0,4.0
Dean Whitehead,,21/01/1982,,England,Midfielder,,21.0,426.0,143.0,112.0,...,,7.0,27.48,84.0,416.0,10.0,61.0,26.0,71.0,50.0
Erik Nevland,,10/11/1977,,Norway,,,,,,,...,,26.0,9.96,51.0,38.0,,23.0,,,5.0
Frank Lampard,,20/06/1978,,England,,,30.0,1.0,85.0,64.0,...,,30.0,21.80,832.0,491.0,205.0,349.0,129.0,71.0,59.0
Jay Rodriguez,27.0,29/07/1989,185cm,England,Forward,80kg,,,,,...,,51.0,16.81,223.0,92.0,,30.0,,,7.0


In [240]:
def summarise_player(data):
    """Build a frame of per-player rating features from raw player statistics.

    Parameters
    ----------
    data : column-indexable table (e.g. a pandas DataFrame)
        Must provide the columns ``saves``, ``goals_conceded``, ``clean_sheet``,
        ``appearances``, ``won_tackle``, ``ontarget_scoring_att``,
        ``total_scoring_att``, ``goals``, ``goal_assist``, ``accurate_cross``
        and ``total_cross``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``save_rate``, ``cleans``, ``conceded``, ``tackles``,
        ``shots``, ``scored``, ``assists``, ``crosses``, ``appearances``.

    Notes
    -----
    Caveat: a player's numbers are heavily shaped by their teammates, and the
    data is subject to survivorship bias ("we only see the bullet holes in
    planes that make it back").
    The divisions by 100 suggest ``won_tackle``, ``ontarget_scoring_att`` and
    ``accurate_cross`` are percentages -- TODO confirm against the data source.
    """
    games = data['appearances']
    hundred_games = 100*games  # shared denominator for the percentage-based rates

    feature_columns = [
        # --- defensive ---
        ('save_rate', data['saves']/(data['saves'] + data['goals_conceded'])),
        ('cleans', data['clean_sheet']/games),
        ('conceded', data['goals_conceded']/games),
        ('tackles', data['won_tackle']/100),
        # --- offensive ---
        # on-target shots per appearance
        ('shots', data['ontarget_scoring_att']*data['total_scoring_att']/hundred_games),
        # passed through unchanged; labelled "goals per appearance" by the author
        ('scored', data['goals']),
        # goal assists per appearance
        ('assists', data['goal_assist']/games),
        # accurate crosses per appearance, used as a proxy for chance creation
        ('crosses', data['accurate_cross']*data['total_cross']/hundred_games),
        ('appearances', games),
    ]

    # Assign one column at a time so the output keeps this exact column order.
    summary = pd.DataFrame()
    for column_name, values in feature_columns:
        summary[column_name] = values
    return summary