In [41]:
import pandas as pd
import numpy as np
%matplotlib inline 
import matplotlib.pyplot as plt
import datetime

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from sklearn.cluster import KMeans
from gap_statistic import OptimalK


In [42]:
game_data = pd.read_csv("Data/game_log.csv")

In [43]:
game_data.describe()

Unnamed: 0,home_goals,away_goals
count,20736.0,20736.0
mean,1.65244,1.227768
std,1.277547,1.13436
min,0.0,0.0
25%,1.0,0.0
50%,1.0,1.0
75%,2.0,2.0
max,10.0,9.0


In [44]:
game_data.head(10)

Unnamed: 0,dateTime,League,home_goals,away_goals,home_team,away_team
0,2007-09-15 11:00:00,Premier League,0,1,Everton FC,Manchester United FC
1,2007-09-15 12:30:00,Premier League,1,3,Tottenham Hotspur FC,Arsenal FC
2,2007-09-15 14:00:00,Premier League,1,0,Birmingham City FC,Bolton Wanderers FC
3,2007-09-15 14:00:00,Premier League,2,0,Sunderland AFC,Reading FC
4,2007-09-15 14:00:00,Premier League,3,0,West Ham United FC,Middlesbrough FC
5,2007-09-15 16:15:00,Premier League,0,0,Chelsea FC,Blackburn Rovers FC
6,2007-09-16 15:00:00,Premier League,1,0,Manchester City FC,Aston Villa FC
7,2007-09-17 19:00:00,Premier League,1,0,Derby County FC,Newcastle United FC
8,2007-09-22 14:00:00,Premier League,2,2,Middlesbrough FC,Sunderland AFC
9,2007-09-22 14:00:00,Premier League,5,0,Arsenal FC,Derby County FC


In [45]:
def column_to_datetime(df, column):
    """Convert a column of ``df`` to datetime, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column is overwritten with its datetime version.
    column : str
        Name of the column holding 'YYYY-MM-DD HH:MM:SS' strings.

    Returns
    -------
    None
        The conversion is written back onto ``df[column]``.

    Raises
    ------
    ValueError
        If a string value does not match the expected format.
    """
    # pd.to_datetime passes an already-converted column through unchanged,
    # so re-running the cell that calls this function no longer fails the
    # way a per-element strptime does on Timestamp values.
    df[column] = pd.to_datetime(df[column], format='%Y-%m-%d %H:%M:%S')

In [46]:
# Label each game from the home team's perspective: add a 'home_team_result' column holding 'Win', 'Tie' or 'Loss', based on home_goals - away_goals
def get_result(goal_diff):
    """Label a goal difference as 'Win', 'Tie' or 'Loss'.

    A positive difference gives 'Win', zero gives 'Tie', and any other
    value falls through to 'Loss'.
    """
    if goal_diff == 0:
        return 'Tie'
    return 'Win' if goal_diff > 0 else 'Loss'
    
    
def win_loss(df):
    """Add a 'home_team_result' column to ``df`` and return ``df``.

    Each row's label is get_result applied to home_goals - away_goals.
    The new column is written onto the frame that was passed in, and that
    same frame is returned.
    """
    goal_margin = df['home_goals'] - df['away_goals']
    df['home_team_result'] = [get_result(margin) for margin in goal_margin]
    return df
    

In [47]:
column_to_datetime(game_data, 'dateTime')

In [48]:
game_data = win_loss(game_data)

In [49]:
game_data.head(20)

Unnamed: 0,dateTime,League,home_goals,away_goals,home_team,away_team,home_team_result
0,2007-09-15 11:00:00,Premier League,0,1,Everton FC,Manchester United FC,Loss
1,2007-09-15 12:30:00,Premier League,1,3,Tottenham Hotspur FC,Arsenal FC,Loss
2,2007-09-15 14:00:00,Premier League,1,0,Birmingham City FC,Bolton Wanderers FC,Win
3,2007-09-15 14:00:00,Premier League,2,0,Sunderland AFC,Reading FC,Win
4,2007-09-15 14:00:00,Premier League,3,0,West Ham United FC,Middlesbrough FC,Win
5,2007-09-15 16:15:00,Premier League,0,0,Chelsea FC,Blackburn Rovers FC,Tie
6,2007-09-16 15:00:00,Premier League,1,0,Manchester City FC,Aston Villa FC,Win
7,2007-09-17 19:00:00,Premier League,1,0,Derby County FC,Newcastle United FC,Win
8,2007-09-22 14:00:00,Premier League,2,2,Middlesbrough FC,Sunderland AFC,Tie
9,2007-09-22 14:00:00,Premier League,5,0,Arsenal FC,Derby County FC,Win


In [50]:
# Print the number of games recorded for each league
# TODO: convert these outputs to a pandas table
for league_name in game_data['League'].unique():
    in_league = game_data['League'] == league_name
    print(league_name)
    print(sum(in_league))

Premier League
4305
Primera Division
4380
Bundesliga
3527
Serie A
4353
Ligue 1
4171


In [51]:
# Print how many distinct home teams appear in the dataset for each league
for league_name in game_data['League'].unique():
    league_games = game_data[game_data['League'] == league_name]
    print(league_name)
    print(len(league_games['home_team'].unique()))

Premier League
38
Primera Division
38
Bundesliga
34
Serie A
35
Ligue 1
38


In [52]:
# Number of Premier League games per home team
premier_league_games = game_data[game_data['League'] == 'Premier League']
games = premier_league_games['home_team'].value_counts()
games

Manchester City FC            225
Chelsea FC                    224
Manchester United FC          221
Liverpool FC                  219
Everton FC                    219
Tottenham Hotspur FC          217
Arsenal FC                    215
West Ham United FC            201
Newcastle United FC           183
Stoke City FC                 176
Sunderland AFC                172
West Bromwich Albion FC       159
Aston Villa FC                156
Fulham FC                     142
Southampton FC                128
Swansea City AFC              124
Crystal Palace FC             110
Burnley FC                     95
Wigan Athletic FC              94
Leicester City FC              93
Hull City AFC                  87
Bolton Wanderers FC            85
Blackburn Rovers FC            83
Wolverhampton Wanderers FC     78
AFC Bournemouth                77
Norwich City FC                75
Watford FC                     72
Queens Park Rangers FC         51
Birmingham City FC             50
Middlesbrough 

Top teams have many more games played 

In [53]:
game_data[game_data['League'] == 'Premier League']['away_team'].value_counts()

Arsenal FC                    223
Tottenham Hotspur FC          219
Manchester United FC          217
Chelsea FC                    215
Manchester City FC            214
Liverpool FC                  211
Everton FC                    208
West Ham United FC            199
Newcastle United FC           182
Stoke City FC                 175
Sunderland AFC                174
Aston Villa FC                163
West Bromwich Albion FC       159
Fulham FC                     131
Southampton FC                128
Swansea City AFC              125
Crystal Palace FC             112
Wigan Athletic FC             103
Leicester City FC              97
Burnley FC                     94
Hull City AFC                  89
Bolton Wanderers FC            87
Blackburn Rovers FC            85
Watford FC                     79
Wolverhampton Wanderers FC     79
AFC Bournemouth                78
Norwich City FC                75
Queens Park Rangers FC         55
Birmingham City FC             51
Middlesbrough 

In [54]:
game_data['dateTime'][-1:]

20735   2019-11-10 20:00:00
Name: dateTime, dtype: datetime64[ns]

Games up to November 10, 2019

In [55]:
# Get all games for a team 
game_data[(game_data['home_team']== 'Manchester United FC') | (game_data['away_team'] == 'Manchester United FC')]

Unnamed: 0,dateTime,League,home_goals,away_goals,home_team,away_team,home_team_result
0,2007-09-15 11:00:00,Premier League,0,1,Everton FC,Manchester United FC,Loss
15,2007-09-23 15:00:00,Premier League,2,0,Manchester United FC,Chelsea FC,Win
20,2007-09-29 16:15:00,Premier League,0,1,Birmingham City FC,Manchester United FC,Loss
26,2007-10-06 11:45:00,Premier League,4,0,Manchester United FC,Wigan Athletic FC,Win
41,2007-10-20 16:15:00,Premier League,1,4,Aston Villa FC,Manchester United FC,Loss
...,...,...,...,...,...,...,...
4266,2019-10-06 15:30:00,Premier League,1,0,Newcastle United FC,Manchester United FC,Win
4274,2019-10-20 15:30:00,Premier League,1,1,Manchester United FC,Liverpool FC,Tie
4284,2019-10-27 16:30:00,Premier League,1,3,Norwich City FC,Manchester United FC,Loss
4285,2019-11-02 12:30:00,Premier League,1,0,AFC Bournemouth,Manchester United FC,Win


In [94]:
def get_team_games(df, team_name, start, end):
    """Return the games in ``df`` that ``team_name`` played between two dates.

    Parameters
    ----------
    df : pandas.DataFrame
        Game log with 'home_team', 'away_team' and 'dateTime' columns.
    team_name : str
        Team to select, matched against both the home and away columns.
    start, end
        Inclusive bounds compared against the 'dateTime' column.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``, keeping their original index labels.
    """
    # Filter the frame that was passed in, not the notebook-level
    # `game_data` global, so the function works on any game log.
    involves_team = (df['home_team'] == team_name) | (df['away_team'] == team_name)
    in_window = (df['dateTime'] >= start) & (df['dateTime'] <= end)
    return df[involves_team & in_window]

In [95]:
# Absolute endpoints of the time period covered by the data.
# Series.min()/max() are vectorised and skip missing timestamps, whereas the
# builtin min()/max() loop over every element in Python.
START = game_data['dateTime'].min()
END = game_data['dateTime'].max()

In [103]:
team_name = "Arsenal FC"
arsenal_games = get_team_games(game_data, team_name, START, END)
arsenal_games.head(10)

Unnamed: 0,dateTime,League,home_goals,away_goals,home_team,away_team,home_team_result
1,2007-09-15 12:30:00,Premier League,1,3,Tottenham Hotspur FC,Arsenal FC,Loss
9,2007-09-22 14:00:00,Premier League,5,0,Arsenal FC,Derby County FC,Win
18,2007-09-29 14:00:00,Premier League,0,1,West Ham United FC,Arsenal FC,Loss
28,2007-10-07 11:00:00,Premier League,3,2,Arsenal FC,Sunderland AFC,Win
40,2007-10-20 14:00:00,Premier League,2,0,Arsenal FC,Bolton Wanderers FC,Win
52,2007-10-28 16:00:00,Premier League,1,1,Liverpool FC,Arsenal FC,Tie
53,2007-11-03 12:45:00,Premier League,2,2,Arsenal FC,Manchester United FC,Tie
62,2007-11-12 20:00:00,Premier League,1,3,Reading FC,Arsenal FC,Loss
74,2007-11-24 15:00:00,Premier League,2,0,Arsenal FC,Wigan Athletic FC,Win
84,2007-12-01 17:15:00,Premier League,1,2,Aston Villa FC,Arsenal FC,Loss


In [99]:
# Get goal scoring stats for a team during a certain time period
def get_scoring_statistics(df, team_name, start, end):
    """Print a team's average goals scored and conceded, at home and away.

    The games come from get_team_games(df, team_name, start, end). Each of
    the four printed figures is a per-game mean formatted to two decimals.
    Nothing is returned.
    """
    fixtures = get_team_games(df, team_name, start, end)

    is_home = fixtures['home_team'] == team_name
    is_away = fixtures['away_team'] == team_name
    not_away = fixtures['away_team'] != team_name
    not_home = fixtures['home_team'] != team_name

    averages = (
        ("Goals Scored at Home: {0:.2f}", fixtures[is_home]['home_goals'].mean()),
        ("Goals Scored Away: {0:.2f}", fixtures[is_away]['away_goals'].mean()),
        # Rows where the team is not the away side: average the away side's goals.
        ("Goals Conceded at Home: {0:.2f}", fixtures[not_away]['away_goals'].mean()),
        # Rows where the team is not the home side: average the home side's goals.
        ("Goals Conceded Away: {0:.2f}", fixtures[not_home]['home_goals'].mean()),
    )
    for template, value in averages:
        print(template.format(value))

In [100]:
get_scoring_statistics(game_data, "Manchester United FC", START, END)

Goals Scored at Home: 2.22
Goals Scored Away: 1.70
Goals Conceded at Home: 0.81
Goals Conceded Away: 1.12


In [101]:
get_scoring_statistics(game_data, "Manchester City FC", START, END)

Goals Scored at Home: 2.51
Goals Scored Away: 1.85
Goals Conceded at Home: 0.86
Goals Conceded Away: 1.25


In [102]:
# Look at scoring trends over last 30 days 
start_day  = END - datetime.timedelta(days = 30)
get_scoring_statistics(game_data, "Manchester City FC", start_day, END)

Goals Scored at Home: 2.50
Goals Scored Away: 1.50
Goals Conceded at Home: 0.50
Goals Conceded Away: 1.50
