In [1]:
## Importing required libraries
import sqlite3
import pandas as pd
import numpy as np
import seaborn as sns
import itertools
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from time import time
from sklearn.decomposition import PCA, FastICA
from sklearn.pipeline import Pipeline
import matplotlib
matplotlib.rcParams['figure.figsize'] = [20, 12]

In [92]:
def get_match_label(match):
    ''' Derives a label for a given match. '''
    
    #Define variables
    home_goals = match['home_team_goal']
    away_goals = match['away_team_goal']

    #Identify match label  
    if home_goals > away_goals:
        return 'Win'
    if home_goals == away_goals:
        return 'Draw'
    if home_goals < away_goals:
        return 'Defeat'

In [83]:
def get_last_matches(matches, date, team, x = 10):
    ''' Get the last x matches of a given team. '''
    
    #Filter team matches from matches
    team_matches = matches[(matches['home_team_api_id'] == team) | (matches['away_team_api_id'] == team)]
                           
    #Filter x last matches from team matches
    last_matches = team_matches[team_matches.date < date].sort_values(by = 'date', ascending = False).iloc[0:x,:]
    
    #Return last matches
    return last_matches

In [84]:
def get_last_matches_against_eachother(matches, date, home_team, away_team, x = 10):
    ''' Get the last x matches of two given teams. '''
    
    #Find matches of both teams
    home_matches = matches[(matches['home_team_api_id'] == home_team) & (matches['away_team_api_id'] == away_team)]    
    away_matches = matches[(matches['home_team_api_id'] == away_team) & (matches['away_team_api_id'] == home_team)]  
    total_matches = pd.concat([home_matches, away_matches])
    
    #Get last x matches
    try:    
        last_matches = total_matches[total_matches.date < date].sort_values(by = 'date', ascending = False).iloc[0:x,:]
    except:
        last_matches = total_matches[total_matches.date < date].sort_values(by = 'date', ascending = False).iloc[0:total_matches.shape[0],:]
        
        #Check for error in data
        if(last_matches.shape[0] > x):
            print("Error in obtaining matches")
            
    #Return data
    return last_matches

In [85]:
def convert_odds_to_prob(match_odds):
    ''' Converts bookkeeper odds to probabilities. '''
    
    #Define variables
    match_id = match_odds.loc[:,'match_api_id']
    bookkeeper = match_odds.loc[:,'bookkeeper']    
    win_odd = match_odds.loc[:,'Win']
    draw_odd = match_odds.loc[:,'Draw']
    loss_odd = match_odds.loc[:,'Defeat']
    
    #Converts odds to prob
    win_prob = 1 / win_odd
    draw_prob = 1 / draw_odd
    loss_prob = 1 / loss_odd
    
    total_prob = win_prob + draw_prob + loss_prob
    
    probs = pd.DataFrame()
    
    #Define output format and scale probs by sum over all probs
    probs.loc[:,'match_api_id'] = match_id
    probs.loc[:,'bookkeeper'] = bookkeeper
    probs.loc[:,'Win'] = win_prob / total_prob
    probs.loc[:,'Draw'] = draw_prob / total_prob
    probs.loc[:,'Defeat'] = loss_prob / total_prob
    
    #Return probs and meta data
    return probs

### Doing things now...

In [2]:
## Fetching data
#Connecting to database
path = "soccer_database.sqlite"  #Insert path here
conn = sqlite3.connect(path)

#Defining the number of jobs to be run in parallel during grid search
n_jobs = 2 #Insert number of parallel jobs here

#Fetching required data tables
player_data = pd.read_sql("SELECT * FROM Player;", conn)
player_stats_data = pd.read_sql("SELECT * FROM Player_Attributes;", conn)
team_data = pd.read_sql("SELECT * FROM Team;", conn)
match_data = pd.read_sql("SELECT * FROM Match where country_id = 1729;", conn)

match_data.head(5)

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1729,1729,1729,2008/2009,1,2008-08-17 00:00:00,489042,10260,10261,1,...,10.0,1.28,5.5,12.0,1.3,4.75,10.0,1.29,4.5,11.0
1,1730,1729,1729,2008/2009,1,2008-08-16 00:00:00,489043,9825,8659,1,...,12.0,1.25,6.0,13.0,1.22,5.5,13.0,1.22,5.0,13.0
2,1731,1729,1729,2008/2009,1,2008-08-16 00:00:00,489044,8472,8650,0,...,1.73,5.5,3.8,1.65,5.0,3.4,1.7,4.5,3.4,1.73
3,1732,1729,1729,2008/2009,1,2008-08-16 00:00:00,489045,8654,8528,2,...,3.75,1.9,3.5,4.35,1.91,3.25,4.0,1.91,3.25,3.8
4,1733,1729,1729,2008/2009,1,2008-08-17 00:00:00,489046,10252,8456,4,...,3.75,1.9,3.5,4.35,1.91,3.25,4.0,1.91,3.3,3.75


In [3]:
team_data.head(5)

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
0,1,9987,673.0,KRC Genk,GEN
1,2,9993,675.0,Beerschot AC,BAC
2,3,10000,15005.0,SV Zulte-Waregem,ZUL
3,4,9994,2007.0,Sporting Lokeren,LOK
4,5,9984,1750.0,KSV Cercle Brugge,CEB


In [4]:
player_data.head(5)

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
0,1,505942,Aaron Appindangoye,218353,1992-02-29 00:00:00,182.88,187
1,2,155782,Aaron Cresswell,189615,1989-12-15 00:00:00,170.18,146
2,3,162549,Aaron Doran,186170,1991-05-13 00:00:00,170.18,163
3,4,30572,Aaron Galindo,140161,1982-05-08 00:00:00,182.88,198
4,5,23780,Aaron Hughes,17725,1979-11-08 00:00:00,182.88,154


In [5]:
with_team_names = match_data.merge(
    team_data, 
    how='inner', 
    left_on=['home_team_api_id'], 
    right_on=['team_api_id'], 
    suffixes=('', '_home')
).merge(
    team_data, 
    how='inner', 
    left_on=['away_team_api_id'], 
    right_on=['team_api_id'], 
    suffixes=('', '_away')
)
with_team_names.head(5)

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,id_home,team_api_id,team_fifa_api_id,team_long_name,team_short_name,id_away,team_api_id_away,team_fifa_api_id_away,team_long_name_away,team_short_name_away
0,1729,1729,1729,2008/2009,1,2008-08-17 00:00:00,489042,10260,10261,1,...,3457,10260,11.0,Manchester United,MUN,3458,10261,13.0,Newcastle United,NEW
1,2494,1729,1729,2010/2011,1,2010-08-16 00:00:00,839802,10260,10261,3,...,3457,10260,11.0,Manchester United,MUN,3458,10261,13.0,Newcastle United,NEW
2,2913,1729,1729,2011/2012,13,2011-11-26 00:00:00,1024845,10260,10261,1,...,3457,10260,11.0,Manchester United,MUN,3458,10261,13.0,Newcastle United,NEW
3,3353,1729,1729,2012/2013,19,2012-12-26 00:00:00,1229300,10260,10261,4,...,3457,10260,11.0,Manchester United,MUN,3458,10261,13.0,Newcastle United,NEW
4,3693,1729,1729,2013/2014,15,2013-12-07 00:00:00,1474518,10260,10261,0,...,3457,10260,11.0,Manchester United,MUN,3458,10261,13.0,Newcastle United,NEW


In [17]:
import operator
import xml.etree as ET

def get_posession_things(data):
    last_value = 50
    try:
        tree = ET.fromstring(data)
        results = tree.findall('value')
        results_dict = dict((result.find('elapsed').text, result.find('homepos').text) for result in results)
        last_value = [value for key, value in sorted(results_dict.items(), key=operator.itemgetter(1), reverse=True)][0]
    except:
        pass
    
    return last_value

In [18]:
with_team_names.loc[:, 'home_possession'] = with_team_names.possession.apply(lambda x: get_posession_things(x))

In [19]:
with_team_names.loc[:, 'total_prob'] = 1./with_team_names.B365H + 1./with_team_names.B365D + 1./with_team_names.B365A
with_team_names.loc[:, 'pct_win_prob'] = (1./ with_team_names.B365H) / with_team_names.total_prob
with_team_names.head(5)

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,team_long_name,team_short_name,id_away,team_api_id_away,team_fifa_api_id_away,team_long_name_away,team_short_name_away,home_possession,total_prob,pct_win_prob
0,1729,1729,1729,2008/2009,1,2008-08-17 00:00:00,489042,10260,10261,1,...,Manchester United,MUN,3458,10261,13.0,Newcastle United,NEW,50,1.047921,0.739744
1,2494,1729,1729,2010/2011,1,2010-08-16 00:00:00,839802,10260,10261,3,...,Manchester United,MUN,3458,10261,13.0,Newcastle United,NEW,50,1.048485,0.763006
2,2913,1729,1729,2011/2012,13,2011-11-26 00:00:00,1024845,10260,10261,1,...,Manchester United,MUN,3458,10261,13.0,Newcastle United,NEW,50,1.054155,0.677591
3,3353,1729,1729,2012/2013,19,2012-12-26 00:00:00,1229300,10260,10261,4,...,Manchester United,MUN,3458,10261,13.0,Newcastle United,NEW,50,1.050441,0.780312
4,3693,1729,1729,2013/2014,15,2013-12-07 00:00:00,1474518,10260,10261,0,...,Manchester United,MUN,3458,10261,13.0,Newcastle United,NEW,50,1.027399,0.636165


## Work out how good each team is

In [60]:
HISTORY = 10
k_factor = 32
elo_width = 400.

def calculate_new_elos(rating_a, rating_b, score_a, k_factor, elo_width):
    """Calculates and returns the new Elo ratings for two players.
    score_a is 1 for a win by player A, 0 for a loss by player A, or 0.5 for a draw.
    """

    e_a = expected_result(rating_a, rating_b, elo_width)
    e_b = 1. - e_a
    new_rating_a = rating_a + k_factor * (score_a - e_a)
    score_b = 1. - score_a
    new_rating_b = rating_b + k_factor * (score_b - e_b)
    return new_rating_a, new_rating_b

def expected_result(elo_a, elo_b, elo_width):
    """
    https://en.wikipedia.org/wiki/Elo_rating_system#Mathematical_details
    """
    expect_a = 1.0/(1+10**((elo_b - elo_a)/elo_width))
    return expect_a

def update_end_of_season(elos):
    """Regression towards the mean
    
    Following 538 nfl methods
    https://fivethirtyeight.com/datalab/nfl-elo-ratings-are-back/
    """
    diff_from_mean = elos - np.mean(elos)
    elos -= diff_from_mean/3
    return elos

In [77]:
team_elo = with_team_names.team_long_name.drop_duplicates().reset_index()
team_elo.loc[:, 'elo'] = 1000
team_elo = team_elo[['team_long_name', 'elo']]
with_team_names.loc[:, 'other_expected'] = 0.5
team_elo.head()

Unnamed: 0,team_long_name,elo
0,Manchester United,1000
1,Arsenal,1000
2,Sunderland,1000
3,West Ham United,1000
4,Aston Villa,1000


In [28]:
with_team_names.sort_values(by='date').head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,team_long_name,team_short_name,id_away,team_api_id_away,team_fifa_api_id_away,team_long_name_away,team_short_name_away,home_possession,total_prob,pct_win_prob
1370,1737,1729,1729,2008/2009,1,2008-08-16 00:00:00,489050,8667,9879,2,...,Hull City,HUL,3474,9879,144.0,Fulham,FUL,50,1.054258,0.364821
661,1732,1729,1729,2008/2009,1,2008-08-16 00:00:00,489045,8654,8528,2,...,West Ham United,WHU,3464,8528,1917.0,Wigan Athletic,WIG,50,1.055773,0.495902
1194,1731,1729,1729,2008/2009,1,2008-08-16 00:00:00,489044,8472,8650,0,...,Sunderland,SUN,3462,8650,9.0,Liverpool,LIV,50,1.058398,0.171786
1643,1735,1729,1729,2008/2009,1,2008-08-16 00:00:00,489048,8549,8586,2,...,Middlesbrough,MID,3470,8586,18.0,Tottenham Hotspur,TOT,50,1.051062,0.297318
1083,1734,1729,1729,2008/2009,1,2008-08-16 00:00:00,489047,8668,8655,2,...,Everton,EVE,3468,8655,3.0,Blackburn Rovers,BLB,50,1.05303,0.47482


In [29]:
def calculate_result(row):
    if row.home_team_goal > row.away_team_goal:
        return 1
    elif row.home_team_goal == row.away_team_goal:
        return 0.5
    else:
        return 0

In [70]:
def create_predictions(team_elo, col_name, k_factor, elo_width):
    current_season = '2008/2009'
    for index, row in with_team_names.sort_values(by='date').iterrows():
        new_season = row.season
        if new_season != current_season:
            new_elos = update_end_of_season(team_elo.elo.values)
            team_elo.loc[:, 'elo'] = new_elos
            current_season = new_season

        home = row.team_long_name
        away = row.team_long_name_away
        home_elo = team_elo[team_elo.team_long_name == home].elo.values[0]
        away_elo = team_elo[team_elo.team_long_name == away].elo.values[0]
        expected_results = expected_result(home_elo, away_elo, elo_width)
        result = calculate_result(row)
        with_team_names.loc[index, col_name] = expected_results
        new_home_elo, new_away_elo = calculate_new_elos(home_elo, away_elo, result, k_factor, elo_width)
        team_elo.loc[team_elo.team_long_name == home, 'elo'] = new_home_elo
        team_elo.loc[team_elo.team_long_name == away, 'elo'] = new_away_elo

In [78]:
create_predictions(team_elo, 'other_expected', 64, 200)

In [79]:
team_elo.sort_values(by='elo', ascending=False).head()

Unnamed: 0,team_long_name,elo
27,Southampton,1134.317762
30,Leicester City,1128.367028
25,Swansea City,1079.377265
0,Manchester United,1068.527899
1,Arsenal,1065.703243


In [67]:
with_team_names.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,team_api_id_away,team_fifa_api_id_away,team_long_name_away,team_short_name_away,home_possession,total_prob,pct_win_prob,expected,resulty,other_expected
0,1729,1729,1729,2008/2009,1,2008-08-17 00:00:00,489042,10260,10261,1,...,10261,13.0,Newcastle United,NEW,50,1.047921,0.739744,0.5,0.5,0.5
1,2494,1729,1729,2010/2011,1,2010-08-16 00:00:00,839802,10260,10261,3,...,10261,13.0,Newcastle United,NEW,50,1.048485,0.763006,0.744386,1.0,0.744386
2,2913,1729,1729,2011/2012,13,2011-11-26 00:00:00,1024845,10260,10261,1,...,10261,13.0,Newcastle United,NEW,50,1.054155,0.677591,0.648628,0.5,0.648628
3,3353,1729,1729,2012/2013,19,2012-12-26 00:00:00,1229300,10260,10261,4,...,10261,13.0,Newcastle United,NEW,50,1.050441,0.780312,0.779633,1.0,0.779633
4,3693,1729,1729,2013/2014,15,2013-12-07 00:00:00,1474518,10260,10261,0,...,10261,13.0,Newcastle United,NEW,50,1.027399,0.636165,0.629053,0.0,0.629053


In [80]:
with_team_names[with_team_names.season == '2012/2013']

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,team_fifa_api_id_away,team_long_name_away,team_short_name_away,home_possession,total_prob,pct_win_prob,expected,resulty,other_expected,final_expected
3,3353,1729,1729,2012/2013,19,2012-12-26 00:00:00,1229300,10260,10261,4,...,13.0,Newcastle United,NEW,50,1.050441,0.780312,0.779633,1.0,0.904709,0.668747
10,3369,1729,1729,2012/2013,20,2012-12-29 00:00:00,1229306,9825,10261,7,...,13.0,Newcastle United,NEW,50,1.059524,0.655431,0.654674,1.0,0.785690,0.586200
17,3614,1729,1729,2012/2013,8,2012-10-21 00:00:00,1229179,8472,10261,1,...,13.0,Newcastle United,NEW,50,1.050253,0.380861,0.431984,0.5,0.493696,0.456731
23,3548,1729,1729,2012/2013,36,2013-05-04 00:00:00,1229492,8654,10261,0,...,13.0,Newcastle United,NEW,50,1.021978,0.411132,0.543230,0.5,0.752104,0.487306
30,3410,1729,1729,2012/2013,24,2013-01-29 00:00:00,1229351,10252,10261,1,...,13.0,Newcastle United,NEW,50,1.023561,0.336890,0.485101,0.0,0.583613,0.471259
37,3571,1729,1729,2012/2013,4,2012-09-17 00:00:00,1228331,8668,10261,2,...,13.0,Newcastle United,NEW,50,1.055812,0.547479,0.534850,0.5,0.634882,0.515422
51,3360,1729,1729,2012/2013,2,2012-08-25 00:00:00,1228291,8455,10261,2,...,13.0,Newcastle United,NEW,50,1.051460,0.634039,0.520867,1.0,0.536452,0.543280
58,3302,1729,1729,2012/2013,14,2012-11-28 00:00:00,1229242,10194,10261,2,...,13.0,Newcastle United,NEW,50,1.055773,0.495902,0.510324,1.0,0.751507,0.478501
65,3260,1729,1729,2012/2013,10,2012-11-04 00:00:00,1229197,8650,10261,1,...,13.0,Newcastle United,NEW,50,1.053609,0.604534,0.421464,0.5,0.438400,0.477118
72,3322,1729,1729,2012/2013,16,2012-12-10 00:00:00,1229265,9879,10261,2,...,13.0,Newcastle United,NEW,50,1.051338,0.497994,0.496220,1.0,0.478215,0.488579


In [51]:
with_team_names.loc[:, 'resulty'] = with_team_names.apply(lambda x: calculate_result(x), axis=1)
with_team_names.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,id_away,team_api_id_away,team_fifa_api_id_away,team_long_name_away,team_short_name_away,home_possession,total_prob,pct_win_prob,expected,resulty
0,1729,1729,1729,2008/2009,1,2008-08-17 00:00:00,489042,10260,10261,1,...,3458,10261,13.0,Newcastle United,NEW,50,1.047921,0.739744,0.5,0.5
1,2494,1729,1729,2010/2011,1,2010-08-16 00:00:00,839802,10260,10261,3,...,3458,10261,13.0,Newcastle United,NEW,50,1.048485,0.763006,0.744386,1.0
2,2913,1729,1729,2011/2012,13,2011-11-26 00:00:00,1024845,10260,10261,1,...,3458,10261,13.0,Newcastle United,NEW,50,1.054155,0.677591,0.648628,0.5
3,3353,1729,1729,2012/2013,19,2012-12-26 00:00:00,1229300,10260,10261,4,...,3458,10261,13.0,Newcastle United,NEW,50,1.050441,0.780312,0.779633,1.0
4,3693,1729,1729,2013/2014,15,2013-12-07 00:00:00,1474518,10260,10261,0,...,3458,10261,13.0,Newcastle United,NEW,50,1.027399,0.636165,0.629053,0.0


In [74]:
from sklearn.metrics import mean_squared_error

mean_squared_error(with_team_names.resulty, with_team_names.pct_win_prob)

0.16656785679413874

In [75]:
mean_squared_error(with_team_names.resulty, with_team_names.expected)

0.16245081868874653

In [76]:
mean_squared_error(with_team_names.resulty, with_team_names.final_expected)

0.16546206236139085

In [81]:
mean_squared_error(with_team_names.resulty, with_team_names.other_expected)

0.18089134019391082

In [93]:
with_team_names.loc[:, 'label'] = with_team_names.apply(lambda x: get_match_label(x), axis=1)
with_team_names.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,team_long_name_away,team_short_name_away,home_possession,total_prob,pct_win_prob,expected,resulty,other_expected,final_expected,label
0,1729,1729,1729,2008/2009,1,2008-08-17 00:00:00,489042,10260,10261,1,...,Newcastle United,NEW,50,1.047921,0.739744,0.5,0.5,0.5,0.5,Draw
1,2494,1729,1729,2010/2011,1,2010-08-16 00:00:00,839802,10260,10261,3,...,Newcastle United,NEW,50,1.048485,0.763006,0.744386,1.0,0.863552,0.638373,Win
2,2913,1729,1729,2011/2012,13,2011-11-26 00:00:00,1024845,10260,10261,1,...,Newcastle United,NEW,50,1.054155,0.677591,0.648628,0.5,0.616338,0.62591,Draw
3,3353,1729,1729,2012/2013,19,2012-12-26 00:00:00,1229300,10260,10261,4,...,Newcastle United,NEW,50,1.050441,0.780312,0.779633,1.0,0.904709,0.668747,Win
4,3693,1729,1729,2013/2014,15,2013-12-07 00:00:00,1474518,10260,10261,0,...,Newcastle United,NEW,50,1.027399,0.636165,0.629053,0.0,0.466635,0.626356,Defeat


In [96]:
import datetime
get_last_matches_against_eachother(with_team_names, '2013-07-01', 10260, 10261, x = 25)

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,team_long_name_away,team_short_name_away,home_possession,total_prob,pct_win_prob,expected,resulty,other_expected,final_expected,label
3,3353,1729,1729,2012/2013,19,2012-12-26 00:00:00,1229300,10260,10261,4,...,Newcastle United,NEW,50,1.050441,0.780312,0.779633,1.0,0.904709,0.668747,Win
2963,3602,1729,1729,2012/2013,7,2012-10-07 00:00:00,1229165,10261,10260,0,...,Manchester United,MUN,50,1.05121,0.226496,0.374462,0.0,0.357479,0.39682,Defeat
2962,2994,1729,1729,2011/2012,20,2012-01-04 00:00:00,1025212,10261,10260,3,...,Manchester United,MUN,50,1.055812,0.189428,0.27815,1.0,0.162293,0.345183,Win
2,2913,1729,1729,2011/2012,13,2011-11-26 00:00:00,1024845,10260,10261,1,...,Newcastle United,NEW,50,1.054155,0.677591,0.648628,0.5,0.616338,0.62591,Draw
2961,2753,1729,1729,2010/2011,33,2011-04-19 00:00:00,840178,10261,10260,0,...,Manchester United,MUN,50,1.058398,0.171786,0.220524,0.5,0.142241,0.305839,Draw
1,2494,1729,1729,2010/2011,1,2010-08-16 00:00:00,839802,10260,10261,3,...,Newcastle United,NEW,50,1.048485,0.763006,0.744386,1.0,0.863552,0.638373,Win
2960,1935,1729,1729,2008/2009,28,2009-03-04 00:00:00,489319,10261,10260,1,...,Manchester United,MUN,50,1.05,0.126984,0.20518,0.0,0.059601,0.365876,Defeat
0,1729,1729,1729,2008/2009,1,2008-08-17 00:00:00,489042,10260,10261,1,...,Newcastle United,NEW,50,1.047921,0.739744,0.5,0.5,0.5,0.5,Draw


In [151]:
def get_home_recent_score(match, all_matches):
    last_matches = get_last_matches(
        all_matches,
        match.date,
        match.home_team_api_id,
        x = 10
    )
    score = calculate_team_score(last_matches, match)
    return score

def get_away_recent_score(match, all_matches):
    last_matches = get_last_matches(
        all_matches,
        match.date,
        match.away_team_api_id,
        x = 10
    )
    score = calculate_away_team_score(last_matches, match)
    return score

def get_team_score(match, all_matches):
    last_matches = get_last_matches_against_eachother(
        all_matches,
        match.date,
        match.home_team_api_id,
        match.away_team_api_id,
        x = 25
    )
    score = calculate_team_score(last_matches, match)
    return score

def calculate_team_score(matches, this_match):
    home_matches = matches[matches.home_team_api_id == this_match.home_team_api_id]
    away_matches = matches[matches.away_team_api_id == this_match.home_team_api_id]
    home_offensive_score, home_defensive_score = get_home_score(home_matches)
    away_offensive_score, away_defensive_score = get_away_score(away_matches)
    return (home_offensive_score, home_defensive_score, away_offensive_score, away_defensive_score)

def calculate_away_team_score(matches, this_match):
    home_matches = matches[matches.home_team_api_id == this_match.away_team_api_id]
    away_matches = matches[matches.away_team_api_id == this_match.away_team_api_id]
    home_offensive_score, home_defensive_score = get_home_score(home_matches)
    away_offensive_score, away_defensive_score = get_away_score(away_matches)
    return (home_offensive_score, home_defensive_score, away_offensive_score, away_defensive_score)

def get_home_score(matches):
    total_offensive_score = 0
    total_defensive_score = 0
    for index, match in enumerate(matches.itertuples(index=False, name='Pandas')):
        offensive_score = match.home_team_goal / (index + 1)
        defensive_score = match.away_team_goal / (index + 1)
        total_offensive_score += offensive_score
        total_defensive_score -= defensive_score
    return (total_offensive_score, total_defensive_score)

def get_away_score(matches):
    total_offensive_score = 0
    total_defensive_score = 0
    for index, match in enumerate(matches.itertuples(index=False, name='Pandas')):
        offensive_score = match.away_team_goal / (index + 1)
        defensive_score = match.home_team_goal / (index + 1)
        total_offensive_score += offensive_score
        total_defensive_score -= defensive_score
    return (total_offensive_score, total_defensive_score)

In [132]:
good = with_team_names.apply(lambda x: list(get_team_score(x, with_team_names)), axis=1)

In [135]:
match_stats = pd.DataFrame(good.tolist(), columns = ['home_offensive', 'home_defensive', 'away_offensive', 'away_defensive'])
match_stats.head()

Unnamed: 0,home_offensive,home_defensive,away_offensive,away_defensive
0,0.0,0.0,0.0,0.0
1,1.0,-1.0,2.0,-1.0
2,3.5,-0.5,1.0,-0.5
3,2.833333,-1.333333,3.5,-1.75
4,5.75,-3.75,3.5,-1.75


In [140]:
new_frame = pd.concat([with_team_names, match_stats], axis=1)
new_frame.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,pct_win_prob,expected,resulty,other_expected,final_expected,label,home_offensive,home_defensive,away_offensive,away_defensive
0,1729,1729,1729,2008/2009,1,2008-08-17 00:00:00,489042,10260,10261,1,...,0.739744,0.5,0.5,0.5,0.5,Draw,0.0,0.0,0.0,0.0
1,2494,1729,1729,2010/2011,1,2010-08-16 00:00:00,839802,10260,10261,3,...,0.763006,0.744386,1.0,0.863552,0.638373,Win,1.0,-1.0,2.0,-1.0
2,2913,1729,1729,2011/2012,13,2011-11-26 00:00:00,1024845,10260,10261,1,...,0.677591,0.648628,0.5,0.616338,0.62591,Draw,3.5,-0.5,1.0,-0.5
3,3353,1729,1729,2012/2013,19,2012-12-26 00:00:00,1229300,10260,10261,4,...,0.780312,0.779633,1.0,0.904709,0.668747,Win,2.833333,-1.333333,3.5,-1.75
4,3693,1729,1729,2013/2014,15,2013-12-07 00:00:00,1474518,10260,10261,0,...,0.636165,0.629053,0.0,0.466635,0.626356,Defeat,5.75,-3.75,3.5,-1.75


In [142]:
home_mebs = with_team_names.apply(lambda x: list(get_home_recent_score(x, with_team_names)), axis=1)

In [144]:
home_match_stats = pd.DataFrame(home_mebs.tolist(), columns = ['home_recent_offensive', 'home_recent_defensive', 'away_recent_offensive', 'away_recent_defensive'])
home_match_stats.head()

Unnamed: 0,home_recent_offensive,home_recent_defensive,away_recent_offensive,away_recent_defensive
0,0.0,0.0,0.0,0.0
1,6.933333,-1.416667,2.7,0.0
2,4.516667,-3.65,3.083333,-0.583333
3,5.0,-1.583333,4.933333,-3.983333
4,1.95,-2.316667,4.7,-4.383333


In [152]:
away_mebs = with_team_names.apply(lambda x: list(get_away_recent_score(x, with_team_names)), axis=1)
away_match_stats = pd.DataFrame(away_mebs.tolist(), columns = ['home_recent_offensive_away', 'home_recent_defensive_away', 'away_recent_offensive_away', 'away_recent_defensive_away'])
away_match_stats.head()

Unnamed: 0,home_recent_offensive_away,home_recent_defensive_away,away_recent_offensive_away,away_recent_defensive_away
0,0.0,0.0,0.0,0.0
1,1.7,-2.6,0.45,-3.283333
2,4.316667,-2.116667,3.416667,-4.083333
3,3.083333,-2.366667,1.75,-3.916667
4,4.566667,-2.6,1.733333,-4.516667


In [153]:
final_frame = pd.concat([new_frame, home_match_stats, away_match_stats], axis=1)
final_frame.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,away_offensive,away_defensive,home_recent_offensive,home_recent_defensive,away_recent_offensive,away_recent_defensive,home_recent_offensive_away,home_recent_defensive_away,away_recent_offensive_away,away_recent_defensive_away
0,1729,1729,1729,2008/2009,1,2008-08-17 00:00:00,489042,10260,10261,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2494,1729,1729,2010/2011,1,2010-08-16 00:00:00,839802,10260,10261,3,...,2.0,-1.0,6.933333,-1.416667,2.7,0.0,1.7,-2.6,0.45,-3.283333
2,2913,1729,1729,2011/2012,13,2011-11-26 00:00:00,1024845,10260,10261,1,...,1.0,-0.5,4.516667,-3.65,3.083333,-0.583333,4.316667,-2.116667,3.416667,-4.083333
3,3353,1729,1729,2012/2013,19,2012-12-26 00:00:00,1229300,10260,10261,4,...,3.5,-1.75,5.0,-1.583333,4.933333,-3.983333,3.083333,-2.366667,1.75,-3.916667
4,3693,1729,1729,2013/2014,15,2013-12-07 00:00:00,1474518,10260,10261,0,...,3.5,-1.75,1.95,-2.316667,4.7,-4.383333,4.566667,-2.6,1.733333,-4.516667


In [155]:
final_frame.columns.values

array(['id', 'country_id', 'league_id', 'season', 'stage', 'date',
       'match_api_id', 'home_team_api_id', 'away_team_api_id',
       'home_team_goal', 'away_team_goal', 'home_player_X1',
       'home_player_X2', 'home_player_X3', 'home_player_X4',
       'home_player_X5', 'home_player_X6', 'home_player_X7',
       'home_player_X8', 'home_player_X9', 'home_player_X10',
       'home_player_X11', 'away_player_X1', 'away_player_X2',
       'away_player_X3', 'away_player_X4', 'away_player_X5',
       'away_player_X6', 'away_player_X7', 'away_player_X8',
       'away_player_X9', 'away_player_X10', 'away_player_X11',
       'home_player_Y1', 'home_player_Y2', 'home_player_Y3',
       'home_player_Y4', 'home_player_Y5', 'home_player_Y6',
       'home_player_Y7', 'home_player_Y8', 'home_player_Y9',
       'home_player_Y10', 'home_player_Y11', 'away_player_Y1',
       'away_player_Y2', 'away_player_Y3', 'away_player_Y4',
       'away_player_Y5', 'away_player_Y6', 'away_player_Y7',
       'aw

In [157]:
X = final_frame[['expected', 'other_expected',
       'final_expected', 'home_offensive', 'home_defensive',
       'away_offensive', 'away_defensive', 'home_recent_offensive',
       'home_recent_defensive', 'away_recent_offensive',
       'away_recent_defensive', 'home_recent_offensive_away',
       'home_recent_defensive_away', 'away_recent_offensive_away',
       'away_recent_defensive_away']].values
y = final_frame.label.values

In [159]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

lr.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [160]:
lr.predict(X)

array(['Win', 'Win', 'Win', ..., 'Win', 'Defeat', 'Defeat'], dtype=object)

In [161]:
lr.predict_proba(X)

array([[0.22983688, 0.2402297 , 0.52993343],
       [0.09988525, 0.15518307, 0.74493168],
       [0.14176646, 0.22395911, 0.63427443],
       ...,
       [0.35301965, 0.23931507, 0.40766528],
       [0.46162257, 0.26087584, 0.27750159],
       [0.40159702, 0.26411733, 0.33428564]])

In [163]:
lr.score(X, y)

0.5190789473684211

In [165]:
lr.predict_proba(X)[-5:]

array([[0.4529658 , 0.30747051, 0.23956369],
       [0.38387643, 0.26157754, 0.35454604],
       [0.35301965, 0.23931507, 0.40766528],
       [0.46162257, 0.26087584, 0.27750159],
       [0.40159702, 0.26411733, 0.33428564]])

In [166]:
y[-5:]

array(['Draw', 'Win', 'Draw', 'Win', 'Defeat'], dtype=object)

In [169]:
final_frame[['home_team_goal', 'away_team_goal', 'team_long_name', 'team_long_name_away']].tail()

Unnamed: 0,home_team_goal,away_team_goal,team_long_name,team_long_name_away
3035,2,2,Cardiff City,Manchester United
3036,5,3,Leicester City,Manchester United
3037,1,1,Leicester City,Manchester United
3038,2,1,Bournemouth,Manchester United
3039,1,2,Watford,Manchester United


In [173]:
final_frame.loc[:, 'pct_draw_prob'] = (1./ final_frame.B365D) / final_frame.total_prob
final_frame.loc[:, 'pct_lose_prob'] = (1./ final_frame.B365A) / final_frame.total_prob
final_frame[['pct_win_prob','pct_draw_prob', 'pct_lose_prob']].head()

Unnamed: 0,pct_win_prob,pct_draw_prob,pct_lose_prob
0,0.739744,0.173504,0.086752
1,0.763006,0.17341,0.063584
2,0.677591,0.210806,0.111603
3,0.780312,0.146459,0.073229
4,0.636165,0.224788,0.139047


In [181]:
lr.predict_proba(X)[-5:]

array([[0.4529658 , 0.30747051, 0.23956369],
       [0.38387643, 0.26157754, 0.35454604],
       [0.35301965, 0.23931507, 0.40766528],
       [0.46162257, 0.26087584, 0.27750159],
       [0.40159702, 0.26411733, 0.33428564]])

In [182]:
final_frame[['pct_win_prob', 'pct_draw_prob', 'pct_lose_prob']].tail()

Unnamed: 0,pct_win_prob,pct_draw_prob,pct_lose_prob
3035,0.162963,0.244444,0.592593
3036,0.178478,0.233721,0.587801
3037,0.314725,0.278756,0.406519
3038,0.2875,0.2875,0.425
3039,0.2253,0.286926,0.487774


In [192]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

ohe = OneHotEncoder()
le = LabelEncoder()
better = ohe.fit_transform(le.fit_transform(y).reshape(-1, 1))

In [199]:
from sklearn.metrics import log_loss

log_loss(better.toarray(), lr.predict_proba(X))

0.9905079832393577

In [200]:
log_loss(better.toarray(), final_frame[['pct_win_prob', 'pct_draw_prob', 'pct_lose_prob']].values)

1.4436242890429998

In [201]:
from sklearn.model_selection import ShuffleSplit

rs = ShuffleSplit(n_splits=3, test_size=.2, random_state=0)
for train_index, test_index in rs.split(X):
    train_x = X[train_index]
    train_y = y[train_index]
    test_x = X[test_index]
    test_y = y[test_index]
    lr = LogisticRegression()
    lr.fit(train_x, train_y)
    
    ohe = OneHotEncoder()
    le = LabelEncoder()
    better = ohe.fit_transform(le.fit_transform(train_y).reshape(-1, 1))
    
    print("Train loss: {0:.3f}".format(log_loss(better.toarray(), lr.predict_proba(train_x))))
    
    test_better = ohe.transform(le.transform(test_y).reshape(-1, 1))
    
    print("Test loss: {0:.3f}".format(log_loss(test_better.toarray(), lr.predict_proba(test_x))))

Train loss: 0.992
Test loss: 0.992
Train loss: 0.997
Test loss: 0.970
Train loss: 0.984
Test loss: 1.021


In [229]:
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import ShuffleSplit

rs = ShuffleSplit(n_splits=3, test_size=.2, random_state=0)
for train_index, test_index in rs.split(X):
    train_x = X[train_index]
    train_y = y[train_index]
    test_x = X[test_index]
    test_y = y[test_index]
    lr = LogisticRegression()
    #pf = PolynomialFeatures()
    #new_train_x = pf.fit_transform(train_x)
    #new_test_x = pf.transform(test_x)
    lr.fit(train_x, train_y)
    
    ohe = OneHotEncoder()
    le = LabelEncoder()
    better = ohe.fit_transform(le.fit_transform(train_y).reshape(-1, 1))
    
    print("Train loss: {0:.3f}".format(log_loss(better.toarray(), lr.predict_proba(train_x))))
    
    test_better = ohe.transform(le.transform(test_y).reshape(-1, 1))
    
    print("Test loss: {0:.3f}".format(log_loss(test_better.toarray(), lr.predict_proba(test_x))))

Train loss: 0.992
Test loss: 0.992
Train loss: 0.997
Test loss: 0.970
Train loss: 0.984
Test loss: 1.021


In [237]:
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import ShuffleSplit

from sklearn.ensemble import RandomForestClassifier

rs = ShuffleSplit(n_splits=3, test_size=.1, random_state=382)
for train_index, test_index in rs.split(X):
    train_x = X[train_index]
    train_y = y[train_index]
    test_x = X[test_index]
    test_y = y[test_index]
    rf = RandomForestClassifier(max_depth=3, min_samples_split=3, min_samples_leaf=2)
    pf = PolynomialFeatures()
    new_train_x = pf.fit_transform(train_x)
    new_test_x = pf.transform(test_x)
    rf.fit(new_train_x, train_y)
    
    ohe = OneHotEncoder()
    le = LabelEncoder()
    better = ohe.fit_transform(le.fit_transform(train_y).reshape(-1, 1))
    
    print("Train loss: {0:.3f}".format(log_loss(better.toarray(), rf.predict_proba(new_train_x))))
    
    test_better = ohe.transform(le.transform(test_y).reshape(-1, 1))
    
    print("Test loss: {0:.3f}".format(log_loss(test_better.toarray(), rf.predict_proba(new_test_x))))

Train loss: 0.975
Test loss: 1.021
Train loss: 0.971
Test loss: 1.027
Train loss: 0.976
Test loss: 0.998


In [227]:
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import ShuffleSplit

from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import GradientBoostingClassifier

rs = ShuffleSplit(n_splits=3, test_size=.1, random_state=18)
for train_index, test_index in rs.split(X):
    train_x = X[train_index]
    train_y = y[train_index]
    test_x = X[test_index]
    test_y = y[test_index]
    gb = GradientBoostingClassifier(subsample=0.65, min_samples_leaf=3, min_samples_split=4)
    #pf = PolynomialFeatures(interaction_only=True, include_bias=False)
    #new_train_x = pf.fit_transform(train_x)
    #new_test_x = pf.transform(test_x)
    gb.fit(train_x, train_y)
    
    ohe = OneHotEncoder()
    le = LabelEncoder()
    better = ohe.fit_transform(le.fit_transform(train_y).reshape(-1, 1))
    
    print("Train loss: {0:.3f}".format(log_loss(better.toarray(), gb.predict_proba(train_x))))
    
    test_better = ohe.transform(le.transform(test_y).reshape(-1, 1))
    
    print("Test loss: {0:.3f}".format(log_loss(test_better.toarray(), gb.predict_proba(test_x))))

Train loss: 0.780
Test loss: 0.964
Train loss: 0.778
Test loss: 0.976
Train loss: 0.773
Test loss: 1.010


In [239]:
for label, coef in zip(['bias', 'expected', 'other_expected',
       'final_expected', 'home_offensive', 'home_defensive',
       'away_offensive', 'away_defensive', 'home_recent_offensive',
       'home_recent_defensive', 'away_recent_offensive',
       'away_recent_defensive', 'home_recent_offensive_away',
       'home_recent_defensive_away', 'away_recent_offensive_away',
       'away_recent_defensive_away'], lr.coef_[0]):
    print("{0} : {1:.2f}".format(label, coef))

bias : -3.26
expected : 0.75
other_expected : -2.58
final_expected : 0.01
home_offensive : -0.06
home_defensive : -0.07
away_offensive : -0.01
away_defensive : -0.01
home_recent_offensive : -0.02
home_recent_defensive : -0.05
away_recent_offensive : -0.00
away_recent_defensive : 0.05
home_recent_offensive_away : 0.01
home_recent_defensive_away : -0.00
away_recent_offensive_away : 0.01


In [242]:
lr.predict_proba(X)[:5]

array([[0.25491503, 0.22298487, 0.5221001 ],
       [0.10223591, 0.15280609, 0.744958  ],
       [0.14887169, 0.21451982, 0.63660849],
       [0.0836525 , 0.19070104, 0.72564646],
       [0.14334423, 0.25103377, 0.605622  ]])

In [243]:
final_frame[['pct_win_prob', 'pct_draw_prob', 'pct_lose_prob']].values[:5]

array([[0.73974445, 0.1735037 , 0.08675185],
       [0.76300578, 0.1734104 , 0.06358382],
       [0.67759079, 0.21080602, 0.11160319],
       [0.78031212, 0.14645858, 0.07322929],
       [0.63616463, 0.22478796, 0.13904741]])

In [297]:
draw_predictions = final_frame[(lr.predict_proba(X)[:, 1] - final_frame.pct_draw_prob ) <= 0.]

In [298]:
BET = 1
RUNNING_TOTAL = 0
CORRECT_COUNT = 0

for index, row in draw_predictions.sort_values(by='date').iterrows():
    odds = row.B365D
    result = row.label
    if result == 'Draw':
        CORRECT_COUNT += 1
        RUNNING_TOTAL += odds*BET
    else:
        RUNNING_TOTAL -= BET

In [299]:
print("Money made: {0:.2f}".format(RUNNING_TOTAL))
print("Correct count: {0}".format(CORRECT_COUNT))

Money made: 328.65
Correct count: 442


In [278]:
final_frame.sort_values(by='B365D').head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,home_recent_offensive,home_recent_defensive,away_recent_offensive,away_recent_defensive,home_recent_offensive_away,home_recent_defensive_away,away_recent_offensive_away,away_recent_defensive_away,pct_draw_prob,pct_lose_prob
1131,2805,1729,1729,2010/2011,38,2011-05-22 00:00:00,840231,8602,8655,2,...,5.05,-3.25,3.95,-3.5,2.15,-1.983333,1.7,-3.55,0.316092,0.252874
2952,2049,1729,1729,2008/2009,4,2008-09-13 00:00:00,489072,8650,10260,2,...,2.0,-1.0,0.5,0.0,1.0,-1.0,1.0,0.0,0.317073,0.365854
19,4256,1729,1729,2014/2015,31,2015-04-05 00:00:00,1724289,8472,10261,1,...,0.5,-4.866667,0.95,-2.816667,1.783333,-3.15,1.083333,-6.233333,0.308642,0.308642
1491,4180,1729,1729,2014/2015,25,2015-02-10 00:00:00,1724223,8667,10252,2,...,1.0,-3.583333,1.75,-3.483333,1.533333,-3.366667,0.0,-6.083333,0.313653,0.277807
1037,2844,1729,1729,2010/2011,7,2010-10-02 00:00:00,839879,8658,8668,0,...,1.166667,-0.583333,2.916667,-5.366667,2.483333,-3.033333,0.6,-1.233333,0.307211,0.362111


In [281]:
with_team_names[['team_long_name', 'team_long_name_away', 'season', 'home_team_goal', 'away_team_goal', 'B365H', 'B365D', 'B365A']].sort_values(by='B365D').head()

Unnamed: 0,team_long_name,team_long_name_away,season,home_team_goal,away_team_goal,B365H,B365D,B365A
1131,Wolverhampton Wanderers,Blackburn Rovers,2010/2011,2,3,2.2,3.0,3.75
2952,Liverpool,Manchester United,2008/2009,2,1,3.0,3.0,2.6
19,Sunderland,Newcastle United,2014/2015,1,0,2.5,3.1,3.1
1491,Hull City,Aston Villa,2014/2015,2,0,2.38,3.1,3.5
1037,Birmingham City,Everton,2010/2011,0,2,2.88,3.1,2.63


In [282]:
with_team_names[['team_long_name', 'team_long_name_away', 'season', 'home_team_goal', 'away_team_goal', 'B365H', 'B365D', 'B365A']].sort_values(by='B365D', ascending=False).head()

Unnamed: 0,team_long_name,team_long_name_away,season,home_team_goal,away_team_goal,B365H,B365D,B365A
2750,Manchester City,Cardiff City,2013/2014,4,2,1.13,11.0,23.0
683,Chelsea,Wigan Athletic,2009/2010,8,0,1.1,10.0,23.0
2794,Manchester City,Crystal Palace,2013/2014,1,0,1.13,9.5,29.0
1551,Manchester City,Aston Villa,2013/2014,4,0,1.14,9.5,21.0
601,Manchester City,Sunderland,2013/2014,2,2,1.17,9.0,17.0


In [284]:
final_frame.groupby('label').size() / final_frame.shape[0]

label
Defeat    0.285197
Draw      0.257566
Win       0.457237
dtype: float64

In [286]:
final_frame.pct_draw_prob.describe()

count    3040.000000
mean        0.253598
std         0.044537
min         0.089184
25%         0.233301
50%         0.270977
75%         0.287367
max         0.317181
Name: pct_draw_prob, dtype: float64

In [296]:
final_frame.loc[:, 'total_odds'] = final_frame[['B365H', 'B365D', 'B365A']].sum(axis=1)
final_frame.sort_values(by='total_odds', ascending=True).head()[['team_long_name', 'team_long_name_away', 'season', 'home_team_goal', 'away_team_goal', 'B365H', 'B365D', 'B365A']]

Unnamed: 0,team_long_name,team_long_name_away,season,home_team_goal,away_team_goal,B365H,B365D,B365A
2793,West Bromwich Albion,Crystal Palace,2015/2016,3,2,2.6,3.1,2.8
2519,Newcastle United,Swansea City,2013/2014,1,2,2.63,3.3,2.6
903,Southampton,Chelsea,2015/2016,1,2,2.8,3.25,2.5
2492,Aston Villa,Swansea City,2014/2015,0,1,2.5,3.25,2.8
1012,Portsmouth,Everton,2008/2009,2,1,2.75,3.2,2.62


In [300]:
final_frame.groupby('season').size()

season
2008/2009    380
2009/2010    380
2010/2011    380
2011/2012    380
2012/2013    380
2013/2014    380
2014/2015    380
2015/2016    380
dtype: int64