In [64]:
import numpy as np
import pandas as pd
from sklearn.model_selection import  train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

## Extract Premier League Soccer Data

In [65]:
premier_results = pd.read_csv("premier_data/premier_matches.csv")

  premier_results = pd.read_csv("premier_data/premier_matches.csv")


#### Clean/Encode Data!

In [66]:
# drop betting odds
cols_to_drop = [
    "C_HTB", "MaxOver25", "MaxUnder25", "HandiSize", "HandiHome",
    "HandiAway", "C_LTH", "C_LTA", "C_VHD", "C_VAD", "C_PHB",
    "OddHome",	"OddDraw",	"OddAway",	"MaxHome",	"MaxDraw",	"MaxAway",	"Over25",	"Under25",
    "MatchTime" # and some odd TIME cols
]
premier_results = premier_results.drop(columns=cols_to_drop)

In [67]:
# Grab PREMIER LEAGUE data
premier_results = premier_results[premier_results['Division'] == 'E0'].copy()
premier_results.head(20)

Unnamed: 0,Division,MatchDate,HomeTeam,AwayTeam,HomeElo,AwayElo,Form3Home,Form5Home,Form3Away,Form5Away,FTHome,FTAway,FTResult,HTHome,HTAway,HTResult,HomeShots,AwayShots,HomeTarget,AwayTarget,HomeFouls,AwayFouls,HomeCorners,AwayCorners,HomeYellow,AwayYellow,HomeRed,AwayRed
154,E0,2000-08-19,Charlton,Man City,1608.77,1579.99,0.0,0.0,0.0,0.0,4.0,0.0,H,2.0,0.0,H,17.0,8.0,14.0,4.0,13.0,12.0,6.0,6.0,1.0,2.0,0.0,0.0
155,E0,2000-08-19,Chelsea,West Ham,1800.17,1681.36,0.0,0.0,0.0,0.0,4.0,2.0,H,1.0,0.0,H,17.0,12.0,10.0,5.0,19.0,14.0,7.0,7.0,1.0,2.0,0.0,0.0
156,E0,2000-08-19,Coventry,Middlesbrough,1635.61,1679.18,0.0,0.0,0.0,0.0,1.0,3.0,A,1.0,1.0,D,6.0,16.0,3.0,9.0,15.0,21.0,8.0,4.0,5.0,3.0,1.0,0.0
157,E0,2000-08-19,Derby,Southampton,1636.08,1630.02,0.0,0.0,0.0,0.0,2.0,2.0,D,1.0,2.0,A,6.0,13.0,4.0,6.0,11.0,13.0,5.0,8.0,1.0,1.0,0.0,0.0
158,E0,2000-08-19,Leeds,Everton,1782.55,1685.55,0.0,0.0,0.0,0.0,2.0,0.0,H,2.0,0.0,H,17.0,12.0,8.0,6.0,21.0,20.0,6.0,4.0,1.0,3.0,0.0,0.0
159,E0,2000-08-19,Leicester,Aston Villa,1687.89,1728.2,0.0,0.0,0.0,0.0,0.0,0.0,D,0.0,0.0,D,5.0,5.0,4.0,3.0,12.0,12.0,5.0,4.0,2.0,3.0,0.0,0.0
160,E0,2000-08-19,Liverpool,Bradford,1770.15,1578.72,0.0,0.0,0.0,0.0,1.0,0.0,H,0.0,0.0,D,16.0,3.0,10.0,2.0,8.0,8.0,6.0,1.0,1.0,1.0,0.0,0.0
161,E0,2000-08-19,Sunderland,Arsenal,1712.13,1871.71,0.0,0.0,0.0,0.0,1.0,0.0,H,0.0,0.0,D,8.0,14.0,2.0,7.0,10.0,21.0,2.0,9.0,3.0,1.0,0.0,1.0
162,E0,2000-08-19,Tottenham,Ipswich,1676.8,1638.08,0.0,0.0,0.0,0.0,3.0,1.0,H,2.0,1.0,H,20.0,15.0,6.0,5.0,14.0,13.0,3.0,4.0,0.0,0.0,0.0,0.0
220,E0,2000-08-20,Man United,Newcastle,1931.95,1712.19,0.0,0.0,0.0,0.0,2.0,0.0,H,1.0,0.0,H,19.0,9.0,9.0,6.0,7.0,13.0,7.0,1.0,0.0,1.0,0.0,0.0


In [68]:
# Extract only Year from datetime conversion object
premier_results['MatchDate'] = pd.to_datetime(premier_results['MatchDate']).dt.year

In [69]:
# Pull recent DATA (-20 years)
premier_results = premier_results[premier_results['MatchDate'] >= 2005]
premier_results = premier_results[premier_results['MatchDate'] <= 2024]

In [70]:
# Grab ONLY Premier League Data - Division Code = E0
premier_results = premier_results[premier_results['Division'] == 'E0']
pd.set_option('display.max_columns', None)
premier_results.head(15)

Unnamed: 0,Division,MatchDate,HomeTeam,AwayTeam,HomeElo,AwayElo,Form3Home,Form5Home,Form3Away,Form5Away,FTHome,FTAway,FTResult,HTHome,HTAway,HTResult,HomeShots,AwayShots,HomeTarget,AwayTarget,HomeFouls,AwayFouls,HomeCorners,AwayCorners,HomeYellow,AwayYellow,HomeRed,AwayRed
24809,E0,2005,Aston Villa,Blackburn,1709.45,1663.23,0.0,1.0,5.0,6.0,1.0,0.0,H,0.0,0.0,D,11.0,9.0,3.0,4.0,9.0,9.0,6.0,7.0,1.0,0.0,0.0,0.0
24810,E0,2005,Bolton,West Brom,1674.83,1537.66,0.0,0.0,1.0,1.0,1.0,1.0,D,0.0,1.0,A,25.0,11.0,15.0,7.0,13.0,15.0,7.0,3.0,0.0,1.0,0.0,0.0
24811,E0,2005,Charlton,Arsenal,1703.99,1913.54,7.0,13.0,9.0,13.0,1.0,3.0,A,1.0,1.0,D,7.0,8.0,4.0,5.0,13.0,15.0,1.0,2.0,1.0,1.0,0.0,0.0
24812,E0,2005,Fulham,Crystal Palace,1658.58,1570.47,0.0,4.0,1.0,2.0,3.0,1.0,H,1.0,1.0,D,12.0,7.0,6.0,3.0,11.0,18.0,4.0,2.0,1.0,3.0,0.0,0.0
24813,E0,2005,Liverpool,Chelsea,1807.79,1919.25,9.0,10.0,9.0,13.0,0.0,1.0,A,0.0,0.0,D,14.0,8.0,7.0,7.0,14.0,16.0,3.0,2.0,2.0,2.0,0.0,0.0
24814,E0,2005,Man City,Southampton,1684.93,1627.79,4.0,4.0,1.0,2.0,2.0,1.0,H,2.0,0.0,H,15.0,6.0,8.0,1.0,12.0,9.0,9.0,8.0,0.0,0.0,0.0,0.0
24815,E0,2005,Middlesbrough,Man United,1749.94,1875.94,6.0,10.0,9.0,13.0,0.0,2.0,A,0.0,1.0,A,6.0,13.0,2.0,7.0,13.0,9.0,10.0,4.0,1.0,0.0,0.0,0.0
24816,E0,2005,Newcastle,Birmingham,1741.52,1697.41,1.0,2.0,9.0,12.0,2.0,1.0,H,2.0,0.0,H,8.0,10.0,4.0,8.0,8.0,14.0,6.0,5.0,1.0,2.0,0.0,0.0
24817,E0,2005,Portsmouth,Norwich,1671.87,1591.91,3.0,5.0,0.0,3.0,1.0,1.0,D,0.0,1.0,A,14.0,6.0,6.0,5.0,13.0,6.0,7.0,5.0,3.0,0.0,0.0,1.0
24818,E0,2005,Tottenham,Everton,1693.87,1719.4,7.0,13.0,4.0,10.0,5.0,2.0,H,2.0,1.0,H,14.0,6.0,12.0,3.0,8.0,13.0,6.0,4.0,0.0,0.0,0.0,0.0


In [71]:
# Drop DRAWs
premier_results = premier_results[premier_results['FTResult'] != 'D'].copy()

print(f"W/L ROWS: {premier_results.shape[0]}")
premier_results.head()

W/L ROWS: 5711


Unnamed: 0,Division,MatchDate,HomeTeam,AwayTeam,HomeElo,AwayElo,Form3Home,Form5Home,Form3Away,Form5Away,FTHome,FTAway,FTResult,HTHome,HTAway,HTResult,HomeShots,AwayShots,HomeTarget,AwayTarget,HomeFouls,AwayFouls,HomeCorners,AwayCorners,HomeYellow,AwayYellow,HomeRed,AwayRed
24809,E0,2005,Aston Villa,Blackburn,1709.45,1663.23,0.0,1.0,5.0,6.0,1.0,0.0,H,0.0,0.0,D,11.0,9.0,3.0,4.0,9.0,9.0,6.0,7.0,1.0,0.0,0.0,0.0
24811,E0,2005,Charlton,Arsenal,1703.99,1913.54,7.0,13.0,9.0,13.0,1.0,3.0,A,1.0,1.0,D,7.0,8.0,4.0,5.0,13.0,15.0,1.0,2.0,1.0,1.0,0.0,0.0
24812,E0,2005,Fulham,Crystal Palace,1658.58,1570.47,0.0,4.0,1.0,2.0,3.0,1.0,H,1.0,1.0,D,12.0,7.0,6.0,3.0,11.0,18.0,4.0,2.0,1.0,3.0,0.0,0.0
24813,E0,2005,Liverpool,Chelsea,1807.79,1919.25,9.0,10.0,9.0,13.0,0.0,1.0,A,0.0,0.0,D,14.0,8.0,7.0,7.0,14.0,16.0,3.0,2.0,2.0,2.0,0.0,0.0
24814,E0,2005,Man City,Southampton,1684.93,1627.79,4.0,4.0,1.0,2.0,2.0,1.0,H,2.0,0.0,H,15.0,6.0,8.0,1.0,12.0,9.0,9.0,8.0,0.0,0.0,0.0,0.0


In [72]:
# Map H/A to 0/1
# H (Home Win) = 0
# A (Away Win) = 1
premier_results['FTResult'] = premier_results['FTResult'].map({'H': 0, 'A': 1})  # AWAY WINS weighted HIGHER?

premier_results.head(20)

Unnamed: 0,Division,MatchDate,HomeTeam,AwayTeam,HomeElo,AwayElo,Form3Home,Form5Home,Form3Away,Form5Away,FTHome,FTAway,FTResult,HTHome,HTAway,HTResult,HomeShots,AwayShots,HomeTarget,AwayTarget,HomeFouls,AwayFouls,HomeCorners,AwayCorners,HomeYellow,AwayYellow,HomeRed,AwayRed
24809,E0,2005,Aston Villa,Blackburn,1709.45,1663.23,0.0,1.0,5.0,6.0,1.0,0.0,0,0.0,0.0,D,11.0,9.0,3.0,4.0,9.0,9.0,6.0,7.0,1.0,0.0,0.0,0.0
24811,E0,2005,Charlton,Arsenal,1703.99,1913.54,7.0,13.0,9.0,13.0,1.0,3.0,1,1.0,1.0,D,7.0,8.0,4.0,5.0,13.0,15.0,1.0,2.0,1.0,1.0,0.0,0.0
24812,E0,2005,Fulham,Crystal Palace,1658.58,1570.47,0.0,4.0,1.0,2.0,3.0,1.0,0,1.0,1.0,D,12.0,7.0,6.0,3.0,11.0,18.0,4.0,2.0,1.0,3.0,0.0,0.0
24813,E0,2005,Liverpool,Chelsea,1807.79,1919.25,9.0,10.0,9.0,13.0,0.0,1.0,1,0.0,0.0,D,14.0,8.0,7.0,7.0,14.0,16.0,3.0,2.0,2.0,2.0,0.0,0.0
24814,E0,2005,Man City,Southampton,1684.93,1627.79,4.0,4.0,1.0,2.0,2.0,1.0,0,2.0,0.0,H,15.0,6.0,8.0,1.0,12.0,9.0,9.0,8.0,0.0,0.0,0.0,0.0
24815,E0,2005,Middlesbrough,Man United,1749.94,1875.94,6.0,10.0,9.0,13.0,0.0,2.0,1,0.0,1.0,A,6.0,13.0,2.0,7.0,13.0,9.0,10.0,4.0,1.0,0.0,0.0,0.0
24816,E0,2005,Newcastle,Birmingham,1741.52,1697.41,1.0,2.0,9.0,12.0,2.0,1.0,0,2.0,0.0,H,8.0,10.0,4.0,8.0,8.0,14.0,6.0,5.0,1.0,2.0,0.0,0.0
24818,E0,2005,Tottenham,Everton,1693.87,1719.4,7.0,13.0,4.0,10.0,5.0,2.0,0,2.0,1.0,H,14.0,6.0,12.0,3.0,8.0,13.0,6.0,4.0,0.0,0.0,0.0,0.0
24866,E0,2005,Blackburn,Charlton,1663.23,1703.99,4.0,6.0,4.0,10.0,1.0,0.0,0,1.0,0.0,H,16.0,9.0,11.0,6.0,11.0,14.0,6.0,8.0,2.0,3.0,0.0,0.0
24867,E0,2005,Crystal Palace,Aston Villa,1570.47,1709.45,1.0,2.0,3.0,3.0,2.0,0.0,0,1.0,0.0,H,14.0,12.0,9.0,6.0,8.0,14.0,7.0,7.0,2.0,3.0,0.0,0.0


In [73]:
# Columns used here
premier_results[['HomeElo',	'AwayElo']].head()

Unnamed: 0,HomeElo,AwayElo
24809,1709.45,1663.23
24811,1703.99,1913.54
24812,1658.58,1570.47
24813,1807.79,1919.25
24814,1684.93,1627.79


In [74]:
# Train/Test Split
train, test = train_test_split(premier_results, test_size=0.2, random_state=0)

x_train = train[['HomeElo',	'AwayElo']]  # before the game 'starts'
y_train = train['FTResult']  # 'after' game

x_test = test[['HomeElo',	'AwayElo']]   # before the game 'starts'
y_test =  test['FTResult']

In [None]:
"""
Potential Future TESTER
# Train/Test Split
train, test = train_test_split(premier_results, test_size=0.2, random_state=0)

x_train = train[['HomeElo',	'AwayElo', 'OddHome', 'OddAway']]  # before the game 'starts'
y_train = train['FTResult']  # 'after' game

x_test = test[['HomeElo',	'AwayElo', 'OddHome', 'OddAway']]   # before the game 'starts'
y_test =  test['FTResult']

"""

### Decision Tree Baseline Creation

In [88]:
dt = DecisionTreeClassifier(max_depth=7, random_state=0)
dt.fit(x_train, y_train)

prediction = dt.predict(x_test)

In [89]:
metrics.accuracy_score(prediction,y_test)

0.6911636045494314

In [90]:
print(x_test.columns)
dt.feature_importances_

Index(['HomeElo', 'AwayElo'], dtype='object')


array([0.48594124, 0.51405876])

### Future processing/improvement Ideas:  K-Fold validation??  Removing DRAWs from predictions?

### Some Future Columns to include: [OddHome', 'OddAway', Form3Home	Form5Home	Form3Away	Form5Away] & potentially other valid Betting Columns