In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import  train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

## Extract Premier League Soccer Data

In [None]:
# Read the general results table (path is relative to the notebook's working directory).
# NOTE(review): `results` is not referenced by any later cell visible here — confirm it is still needed.
results_csv = "data/results.csv"
results = pd.read_csv(results_csv)

In [None]:
# Read the raw match table; every later cell filters or reshapes this frame.
premier_csv = "premier_data/premier_matches.csv"
premier_results = pd.read_csv(premier_csv)

### Clean/Encode Data

In [None]:
# Columns removed before modelling: the betting-odds columns plus the
# MatchTime column. The grouping below follows the original listing; the
# meaning of the individual C_* codes is not documented in this notebook.
odds_cols = [
    "C_HTB", "MaxOver25", "MaxUnder25", "HandiSize", "HandiHome",
    "HandiAway", "C_LTH", "C_LTA", "C_VHD", "C_VAD", "C_PHB",
    "OddHome", "OddDraw", "OddAway",
    "MaxHome", "MaxDraw", "MaxAway",
    "Over25", "Under25",
]
time_cols = ["MatchTime"]

cols_to_drop = odds_cols + time_cols
# drop() raises KeyError if any listed column is absent from the frame.
premier_results = premier_results.drop(cols_to_drop, axis=1)

In [None]:
# Keep only the rows whose Division code is 'E0' (the Premier League code
# used throughout this notebook). The explicit copy gives later cells an
# independent frame to assign columns on.
is_premier_league = premier_results['Division'].eq('E0')
premier_results = premier_results.loc[is_premier_league].copy()
premier_results.head(20)

In [None]:
# Reduce MatchDate to its calendar year; later cells filter on the year only.
#
# Guarded so the cell is safe to re-run: once the column already holds numeric
# years, sending it through pd.to_datetime again would interpret the numbers as
# nanosecond offsets from the Unix epoch and collapse every value to 1970,
# which would silently empty the year-range filter that follows.
if not pd.api.types.is_numeric_dtype(premier_results['MatchDate']):
    premier_results['MatchDate'] = pd.to_datetime(premier_results['MatchDate']).dt.year

In [None]:
# Keep the most recent 20 calendar years of matches.
# MatchDate holds a plain year at this point, so a numeric range check suffices.
first_year, last_year = 2005, 2024
in_window = premier_results['MatchDate'].between(first_year, last_year)  # inclusive on both ends
premier_results = premier_results[in_window]

In [None]:
# The Division == 'E0' (Premier League) filter has already been applied in an
# earlier cell, so it is not repeated here; the invariant is only checked.
assert premier_results['Division'].eq('E0').all(), "non-Premier-League rows present; run the Division filter cell first"

# Show every column in DataFrame previews from here on (global display option).
pd.set_option('display.max_columns', None)
premier_results.head(15)

In [None]:
# Discard drawn matches (FTResult == 'D') so only decisive results remain.
# The copy keeps the later in-place column assignment off a filtered slice.
not_a_draw = premier_results['FTResult'].ne('D')
premier_results = premier_results.loc[not_a_draw].copy()

remaining_rows = premier_results.shape[0]
print(f"W/L ROWS: {remaining_rows}")
premier_results.head()

In [None]:
# Encode the full-time result as the binary target used by the classifier:
#   H (Home Win) -> 0
#   A (Away Win) -> 1
# Open question carried over from the original note: should AWAY WINS be weighted HIGHER?
#
# Guarded so the cell is safe to re-run: calling .map() on the already-encoded
# 0/1 column would match none of the 'H'/'A' keys and turn every label into NaN.
result_codes = {'H': 0, 'A': 1}
if not pd.api.types.is_numeric_dtype(premier_results['FTResult']):
    premier_results['FTResult'] = premier_results['FTResult'].map(result_codes)

# .map() yields NaN for any value that is not a key of result_codes; surface
# that here instead of letting it reach model fitting.
assert premier_results['FTResult'].notna().all(), "FTResult contains values other than 'H'/'A'"

premier_results.head(20)

In [None]:
# Preview the two Elo columns, which are the only model inputs used below.
elo_cols = ['HomeElo', 'AwayElo']
premier_results[elo_cols].head()

In [None]:
# Train/Test Split: hold out 20% of the rows; random_state pins the shuffle.
feature_cols = ['HomeElo', 'AwayElo']  # known before the game 'starts'
target_col = 'FTResult'                # known only 'after' the game

train, test = train_test_split(premier_results, random_state=0, test_size=0.2)

x_train, y_train = train[feature_cols], train[target_col]
x_test, y_test = test[feature_cols], test[target_col]

In [None]:
# Potential Future TESTER (intentionally disabled).
#
# Kept as comments rather than a bare triple-quoted string, because a string
# literal as the last expression of a cell is echoed as that cell's output.
#
# NOTE: 'OddHome' and 'OddAway' are removed by the "drop betting odds" cell
# near the top of the notebook; that drop has to be relaxed before this
# variant can run, otherwise the column selection raises KeyError.
#
# # Train/Test Split
# train, test = train_test_split(premier_results, test_size=0.2, random_state=0)
#
# x_train = train[['HomeElo', 'AwayElo', 'OddHome', 'OddAway']]  # before the game 'starts'
# y_train = train['FTResult']  # 'after' game
#
# x_test = test[['HomeElo', 'AwayElo', 'OddHome', 'OddAway']]   # before the game 'starts'
# y_test = test['FTResult']

### Decision Tree Baseline Creation

In [None]:
# Baseline model: a decision tree capped at depth 5 with a fixed seed,
# fitted on the training split and then used to label the held-out split.
tree_params = {"max_depth": 5, "random_state": 0}
dt = DecisionTreeClassifier(**tree_params)
dt.fit(x_train, y_train)

prediction = dt.predict(x_test)

In [39]:
# Fraction of held-out matches whose predicted label equals the true label.
# accuracy_score takes (y_true, y_pred): ground truth first, predictions second.
# The value is the same either way for accuracy, but the correct order keeps the
# call safe to swap for asymmetric metrics such as precision or recall.
metrics.accuracy_score(y_test, prediction)

0.6202974628171478

In [15]:
# Print the feature names, then display the fitted tree's importance scores.
# The model was fitted on the same two columns, so the scores are expected to
# line up positionally with the printed names.
feature_names = x_test.columns
print(feature_names)
dt.feature_importances_

Index(['HomeElo', 'AwayElo'], dtype='object')


array([0.4770253, 0.5229747])

### Future processing/improvement ideas: k-fold cross-validation? Removing DRAWs from predictions? (Draws are currently dropped from the data before training.)

### Some future columns to include: `OddHome`, `OddAway`, `Form3Home`, `Form5Home`, `Form3Away`, `Form5Away`, and potentially other valid betting columns