In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import  train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

## Extract FIFA International Soccer Data

In [2]:
# Load the raw match results from the local CSV file
results_path = "data/results.csv"
results = pd.read_csv(results_path)

### Clean/Encode Data

In [3]:
# Parse the 'date' column as datetimes, then keep only the calendar year
match_dates = pd.to_datetime(results['date'])
results['date'] = match_dates.dt.year

In [4]:
# Keep only the most recent 20 years of matches (2005-2024, both ends inclusive)
FIRST_YEAR, LAST_YEAR = 2005, 2024
in_window = (results['date'] >= FIRST_YEAR) & (results['date'] <= LAST_YEAR)

# .copy() makes `results` an independent frame, so the column assignments in
# later cells write to it directly instead of to a slice of the full DataFrame
# (assigning into such a slice is what triggers pandas' SettingWithCopyWarning).
results = results.loc[in_window].copy()

In [29]:
# Add an integer-encoded match outcome column:
#   0 - HOME team win,  1 - draw (tie),  2 - AWAY team win
#
# Rows without a recorded score are dropped first: a NaN score satisfies none
# of the comparisons below, so np.select would fall back to its default of 0
# and silently label those matches as home wins.
results = results.dropna(subset=['home_score', 'away_score']).copy()

if_conditions = [(results['home_score'] > results['away_score']),
                 (results['home_score'] == results['away_score']),
                 (results['home_score'] < results['away_score'])
                 ]
outcomes = [0, 1, 2]

# Vectorised if/elif: picks the outcome of the first condition that holds per row
results['result'] = np.select(if_conditions, outcomes)
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,result
28921,2005,250,186,4,2,5,Singapore,Singapore,False,0
28922,2005,166,130,1,4,5,Kuala Lumpur,Malaysia,False,2
28923,2005,83,290,3,0,51,Cairo,Egypt,False,0
28924,2005,130,250,1,3,5,Jakarta,Indonesia,False,2
28925,2005,139,100,5,0,20,Kingston,Jamaica,False,0


In [6]:
# Encode team-name strings as integer ids

# Collect every team that appears in either column so both columns share one id space
all_teams = pd.concat([results['home_team'], results['away_team']]).unique()

# Keep the fitted encoder under its own name: the tournament cell further down
# rebinds `encoder`, and without this handle the id -> team-name mapping
# (team_encoder.inverse_transform) would no longer be reachable.
team_encoder = LabelEncoder()
team_encoder.fit(all_teams)
encoder = team_encoder  # original name kept for any code that still refers to it

# Apply the same mapping to both columns
results['home_team'] = team_encoder.transform(results['home_team'])
results['away_team'] = team_encoder.transform(results['away_team'])
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,result
28921,2005,250,186,4,2,AFF Championship,Singapore,Singapore,False,0
28922,2005,166,130,1,4,AFF Championship,Kuala Lumpur,Malaysia,False,2
28923,2005,83,290,3,0,Friendly,Cairo,Egypt,False,0
28924,2005,130,250,1,3,AFF Championship,Jakarta,Indonesia,False,2
28925,2005,139,100,5,0,CFU Caribbean Cup qualification,Kingston,Jamaica,False,0


In [7]:
# Integer-encode the 'tournament' column (only matters if it is used as a feature)
tournament_names = results['tournament'].unique()
encoder = LabelEncoder().fit(tournament_names)

results['tournament'] = encoder.transform(results['tournament'])
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,result
28921,2005,250,186,4,2,5,Singapore,Singapore,False,0
28922,2005,166,130,1,4,5,Kuala Lumpur,Malaysia,False,2
28923,2005,83,290,3,0,51,Cairo,Egypt,False,0
28924,2005,130,250,1,3,5,Jakarta,Indonesia,False,2
28925,2005,139,100,5,0,20,Kingston,Jamaica,False,0


In [25]:
# Hold out 20% of the matches for testing (fixed seed for a repeatable split)
feature_cols = ['home_team', 'away_team', 'tournament']  # known before kick-off
target_col = 'result'                                    # known only after the match

train, test = train_test_split(results, test_size=0.2, random_state=0)

x_train, y_train = train[feature_cols], train[target_col]
x_test, y_test = test[feature_cols], test[target_col]

### Decision Tree Baseline Creation

In [26]:
# Baseline model: depth-limited decision tree with a fixed seed, fitted on the training split
dt = DecisionTreeClassifier(max_depth=10, random_state=0).fit(x_train, y_train)

# Predicted outcome class for every held-out match
prediction = dt.predict(x_test)

In [27]:
# Fraction of held-out matches whose outcome class was predicted correctly.
# scikit-learn metrics take (y_true, y_pred) in that order; accuracy happens to
# be symmetric, but most other metrics (precision, recall, ...) are not.
metrics.accuracy_score(y_test, prediction)

0.47509176717357104

### Future processing/improvement ideas

- Exclude games played on neutral turf?
- Exclude friendlies?
- Include penalty shootouts?
- Use k-fold cross-validation?
- Add tournament-specific weighting?
- Remove draws from the predictions?