# Football Match Probability Prediction

<h2>This notebook was made for an open competition on <a href = "https://www.kaggle.com/competitions/football-match-probability-prediction/overview" target="_blank">Kaggle</a> for predicting probability football matches.</h2>
<hr/>
<h4><b><i>The purpose of the competition is</i></b></h4 ><i>to predict the probability of the outcome of the match: multiclass target (home, away,draw)</i>

<h4><b><i>Models used:</i></b></h4> <i> Baseline - LogisticRegression, XGBClassifier, LGBMClassifier, CatBoostClassifier. 
    The best result was obtained using the Catboost with Optuna.</i>

<h4><b><i>Model metrics:</i></b></h4><i>multiclass log_loss</i>

<h4><b><i>Results:</i></b></h4><i> 1-st place in public <a href = "https://www.kaggle.com/competitions/football-match-probability-prediction/leaderboard" target="_blank">Leaderbord</a> with result 0.98834
<hr/>
<h3>This notebook was made by Nikolay Luzhynski and updoaded to github on 10 April, 2022</h3>
    <hr/>

## Library import

In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression


from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import optuna

from tqdm.notebook import trange, tqdm
import warnings
warnings.filterwarnings('ignore')
RAND = 123

## Import my libraries

In [2]:
import sys
sys.path.insert(1, 'my_modules')

# this code include load data, convert to the true format and fill na-values
from FILL_NA import FillNa
# this code include feature engineering
from Features_engineering import FeaturesEngineering

## Data loading

In [3]:
ls ../datasets/

test.csv    test_2.csv  train.csv


In [4]:
custom_date_parser = lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S")
path_to_datasets = '../datasets/'
trainset = pd.read_csv(path_to_datasets+'train.csv', index_col='id',parse_dates=['match_date'],
                date_parser=custom_date_parser)
testset = pd.read_csv(path_to_datasets+'test.csv', index_col='id',parse_dates=['match_date'],
                date_parser=custom_date_parser)
trainset.shape, testset.shape

((110938, 189), (72711, 188))

In [5]:
coder = {'away':0,
         'home':1,
         'draw':2}
trainset['target'] = trainset['target'].map(coder)

In [6]:
testset.head(3)

Unnamed: 0_level_0,home_team_name,away_team_name,match_date,league_name,league_id,is_cup,home_team_coach_id,away_team_coach_id,home_team_history_match_date_1,home_team_history_match_date_2,...,away_team_history_league_id_1,away_team_history_league_id_2,away_team_history_league_id_3,away_team_history_league_id_4,away_team_history_league_id_5,away_team_history_league_id_6,away_team_history_league_id_7,away_team_history_league_id_8,away_team_history_league_id_9,away_team_history_league_id_10
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17761448,12 de Octubre,Sportivo Luqueño,2021-05-01 00:15:00,Division 1,755,False,9605704.0,30866250.0,2021-04-28 00:30:00,2021-04-23 23:00:00,...,755.0,755.0,755.0,755.0,755.0,755.0,755.0,755.0,755.0,755.0
17695487,Necaxa,Atlas,2021-05-01 00:30:00,Liga MX,743,False,10319425.0,460370.0,2021-04-24 02:06:00,2021-04-17 00:30:00,...,743.0,743.0,743.0,743.0,743.0,743.0,743.0,743.0,743.0,743.0
17715496,Sertãozinho,EC São Bernardo,2021-05-01 01:00:00,Paulista A2,1314,False,440905.0,220998.0,2021-04-28 23:00:00,2021-04-24 23:00:00,...,1314.0,1314.0,1314.0,1314.0,1314.0,1314.0,1314.0,1314.0,1315.0,1315.0


## Fill NA_values

In [7]:
%%time
# for more detail check the file FILL_NA.py
class_na = FillNa(trainset)
class_na.fill_na_descr_col()
class_na.fill_na_coachs()
class_na.fill_na_time()
class_na.fill_na_is_play_home()
class_na.fill_na_goal_team()
class_na.fill_na_goal_team_opponent()
class_na.fill_na_rating_team()
class_na.fill_na_rating_team_opponent()
class_na.fill_na_league()
class_na.fill_na_cup()
assert class_na.dataset.isna().sum().sum() == 0 
trainset_general = class_na.dataset

CPU times: user 38.8 s, sys: 3.2 s, total: 42 s
Wall time: 42.4 s


In [8]:
%%time
class_na = FillNa(testset)
class_na.fill_na_descr_col()
class_na.fill_na_coachs()
class_na.fill_na_time()
class_na.fill_na_is_play_home()
class_na.fill_na_goal_team()
class_na.fill_na_goal_team_opponent()
class_na.fill_na_rating_team()
class_na.fill_na_rating_team_opponent()
class_na.fill_na_league()
class_na.fill_na_cup()
assert class_na.dataset.isna().sum().sum() == 0 
testset_general = class_na.dataset

CPU times: user 24.2 s, sys: 1.24 s, total: 25.4 s
Wall time: 25.5 s


## Feature Engineering

In [9]:
dataset = pd.concat([trainset_general.drop('target', axis=1), testset_general], axis=0)
index_split_dataset = trainset_general.shape[0]
dataset.shape

(183648, 188)

In [10]:
%%time
# for more detail check the file Features_engineering.py
F_eng = FeaturesEngineering(dataset)
F_eng.Ft_eng_coach_id()
F_eng.Ft_eng_history_goal()
F_eng.Ft_eng_is_cup()
F_eng.Ft_eng_is_play_home()
F_eng.Ft_eng_match_date()
F_eng.Ft_eng_rating()
F_eng.Ft_eng_league_id()
F_eng.Ft_eng_win_percentage()

  0%|          | 0/183648 [00:00<?, ?it/s]

  0%|          | 0/183648 [00:00<?, ?it/s]

  0%|          | 0/183648 [00:00<?, ?it/s]

  0%|          | 0/183648 [00:00<?, ?it/s]

  0%|          | 0/183648 [00:00<?, ?it/s]

  0%|          | 0/183648 [00:00<?, ?it/s]

CPU times: user 6min 12s, sys: 7.07 s, total: 6min 19s
Wall time: 6min 19s


In [11]:
trainset = F_eng.dataset.iloc[:index_split_dataset,:]
trainset['target'] = trainset_general['target']
testset = F_eng.dataset.iloc[index_split_dataset:,:]
trainset.shape, testset.shape

((110937, 323), (72711, 322))

## Learning - Baseline models

In [12]:
drop_list = ['match_date','league_name', 'home_team_name', 'away_team_name'] +\
[f'home_team_history_match_date_{i}' for i in range(1,11)] +\
[f'away_team_history_match_date_{i}' for i in range(1,11)]
X = trainset.drop(drop_list+['target'], axis=1)
y = trainset['target']
final_X = testset.drop(drop_list, axis=1)
X.shape, final_X.shape

((110937, 298), (72711, 298))

In [13]:
# LogisticRegression
LogisticRegression_model = LogisticRegression(random_state=RAND)
LogisticRegression_score = cross_val_score(LogisticRegression_model, X, y,scoring='neg_log_loss', cv=5)
print('LogisticRegression_score: ', LogisticRegression_score.mean()*-1)

LogisticRegression_score:  1.083867026730588


In [16]:
# LGBMClassifier
ligthgbmc_model = LGBMClassifier(random_state=RAND)
ligthgbmc_score = cross_val_score(ligthgbmc_model, X, y,scoring='neg_log_loss', cv=5)
print('ligthgbmc_score: ', ligthgbmc_score.mean()*-1)

ligthgbmc_score:  1.0164546954448728


In [18]:
# CatBoostClassifier
cbc_model = CatBoostClassifier(random_state=RAND, verbose=False)
cbc_score = cross_val_score(cbc_model, X, y, cv=5,scoring='neg_log_loss')
print('cbc_score: ', cbc_score.mean()*-1)

cbc_score:  1.0119843126024233


In [None]:
# XGBClassifier
xgb_model = XGBClassifier(random_state=RAND, verbose=False)
xgb_score = cross_val_score(xgb_model, X, y, cv=5,scoring='neg_log_loss')
print('xgb_score: ', xgb_score.mean()*-1)

#### The best result with CatBoost - 1.01198

##  Searching hyperparameters with Optuna

In [65]:
drop_list = ['match_date','league_name', 'home_team_name', 'away_team_name'] +\
[f'home_team_history_match_date_{i}' for i in range(1,11)] +\
[f'away_team_history_match_date_{i}' for i in range(1,11)]

In [66]:
X = trainset.drop(drop_list+['target'], axis=1)
y = trainset['target']
final_X = testset.drop(drop_list, axis=1)
X.shape, final_X.shape

((110937, 298), (72711, 298))

### CatBoostClassifier

In [68]:
def objective(trial, data=X, target=y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    param_grid = {
        'learning_rate': trial.suggest_float('learning_rate', 0.001,0.01),
        'n_estimators': trial.suggest_int("n_estimators", 3000, 8000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "l2_leaf_reg": trial.suggest_int("max_depth", 3, 10),
        #"subsample": trial.suggest_float("subsample", 0.5, 0.99, step = 0.09),
        #'od_wait': trial.suggest_int('od_wait', 10, 50, step=8),
        #"colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 0.99, step =0.09),
        #"random_strength": trial.suggest_int("random_strength", 1, 10, step=1)
    }
    cv = KFold(n_splits=5, shuffle=True, random_state=RAND)
    
    cv_predicts = np.empty(5)
    
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model = CatBoostClassifier(**param_grid)
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_test, y_test)],
                  early_stopping_rounds=100,
                  verbose=False)
        preds = model.predict_proba(X_test)        
        cv_predicts[idx] = log_loss(y_test, preds)

    return np.mean(cv_predicts)

In [69]:
study = optuna.create_study(direction="minimize", study_name="CTB")
func = lambda trial: objective(trial, X, y)
study.optimize(func, n_trials=4, show_progress_bar=False, n_jobs=4)

[32m[I 2022-04-10 18:16:45,495][0m A new study created in memory with name: CTB[0m
Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.[32m[I 2022-04-10 20:04:45,125][0m Trial 3 finished with value: 0.9950484081271217 and parameters: {'learning_rate': 0.0037116868923497753, 'n_estimators': 4307, 'max_depth': 4}. Best is trial 3 with value: 0.9950484081271217.[0m
[32m[I 2022-04-11 08:54:26,923][0m Trial 1 finished with value: 0.9924313867129003 and parameters: {'learning_rate': 0.004430666526778404, 'n_estimators': 4305, 'max_depth': 8}. Best is trial 1 with value: 0.9924313867129003.[0m
[32m[I 2022-04-11 10:58:53,070][0m Trial 0 finished with value: 0.9918752928545521 and parameters: {'learning_rate': 0.004690749007183885, 'n_estimators': 5

In [70]:
study.best_params

{'learning_rate': 0.004690749007183885, 'n_estimators': 5114, 'max_depth': 9}

In [78]:
study.best_value

0.9918752928545521

In [71]:
best_params_cb = {'learning_rate': 0.004690749007183885, 'n_estimators': 5114, 'max_depth': 9}

#### We got a good result on cross-validation with cv=5 - 0.991875

### LGBMClassifier

In [44]:
def objective(trial, data=X, target=y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    param_grid = {
        'learning_rate': trial.suggest_float('learning_rate', 0.001,0.01),
        'n_estimators': trial.suggest_int("n_estimators", 3000, 8000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        #"subsample": trial.suggest_float("subsample", 0.5, 0.99, step = 0.09),
        #'od_wait': trial.suggest_int('od_wait', 10, 50, step=8),
        #"colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 0.99, step =0.09),
        #"random_strength": trial.suggest_int("random_strength", 1, 10, step=1)
    }
    cv = KFold(n_splits=5, shuffle=True, random_state=8)
    
    cv_predicts = np.empty(5)
    
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        pruning_callback = optuna.integration.LightGBMPruningCallback(
            trial, "log_loss")
        model = LGBMClassifier(**param_grid)
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_test, y_test)],
                  #eval_metric="log_loss",
                  early_stopping_rounds=100,
                  #callbacks=[pruning_callback],
                  verbose=-1)
        preds = model.predict_proba(X_test)        
        cv_predicts[idx] = log_loss(y_test, preds)

    return np.mean(cv_predicts)

In [None]:
study = optuna.create_study(direction="minimize", study_name="LGB")
func = lambda trial: objective(trial, X, y)
study.optimize(func, n_trials=7, show_progress_bar=False, n_jobs=1)

In [46]:
study.best_value

0.9965805122768139

In [47]:
study.best_params

{'learning_rate': 0.009098879145838661, 'n_estimators': 5524, 'max_depth': 9}

In [57]:
best_params_lgb = {'learning_rate': 0.009098879145838661, 'n_estimators': 5524, 'max_depth': 9}

#### Best score on cv=5 - 0.9965805122768139

## Submission

In [54]:
path_to_submission = 'final_submissions/'
def to_submission(test_id,predicts, name_file):
    df_submit = pd.DataFrame(data = predicts, columns=['away','home','draw'])
    df_submit['id'] = test_id
    df_submit = df_submit.set_index('id')
    return df_submit.to_csv(path_to_submission+name_file)

In [None]:
ct_clf = CatBoostClassifier(**best_params_cb)
ct_clf.fit(X,y)
predicts_ct = ct_clf.predict_proba(final_X)

In [73]:
to_submission(final_X.index, predicts_ct, 'submission_cb_2.csv')

#### And the final score on the public leaderboard is 0.98834 (1-st place) on 13 April 2022