In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from scipy import optimize as opt

In [2]:
matches = pd.read_csv("upcoming.csv", index_col=0)

## Cleaning and getting data ready to use.

In [3]:
#Encoding label we make result being a numeric value, which we can predict later. Instead of W, D, L.
yle = LabelEncoder()
matches['result'] = yle.fit_transform(matches['result'])

In [4]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,referee,match report,notes,sh,sot,dist,g-xg,npxg/sh,season,team
1,2022-08-14,22:00,La Liga,Matchweek 1,Sun,Away,2,2.0,1.0,Almería,...,Juan Martínez,Match Report,,29.0,15.0,17.9,-0.4,0.09,2023,Real Madrid
2,2022-08-20,22:00,La Liga,Matchweek 2,Sat,Away,2,4.0,1.0,Celta Vigo,...,Jesús Gil,Match Report,,15.0,6.0,18.7,0.6,0.13,2023,Real Madrid
3,2022-08-28,22:00,La Liga,Matchweek 3,Sun,Away,2,3.0,1.0,Espanyol,...,Mario Melero,Match Report,,19.0,6.0,15.6,1.4,0.08,2023,Real Madrid
4,2022-09-03,16:15,La Liga,Matchweek 4,Sat,Home,2,2.0,1.0,Betis,...,José Sánchez,Match Report,,22.0,5.0,17.2,-0.6,0.12,2023,Real Madrid
6,2022-09-11,14:00,La Liga,Matchweek 5,Sun,Home,2,4.0,1.0,Mallorca,...,Jorge Figueroa,Match Report,,22.0,7.0,21.5,2.4,0.07,2023,Real Madrid


In [5]:
#Using pandas sweets.
matches["date"] = pd.to_datetime(matches["date"])

#Creating numeric values for Home and Away, so that machine can learn from it.
matches['venue_c'] = matches['venue'].astype('category').cat.codes

#Creating unique code for each opponent squad.
matches['opp_c'] = matches['opponent'].astype('category').cat.codes

#Creating a normalized hour time.
matches['hour'] = matches['time'].str.replace(":.+", "", regex=True).astype("int")

#Monday = 0, Tuesday = 1, etc.
matches['day_c'] = matches['date'].dt.dayofweek

matches = matches.drop(columns=['round', 'day', 'comp'])

In [6]:
matches = matches[['date', 'result', 'venue', 'gf', 'ga', 'xg', 'xga',
       'poss', 'sh', 'sot', 'dist', 'g-xg', 'team', 'opponent', 'opp_c']]

#Making all teams names correct.
matches = matches.replace("Atlético Madrid", "Atletico Madrid")
matches = matches.replace("Alavés", "Alaves")
matches = matches.replace("Leganés", "Leganes")
matches = matches.replace("Málaga", "Malaga")
matches = matches.replace("La Coruña", "Deportivo La Coruna")
matches = matches.replace("Atlético Madrid", "Atletico Madrid")
matches = matches.replace("Real Betis", "Betis")


In [7]:
is_home = matches['venue'] == "Home"
home = matches[is_home]
away = matches[~is_home]

In [8]:
#Setting up our data, so we can add opponents value at the current time to our DataFrame.
home = home.merge(
    away[["date", "opponent", "team", "xg", "xga", "sh", "poss", "g-xg"]],
    left_on=["date", "team", "opponent"],
    right_on=["date", "opponent", "team"],
    how="outer",
    suffixes=("", "_opp"),
).drop(columns=["opponent_opp", "team_opp"])

In [9]:
home = home.dropna()

In [10]:
matches = home

In [12]:
matches.head()

Unnamed: 0,date,result,venue,gf,ga,xg,xga,poss,sh,sot,dist,g-xg,team,opponent,opp_c,xg_opp,xga_opp,sh_opp,poss_opp,g-xg_opp
0,2022-09-03,2.0,Home,2.0,1.0,2.6,0.5,49.0,22.0,5.0,17.2,-0.6,Real Madrid,Betis,5.0,0.5,2.6,8.0,51.0,0.5
1,2022-09-11,2.0,Home,4.0,1.0,1.6,0.4,68.0,22.0,7.0,21.5,2.4,Real Madrid,Mallorca,19.0,0.4,1.6,5.0,32.0,0.6
2,2022-10-02,2.0,Home,4.0,1.0,1.6,0.4,68.0,22.0,7.0,21.5,2.4,Real Madrid,Osasuna,21.0,2.1,0.3,15.0,39.0,-1.1
3,2022-10-16,2.0,Home,4.0,1.0,1.6,0.4,68.0,22.0,7.0,21.5,2.4,Real Madrid,Barcelona,4.0,3.0,0.3,16.0,69.0,1.0
4,2022-10-23,2.0,Home,4.0,1.0,1.6,0.4,68.0,22.0,7.0,21.5,2.4,Real Madrid,Sevilla,25.0,1.8,2.4,11.0,41.0,1.2


In [13]:
grouped_matches = matches.groupby('team')

In [14]:
group = grouped_matches.get_group("Real Madrid")

In [15]:
#Short function to get rolling averages as our predictors from last 5 matches.
def rolling_averages(group, cols, new_cols):
    group = group.sort_values('date')
    rolling_stats = group[cols].rolling(6, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [16]:
#cols = ['poss', 'sh', 'xg', 'xga', 'g-xg', 'poss_opp', 'sh_opp', 'xg_opp', 'xga_opp', 'g-xg_opp']
cols = ['xg', 'g-xg', 'xg_opp', 'g-xg_opp']
new_cols = [f"{c}R" for c in cols]

In [17]:
#Getting the rolling averages.
matches_rolling = matches.groupby('team').apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling = matches_rolling.droplevel('team')
matches_rolling.index = range(matches_rolling.shape[0])

In [18]:
matches_rolling.head()

Unnamed: 0,date,result,venue,gf,ga,xg,xga,poss,sh,sot,...,opp_c,xg_opp,xga_opp,sh_opp,poss_opp,g-xg_opp,xgR,g-xgR,xg_oppR,g-xg_oppR
0,2017-11-25,1.0,Home,1.0,2.0,0.4,0.7,44.0,6.0,2.0,...,8.0,0.7,0.4,12.0,56.0,1.3,0.733333,-0.233333,1.666667,0.166667
1,2017-12-08,2.0,Home,2.0,0.0,3.4,0.3,36.0,23.0,9.0,...,16.0,0.3,3.4,6.0,64.0,-0.3,0.666667,0.0,1.383333,0.45
2,2017-12-21,2.0,Home,1.0,0.0,1.7,0.3,41.0,13.0,5.0,...,20.0,0.3,1.7,10.0,59.0,-0.3,1.166667,-0.166667,1.116667,0.216667
3,2018-01-14,2.0,Home,1.0,0.0,0.4,0.6,39.0,11.0,4.0,...,25.0,0.6,0.4,14.0,61.0,-0.6,1.333333,-0.333333,0.65,0.35
4,2018-01-21,0.0,Home,2.0,2.0,2.5,1.4,48.0,14.0,6.0,...,17.0,1.4,2.5,9.0,52.0,0.6,1.316667,-0.15,0.633333,0.033333


In [19]:
df = matches_rolling
df = df.sort_values('date')

In [21]:
df.head()

Unnamed: 0,date,result,venue,gf,ga,xg,xga,poss,sh,sot,...,opp_c,xg_opp,xga_opp,sh_opp,poss_opp,g-xg_opp,xgR,g-xgR,xg_oppR,g-xg_oppR
1119,2017-11-05,1.0,Home,1.0,2.0,0.6,1.1,49.0,7.0,4.0,...,12.0,1.1,0.6,10.0,51.0,0.9,1.25,0.083333,1.233333,-0.233333
926,2017-11-17,0.0,Home,1.0,1.0,1.2,1.3,54.0,14.0,4.0,...,24.0,1.3,1.2,14.0,46.0,-0.3,1.616667,-0.616667,1.116667,0.05
503,2017-11-24,2.0,Home,1.0,0.0,1.6,1.2,58.0,13.0,6.0,...,17.0,1.2,1.6,10.0,42.0,-1.2,1.483333,0.183333,1.3,0.2
1120,2017-11-25,1.0,Home,0.0,5.0,1.3,3.6,46.0,17.0,2.0,...,3.0,3.6,1.3,17.0,54.0,0.4,1.133333,0.2,1.366667,-0.033333
1381,2017-11-25,2.0,Home,3.0,2.0,2.4,0.3,57.0,15.0,8.0,...,20.0,0.3,2.4,9.0,43.0,1.7,2.383333,-0.716667,0.716667,-0.05


## Now let's set up our model, train, test and see the results.

In [22]:
#Setting up our neural network model with 4 input layers, 8 hidden layers and 3 output layers (Win, Draw, Lose). 

model = keras.Sequential([
    keras.layers.Flatten(input_shape=(4,)),
    keras.layers.Dense(8, activation="relu"),
    keras.layers.Dense(3, activation="softmax")
])

In [23]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [24]:
#Setting up training and testing slices.
train = df[df['date'] <= '2021-01-01']
test = df[df['date'] > '2021-01-01']

In [25]:
model.fit(train[new_cols], train['result'], epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1ebfae955e0>

In [26]:
test_loss, test_acc = model.evaluate(test[new_cols], test['result'])



In [27]:
prediction = model.predict(test[new_cols])



In [28]:
prediction[:5]     #Draw, Away, Home   <- predictions are in that order.

array([[0.25943458, 0.25393012, 0.48663527],
       [0.23801433, 0.24985756, 0.5121281 ],
       [0.29886138, 0.29283318, 0.40830547],
       [0.30839875, 0.28489426, 0.40670702],
       [0.29402557, 0.27718043, 0.42879397]], dtype=float32)

In [29]:
#Adding the probabilities to our sweet DataFrame.
test['home'] = prediction[:,2]
test['draw'] = prediction[:,0]
test['away'] = prediction[:,1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['home'] = prediction[:,2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['draw'] = prediction[:,0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['away'] = prediction[:,1]


In [30]:
test = test[['date', 'team', 'opponent', 'home', 'draw', 'away']].round(2)

In [32]:
#Now we have the probabilities that our model is set for each even to happen in a certain match.
test

Unnamed: 0,date,team,opponent,home,draw,away
1923,2021-01-02,Villarreal,Levante,0.49,0.26,0.25
1438,2021-01-02,Real Madrid,Celta Vigo,0.51,0.24,0.25
456,2021-01-02,Betis,Sevilla,0.41,0.30,0.29
881,2021-01-02,Getafe,Valladolid,0.41,0.31,0.28
1043,2021-01-03,Huesca,Barcelona,0.43,0.29,0.28
...,...,...,...,...,...,...
1484,2023-06-04,Real Madrid,Athletic Club,0.67,0.17,0.16
1588,2023-06-04,Real Sociedad,Sevilla,0.44,0.31,0.25
606,2023-06-04,Celta Vigo,Barcelona,0.41,0.31,0.28
1863,2023-06-04,Valladolid,Getafe,0.46,0.30,0.24
