# Regression
Predict the final goals scored per each team.

In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
pd.set_option('display.max_columns', 999)

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.estimator import inputs
from tensorflow import feature_column
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping

from tensorflow.keras.models import load_model


import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots

from  IPython import display
from matplotlib import pyplot as plt

import numpy as np
import math

import pathlib
import shutil
import tempfile

tf.compat.v1.disable_eager_execution()

In [3]:
#pip install git+https://github.com/tensorflow/docs

## Data Preprocessing
    1. Normalization
    2. Data encoding: The goals [0,10] => [-1,1]. If Goal > 10 => 1

In [4]:
df02 = pd.read_csv('https://raw.githubusercontent.com/Khaledjallouli/project/master/data/data_regression_goals/sliding02_goals.csv', sep=',', index_col=0)

In [5]:
df02

Unnamed: 0,home_team_goal,away_team_goal,odds-home,odds-draw,odds-away,home-wins,home-draws,home-losses,home-goals,home-opposition-goals,home-shots,home-shots_on_target,home-opposition_shots,home-opposition_shots_on_target,away-wins,away-draws,away-losses,away-goals,away-opposition-goals,away-shots,away-shots_on_target,away-opposition_shots,away-opposition_shots_on_target
0,2,1,3.50,3.30,2.10,1,3,6,11,16,137,67,117,53,8,2,0,15,6,161,78,72,30
1,2,2,2.50,3.30,2.88,3,1,6,8,16,134,64,151,77,3,3,4,11,18,104,44,87,36
2,1,2,1.91,3.40,4.20,4,2,4,10,15,120,58,124,56,2,2,6,11,15,134,59,100,46
3,2,1,3.25,3.25,2.30,5,2,3,22,12,177,82,74,37,6,3,1,19,8,169,95,113,58
4,3,0,1.20,6.00,19.00,7,2,1,15,8,161,72,74,31,3,2,5,10,17,143,69,134,69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7028,2,1,5.00,3.80,1.70,4,2,4,10,12,80,43,117,60,2,4,4,10,21,119,58,112,65
7029,4,2,2.00,3.60,3.70,3,2,5,9,21,89,47,92,46,1,4,5,7,21,104,55,63,26
7030,4,1,1.80,3.75,4.50,3,3,4,8,8,122,59,92,50,5,2,3,15,9,84,43,118,56
7031,3,1,1.33,5.25,9.00,2,3,5,12,14,124,62,99,50,3,3,4,11,17,103,53,122,54


In [6]:
byhomegoal = df02.groupby('home_team_goal')
byhomegoal.size().sort_values(ascending=False)

home_team_goal
1     2251
2     1732
0     1589
3      885
4      378
5      132
6       43
7       13
8        7
9        2
10       1
dtype: int64

In [7]:
byawaygoal = df02.groupby('away_team_goal')
byawaygoal.size().sort_values(ascending=False)

away_team_goal
1    2381
0    2362
2    1401
3     613
4     193
5      52
6      23
8       5
7       2
9       1
dtype: int64

### =>The two previous cells show that we can keep 6 classes: [0.5].


In [8]:
def get_X_and_y(dataframe):
    X = dataframe.drop(columns=['home_team_goal','away_team_goal']).values
    y = dataframe[['home_team_goal','away_team_goal']].values
    return X,y

In [9]:
def encode(i):
    switcher = {
        0: -1,
        1: -0.6,
        2: -0.2,
        3: 0.2,
        4: 0.6,
        5: 1,
    }
    # 1 be assigned as default value of passed argument (if goals > 5)
    return switcher.get(i, 1)

def decode(i):
    switcher = {
        -1: 0,
        -0.6: 1,
        -0.2: 2,
        0.2: 3,
        0.6: 4,
        1: 5,
    }
    return switcher.get(i, "ERROR! Use Encode Before!")

In [10]:
def normalize(dataframe):
    column_names_to_not_normalize = ['home_team_goal','away_team_goal']
    column_names_to_normalize = [x for x in list(dataframe) if x not in column_names_to_not_normalize ]
    x = dataframe[column_names_to_normalize].values
    x_scaled = preprocessing.normalize(x)
    df_temp = pd.DataFrame(x_scaled, columns=column_names_to_normalize, index = dataframe.index)
    dataframe[column_names_to_normalize] = df_temp
    
    dataframe['home_team_goal'] = dataframe.apply(lambda row: encode(row['home_team_goal']), axis=1)
    dataframe['away_team_goal'] = dataframe.apply(lambda row: encode(row['away_team_goal']), axis=1)
    
    return dataframe

In [53]:
n02 = normalize(df02)

train02, test02 = train_test_split(n02, test_size=0.1, shuffle=False)
print(len(train02), 'train examples')
print(len(test02), 'test examples')

labels = n02.iloc[:,0:2]
train = n02.iloc[:,2:]


train_X02,train_y02 = get_X_and_y(train02)
labels_train = n02.iloc[:,0:2]

test_X02,test_y02 = get_X_and_y(test02)

6329 train examples
704 test examples


In [54]:
train

Unnamed: 0,odds-home,odds-draw,odds-away,home-wins,home-draws,home-losses,home-goals,home-opposition-goals,home-shots,home-shots_on_target,home-opposition_shots,home-opposition_shots_on_target,away-wins,away-draws,away-losses,away-goals,away-opposition-goals,away-shots,away-shots_on_target,away-opposition_shots,away-opposition_shots_on_target
0,0.012482,0.011769,0.007489,0.003566,0.010699,0.021398,0.039230,0.057061,0.488587,0.238944,0.417260,0.189015,0.028531,0.007133,0.000000,0.053495,0.021398,0.574178,0.278173,0.256775,0.106990
1,0.009236,0.012191,0.010640,0.011083,0.003694,0.022166,0.029555,0.059110,0.495044,0.236439,0.557848,0.284465,0.011083,0.011083,0.014777,0.040638,0.066498,0.384213,0.162552,0.321409,0.132997
2,0.007188,0.012795,0.015805,0.015053,0.007526,0.015053,0.037632,0.056448,0.451585,0.218266,0.466638,0.210740,0.007526,0.007526,0.022579,0.041395,0.056448,0.504270,0.222029,0.376321,0.173108
3,0.010289,0.010289,0.007281,0.015829,0.006332,0.009497,0.069647,0.037989,0.560339,0.259592,0.234266,0.117133,0.018995,0.009497,0.003166,0.060149,0.025326,0.535013,0.300747,0.357730,0.183614
4,0.004077,0.020384,0.064551,0.023782,0.006795,0.003397,0.050961,0.027179,0.546982,0.244613,0.251408,0.105320,0.010192,0.006795,0.016987,0.033974,0.057756,0.485829,0.234421,0.455252,0.234421
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7028,0.020289,0.015420,0.006898,0.016231,0.008116,0.016231,0.040578,0.048694,0.324626,0.174487,0.474766,0.243470,0.008116,0.016231,0.016231,0.040578,0.085214,0.482881,0.235354,0.454477,0.263759
7029,0.009960,0.017929,0.018427,0.014941,0.009960,0.024901,0.044822,0.104584,0.443238,0.234070,0.458179,0.229089,0.004980,0.019921,0.024901,0.034861,0.104584,0.517941,0.273911,0.313753,0.129485
7030,0.007617,0.015870,0.019044,0.012696,0.012696,0.016928,0.033855,0.033855,0.516293,0.249683,0.389336,0.211596,0.021160,0.008464,0.012696,0.063479,0.038087,0.355481,0.181972,0.499366,0.236987
7031,0.005271,0.020806,0.035667,0.007926,0.011889,0.019815,0.047556,0.055483,0.491416,0.245708,0.392341,0.198152,0.011889,0.011889,0.015852,0.043593,0.067372,0.408193,0.210041,0.483490,0.214004


In [40]:
n02

Unnamed: 0,home_team_goal,away_team_goal,odds-home,odds-draw,odds-away,home-wins,home-draws,home-losses,home-goals,home-opposition-goals,home-shots,home-shots_on_target,home-opposition_shots,home-opposition_shots_on_target,away-wins,away-draws,away-losses,away-goals,away-opposition-goals,away-shots,away-shots_on_target,away-opposition_shots,away-opposition_shots_on_target
0,1.0,1.0,0.012482,0.011769,0.007489,0.003566,0.010699,0.021398,0.039230,0.057061,0.488587,0.238944,0.417260,0.189015,0.028531,0.007133,0.000000,0.053495,0.021398,0.574178,0.278173,0.256775,0.106990
1,1.0,1.0,0.009236,0.012191,0.010640,0.011083,0.003694,0.022166,0.029555,0.059110,0.495044,0.236439,0.557848,0.284465,0.011083,0.011083,0.014777,0.040638,0.066498,0.384213,0.162552,0.321409,0.132997
2,1.0,1.0,0.007188,0.012795,0.015805,0.015053,0.007526,0.015053,0.037632,0.056448,0.451585,0.218266,0.466638,0.210740,0.007526,0.007526,0.022579,0.041395,0.056448,0.504270,0.222029,0.376321,0.173108
3,1.0,1.0,0.010289,0.010289,0.007281,0.015829,0.006332,0.009497,0.069647,0.037989,0.560339,0.259592,0.234266,0.117133,0.018995,0.009497,0.003166,0.060149,0.025326,0.535013,0.300747,0.357730,0.183614
4,1.0,1.0,0.004077,0.020384,0.064551,0.023782,0.006795,0.003397,0.050961,0.027179,0.546982,0.244613,0.251408,0.105320,0.010192,0.006795,0.016987,0.033974,0.057756,0.485829,0.234421,0.455252,0.234421
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7028,1.0,1.0,0.020289,0.015420,0.006898,0.016231,0.008116,0.016231,0.040578,0.048694,0.324626,0.174487,0.474766,0.243470,0.008116,0.016231,0.016231,0.040578,0.085214,0.482881,0.235354,0.454477,0.263759
7029,1.0,1.0,0.009960,0.017929,0.018427,0.014941,0.009960,0.024901,0.044822,0.104584,0.443238,0.234070,0.458179,0.229089,0.004980,0.019921,0.024901,0.034861,0.104584,0.517941,0.273911,0.313753,0.129485
7030,1.0,1.0,0.007617,0.015870,0.019044,0.012696,0.012696,0.016928,0.033855,0.033855,0.516293,0.249683,0.389336,0.211596,0.021160,0.008464,0.012696,0.063479,0.038087,0.355481,0.181972,0.499366,0.236987
7031,1.0,1.0,0.005271,0.020806,0.035667,0.007926,0.011889,0.019815,0.047556,0.055483,0.491416,0.245708,0.392341,0.198152,0.011889,0.011889,0.015852,0.043593,0.067372,0.408193,0.210041,0.483490,0.214004


In [41]:
trainX

Unnamed: 0,home_team_goal,away_team_goal,odds-home,odds-draw,odds-away,home-wins,home-draws,home-losses,home-goals,home-opposition-goals,home-shots,home-shots_on_target,home-opposition_shots,home-opposition_shots_on_target,away-wins,away-draws,away-losses,away-goals,away-opposition-goals,away-shots,away-shots_on_target,away-opposition_shots,away-opposition_shots_on_target
5457,1.0,-0.6,0.004130,0.032604,0.054340,0.025358,0.003623,0.007245,0.061585,0.039849,0.590489,0.329660,0.322414,0.144905,0.014491,0.007245,0.014491,0.047094,0.054340,0.481810,0.213735,0.304301,0.177509
5843,-0.6,-0.6,0.004058,0.033202,0.077471,0.025824,0.007378,0.003689,0.088538,0.044269,0.542298,0.295128,0.391045,0.199212,0.000000,0.018446,0.018446,0.022135,0.081160,0.332019,0.184455,0.453760,0.225035
4781,-0.6,-0.6,0.025919,0.017266,0.006260,0.003988,0.011963,0.023925,0.027913,0.063801,0.386791,0.187414,0.470529,0.231277,0.035888,0.000000,0.003988,0.099688,0.027913,0.426666,0.243239,0.470529,0.243239
4122,-0.6,-0.6,0.009578,0.013742,0.012909,0.012493,0.004164,0.024986,0.049971,0.087450,0.387279,0.199886,0.458072,0.212379,0.024986,0.000000,0.016657,0.054136,0.041643,0.458072,0.237364,0.462236,0.241529
3095,-0.6,-0.6,0.004987,0.017812,0.032062,0.024937,0.000000,0.010687,0.092624,0.035624,0.502305,0.238684,0.334870,0.178122,0.014250,0.003562,0.017812,0.049874,0.053437,0.413243,0.170997,0.516554,0.249371
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,-0.6,1.0,0.052418,0.030241,0.004718,0.004032,0.016129,0.020161,0.028225,0.076611,0.298381,0.112901,0.463700,0.205641,0.024193,0.000000,0.016129,0.133062,0.064515,0.536279,0.298381,0.419346,0.233866
5191,-0.6,-0.6,0.015896,0.014382,0.006926,0.011354,0.018923,0.007569,0.045416,0.037847,0.378466,0.155171,0.465514,0.261142,0.018923,0.015139,0.003785,0.071909,0.041631,0.480652,0.242219,0.439021,0.230865
5226,-0.6,-0.6,0.005218,0.020596,0.035308,0.027461,0.000000,0.011769,0.066692,0.023538,0.533537,0.278538,0.282461,0.141230,0.027461,0.003923,0.007846,0.070615,0.027461,0.564922,0.302076,0.302076,0.149077
5390,-0.6,-0.6,0.014089,0.012774,0.007702,0.003757,0.022542,0.011271,0.045084,0.056356,0.405760,0.191609,0.548528,0.259236,0.018785,0.003757,0.015028,0.052599,0.056356,0.447088,0.259236,0.345648,0.176581


In [42]:
# verif the classes
byhomegoal = df02.groupby('home_team_goal')
byhomegoal.size().sort_values(ascending=False)

home_team_goal
 1.0    6835
-0.6     198
dtype: int64

In [43]:
byawaygoal = df02.groupby('away_team_goal')
byawaygoal.size().sort_values(ascending=False)

away_team_goal
 1.0    6950
-0.6      83
dtype: int64

In [44]:
def round_pred(val):
    if val <=1 and val > 0.67:
        return 1
    elif val <=0.67 and val >0.33:
        return 0.60
    elif val <= 0.33 and val > 0:
        return 0.20
    elif val <= 0 and val > -0.33:
        return -0.20
    elif val<=-0.33 and val> -0.67:
        return -0.60
    else:
        return -1

### Build the model

https://www.tensorflow.org/tutorials/keras/regression
The mean_squared_error (mse) and mean_absolute_error (mae) are our loss functions – i.e. an estimate of how accurate the neural network is in predicting the test data. We can see that with the validation_split set to 0.2, 80% of the training data is used to test the model, while the remaining 20% is used for testing purposes.

In [67]:
x = train
y = labels.iloc[:,0]
y1 = labels.iloc[:,0]
y2 = labels.iloc[:,1]

inputs = tf.keras.layers.Input(shape=(21,))
d = tf.keras.layers.Dense(6, name='out')
d = tf.keras.layers.Dense(100, activation='relu')
output_1 = d(inputs)
output_2 = d(inputs)
model = tf.keras.models.Model(
   inputs=inputs, outputs=[output_1, output_2])
model.compile(optimizer="Adam", loss='sparse_categorical_crossentropy', metrics=["mae", "acc"])
history = model.fit(x, (y1, y2), epochs=5)
model.metrics_names

Train on 7033 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


['loss',
 'dense_15_loss',
 'dense_15_1_loss',
 'dense_15_mae',
 'dense_15_acc',
 'dense_15_1_mae',
 'dense_15_1_acc']