Let's try to describe the meaningful data of each match as a vector.

In [1]:
from fetcher import Fetcher
import numpy as np

fetch = Fetcher("data/database.sqlite")

In [2]:
player, attrs = fetch.get_player_data(505942)
attrs[0].keys()

dict_keys(['id', 'player_fifa_api_id', 'player_api_id', 'date', 'overall_rating', 'potential', 'preferred_foot', 'attacking_work_rate', 'defensive_work_rate', 'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy', 'long_passing', 'ball_control', 'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina', 'strength', 'long_shots', 'aggression', 'interceptions', 'positioning', 'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes'])

In [3]:
import datetime

# is date a before date b?
def is_before (a, b):
    a_dt = datetime.datetime.strptime(a, '%Y-%m-%d %H:%M:%S')
    b_dt = datetime.datetime.strptime(b, '%Y-%m-%d %H:%M:%S')
    return a_dt <= b_dt

In [4]:
seen_players = {}

# Summarize the player (at a given point in time)
# to a single vector
def player_vector (api_id, match_date):
    player_cols = ['height', 'weight']
    # Manually removed some that look suspicious or are non-numerical
    attr_cols = ['overall_rating', 'potential',
             'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys', 
             'dribbling', 'curve', 'free_kick_accuracy', 'long_passing', 'ball_control', 
             'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power', 
             'jumping', 'stamina', 'strength', 'long_shots', 'aggression', 'interceptions', 'positioning', 
             'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle']
    
    if not api_id in seen_players:
        seen_players[api_id] = fetch.get_player_data(api_id)
    player, attrs = seen_players[api_id]
    
    if len(attrs) == 0:
        raise ValueError
    
    # let's find the most recent dict before match_date
    cur_attrs = None
    for d in attrs:
        if is_before(d["date"], match_date):
            if cur_attrs == None or is_before(cur_attrs["date"], d["date"]):
                cur_attrs = d
                
    # something is better than nothing
    if cur_attrs == None:
        cur_attrs = attrs[0]
                
    return [(player[col] if (col in player and player[col] != None) else 0) for col in player_cols] + \
           [(cur_attrs[col] if (col in cur_attrs and cur_attrs[col] != None) else 0) for col in attr_cols]

In [5]:
player_vector(148336, "2009-04-10 00:00:00")

[175.26,
 152,
 56,
 67,
 55,
 47,
 35,
 62,
 53,
 60,
 53,
 53,
 57,
 62,
 72,
 70,
 75,
 57,
 52,
 55,
 57,
 59,
 47,
 54,
 37,
 44,
 47,
 57,
 56,
 26,
 30,
 27]

In [6]:
team, attrs = fetch.get_team_data(9987)
print(team.keys())
print(attrs[0].keys())

dict_keys(['id', 'team_api_id', 'team_fifa_api_id', 'team_long_name', 'team_short_name'])
dict_keys(['id', 'team_fifa_api_id', 'team_api_id', 'date', 'buildUpPlaySpeed', 'buildUpPlaySpeedClass', 'buildUpPlayDribbling', 'buildUpPlayDribblingClass', 'buildUpPlayPassing', 'buildUpPlayPassingClass', 'buildUpPlayPositioningClass', 'chanceCreationPassing', 'chanceCreationPassingClass', 'chanceCreationCrossing', 'chanceCreationCrossingClass', 'chanceCreationShooting', 'chanceCreationShootingClass', 'chanceCreationPositioningClass', 'defencePressure', 'defencePressureClass', 'defenceAggression', 'defenceAggressionClass', 'defenceTeamWidth', 'defenceTeamWidthClass', 'defenceDefenderLineClass'])


In [7]:
seen_teams = {}

# Summarize the team (at a given point in time)
# to a single vector
def team_vector (api_id, match_date):
    team_cols = []
    # Again, removed some that are irrelevant, non-numerical or have many missing values
    attr_cols = ['buildUpPlaySpeed', 'buildUpPlayPassing', 'chanceCreationPassing',
             'chanceCreationCrossing', 'chanceCreationShooting','defencePressure', 
             'defenceAggression', 'defenceTeamWidth']
    
    if not api_id in seen_teams:
        seen_teams[api_id] = fetch.get_team_data(api_id)
    team, attrs = seen_teams[api_id]
    
    if len(attrs) == 0:
        raise ValueError
    
    cur_attrs = None
    for d in attrs:
        if is_before(d["date"], match_date):
            if cur_attrs == None or is_before(cur_attrs["date"], d["date"]):
                cur_attrs = d
                
    if cur_attrs == None:
        cur_attrs = attrs[0]
        
    return [(team[col] if col in team and team[col] != None else 0) for col in team_cols] + \
           [(cur_attrs[col] if col in cur_attrs and cur_attrs[col] != None else 0) for col in attr_cols]

In [8]:
team_vector(9987, "2050-01-23 01:23:45")

[58, 38, 30, 69, 56, 36, 57, 70]

In [9]:
# Let's summarize "things known before the match" as a vector X
# and "the result" as an one-hot coded vector 
def match_vector (match_dict):
    team_list = ["home_team_api_id", "away_team_api_id"]
    player_list = ["home_player_1", "home_player_2", "home_player_3", "home_player_4",
               "home_player_5", "home_player_6", "home_player_7", "home_player_8",
               "home_player_9", "home_player_10", "home_player_11",
               "away_player_1", "away_player_2", "away_player_3", "away_player_4",
               "away_player_5", "away_player_6", "away_player_7", "away_player_8",
               "away_player_9", "away_player_10", "away_player_11"]
    
    home_goal = match_dict["home_team_goal"]
    away_goal = match_dict["away_team_goal"]
    if home_goal > away_goal:
        y = np.array([1, 0, 0])
    elif home_goal == away_goal:
        y = np.array([0, 1, 0])
    else:
        y = np.array([0, 0, 1])
    
    ans = []
    for team in team_list:
        ans += team_vector(match_dict[team], match_dict["date"])
    for player in player_list:
        ans += player_vector(match_dict[player], match_dict["date"])
    return (np.array(ans), y)

In [10]:
matches = fetch.get_all_matches()
cur_match = matches[10006]
print(cur_match["home_team_goal"])
print(cur_match["away_team_goal"])
print(cur_match["date"])
print(cur_match["home_player_9"])
# print(player_vector(cur_match["home_player_9"], cur_match["date"]))
vec = match_vector(cur_match)
print(vec)
print(vec[0].shape)

1
2
2015-12-05 00:00:00
243164
(array([ 68.  ,  57.  ,  69.  ,  66.  ,  50.  ,  66.  ,  46.  ,  55.  ,
        71.  ,  40.  ,  70.  ,  44.  ,  49.  ,  65.  ,  71.  ,  40.  ,
       193.04, 196.  ,  81.  ,  81.  ,  14.  ,   8.  ,  15.  ,  32.  ,
        12.  ,  13.  ,  12.  ,  13.  ,  31.  ,  20.  ,  42.  ,  44.  ,
        44.  ,  79.  ,  47.  ,  22.  ,  73.  ,  43.  ,  76.  ,  11.  ,
        38.  ,  22.  ,   7.  ,  37.  ,  23.  ,  10.  ,  13.  ,  14.  ,
       172.72, 161.  ,  80.  ,  80.  ,  80.  ,  73.  ,  42.  ,  79.  ,
        65.  ,  87.  ,  74.  ,  67.  ,  70.  ,  84.  ,  84.  ,  76.  ,
        86.  ,  81.  ,  85.  ,  78.  ,  69.  ,  75.  ,  57.  ,  76.  ,
        75.  ,  72.  ,  78.  ,  78.  ,  66.  ,  67.  ,  64.  ,  64.  ,
       198.12, 196.  ,  84.  ,  84.  ,  45.  ,  57.  ,  93.  ,  74.  ,
        60.  ,  60.  ,  52.  ,  76.  ,  70.  ,  69.  ,  67.  ,  77.  ,
        57.  ,  83.  ,  42.  ,  92.  ,  46.  ,  62.  ,  84.  ,  82.  ,
        73.  ,  93.  ,  53.  ,  59.  ,  60.  

We now can turn each match into a vector of data known right before the match, and the outcome. Let's build a neural network that learns to predict the outcome.

In [11]:
import keras

from keras.models import Sequential
from keras.layers import Input, Activation, Dense
from keras.optimizers import Adam

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [12]:
matches_train = matches[10000:25000]
xs = []
ys = []
for i, match in enumerate(matches_train):
    try:
        (x, y) = match_vector(match)
        xs.append(x)
        ys.append(y)
    except ValueError:
        pass
    
x_train = np.array(xs[:-20])
y_train = np.array(ys[:-20])
x_test = np.array(xs[-20:])
y_test = np.array(ys[-20:])

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(11184, 720)
(11184, 3)
(20, 720)
(20, 3)


In [18]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
ss.fit(x_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [19]:
x_train_std = ss.transform(x_train)
x_test_std = ss.transform(x_test)

In [20]:
model = Sequential()
model.add(Dense(32, input_shape=(720,)))
model.add(Activation("relu"))
model.add(Dense(3))
model.add(Activation("softmax"))

model.compile(loss = "categorical_crossentropy", optimizer = Adam(lr = 1e-3), metrics = ["accuracy"])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 32)                23072     
_________________________________________________________________
activation_3 (Activation)    (None, 32)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 99        
_________________________________________________________________
activation_4 (Activation)    (None, 3)                 0         
Total params: 23,171
Trainable params: 23,171
Non-trainable params: 0
_________________________________________________________________


In [21]:
history = model.fit(x_train_std, y_train, batch_size=50, epochs=10, validation_split = 0.1)

Train on 10065 samples, validate on 1119 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
y_pred = model.predict(x_test)
print(y_test)
print(y_pred)

[[0 1 0]
 [1 0 0]
 [1 0 0]
 [0 1 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [0 1 0]
 [1 0 0]
 [0 0 1]
 [0 0 1]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [0 0 1]
 [0 1 0]
 [1 0 0]
 [1 0 0]
 [0 0 1]]
[[0.0000000e+00 1.0000000e+00 1.3044409e-36]
 [0.0000000e+00 1.0000000e+00 0.0000000e+00]
 [3.6182075e-18 1.0000000e+00 9.2212822e-33]
 [1.7147974e-38 1.0000000e+00 0.0000000e+00]
 [3.4212420e-37 1.0000000e+00 3.4924153e-18]
 [5.1278698e-28 1.0000000e+00 4.8993917e-23]
 [0.0000000e+00 1.0000000e+00 4.6731232e-31]
 [4.7632807e-37 7.1321469e-06 9.9999285e-01]
 [0.0000000e+00 1.0000000e+00 2.5260076e-29]
 [2.8897485e-36 1.0000000e+00 3.4390935e-35]
 [0.0000000e+00 1.0000000e+00 0.0000000e+00]
 [6.0034620e-21 1.0000000e+00 1.2633368e-16]
 [0.0000000e+00 1.0000000e+00 7.2821755e-19]
 [1.1621419e-15 1.0000000e+00 7.5787879e-21]
 [2.1217663e-11 1.0000000e+00 1.2562794e-20]
 [5.6753185e-36 1.0000000e+00 4.6641744e-12]
 [0.0000000e+00 1.0000000e+00 7.8207024e-22]
 [1.4588803e-32 1.0000000e+00 0.0000000e+00]
 [1.33962