Let's try to describe the meaningful data of each match as a vector.

In [1]:
from fetcher import Fetcher
import numpy as np

# Data-access object built on the SQLite file; every later cell reads
# players, teams and matches through `fetch`.
fetch = Fetcher("data/database.sqlite")

In [2]:
# Fetch one player and list the keys available on an attribute record,
# to decide which columns can go into the feature vector.
player, attrs = fetch.get_player_data(505942)
attrs[0].keys()

dict_keys(['id', 'player_fifa_api_id', 'player_api_id', 'date', 'overall_rating', 'potential', 'preferred_foot', 'attacking_work_rate', 'defensive_work_rate', 'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy', 'long_passing', 'ball_control', 'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina', 'strength', 'long_shots', 'aggression', 'interceptions', 'positioning', 'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes'])

In [3]:
import datetime

# Is timestamp a at or before timestamp b?
def is_before(a, b):
    """Return True when timestamp ``a`` is not later than timestamp ``b``.

    Both arguments are strings in ``'%Y-%m-%d %H:%M:%S'`` format. The
    comparison is inclusive: two equal timestamps yield True.

    Raises:
        ValueError: propagated from ``strptime`` when either string does
            not match the expected format.
    """
    fmt = '%Y-%m-%d %H:%M:%S'
    first, second = (datetime.datetime.strptime(stamp, fmt) for stamp in (a, b))
    return not first > second

In [38]:
# Cache of fetch.get_player_data results keyed by player api id, so each
# player is fetched at most once per kernel session.
seen_players = {}


def player_vector(api_id, match_date):
    """Summarize a player at a given point in time as a flat list of numbers.

    Args:
        api_id: player id passed to ``fetch.get_player_data``.
        match_date: timestamp string in the format ``is_before`` parses
            (``'%Y-%m-%d %H:%M:%S'``).

    Returns:
        A list of 32 values: ``height`` and ``weight`` from the player
        record, followed by the 30 columns of ``attr_cols`` taken from the
        selected attribute record. A column that is absent or None is
        replaced by 0.

    Raises:
        ValueError: if the player has no attribute records at all.
    """
    player_cols = ['height', 'weight']
    # Manually removed some that look suspicious or are non-numerical
    attr_cols = ['overall_rating', 'potential',
                 'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys',
                 'dribbling', 'curve', 'free_kick_accuracy', 'long_passing', 'ball_control',
                 'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power',
                 'jumping', 'stamina', 'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
                 'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle']

    if api_id not in seen_players:
        seen_players[api_id] = fetch.get_player_data(api_id)
    player, attrs = seen_players[api_id]

    if len(attrs) == 0:
        raise ValueError("no attribute records for player %r" % (api_id,))

    # Pick the most recent record dated at or before match_date. is_before
    # is inclusive, so among records with equal dates the later one in
    # `attrs` wins.
    cur_attrs = None
    for d in attrs:
        if is_before(d["date"], match_date):
            if cur_attrs is None or is_before(cur_attrs["date"], d["date"]):
                cur_attrs = d

    # Something is better than nothing: when every record is dated after
    # match_date, fall back to the first one. NOTE: that record comes from
    # after the match, so the features are not strictly "known beforehand".
    if cur_attrs is None:
        cur_attrs = attrs[0]

    return [(player[col] if (col in player and player[col] is not None) else 0) for col in player_cols] + \
           [(cur_attrs[col] if (col in cur_attrs and cur_attrs[col] is not None) else 0) for col in attr_cols]

In [39]:
# Sanity check: the feature vector of one player as of a given date.
player_vector(148336, "2009-04-10 00:00:00")

[175.26,
 152,
 56,
 67,
 55,
 47,
 35,
 62,
 53,
 60,
 53,
 53,
 57,
 62,
 72,
 70,
 75,
 57,
 52,
 55,
 57,
 59,
 47,
 54,
 37,
 44,
 47,
 57,
 56,
 26,
 30,
 27]

In [40]:
team, attrs = fetch.get_team_data(9987)
print(team.keys())
print(attrs[0].keys())

dict_keys(['id', 'team_api_id', 'team_fifa_api_id', 'team_long_name', 'team_short_name'])
dict_keys(['id', 'team_fifa_api_id', 'team_api_id', 'date', 'buildUpPlaySpeed', 'buildUpPlaySpeedClass', 'buildUpPlayDribbling', 'buildUpPlayDribblingClass', 'buildUpPlayPassing', 'buildUpPlayPassingClass', 'buildUpPlayPositioningClass', 'chanceCreationPassing', 'chanceCreationPassingClass', 'chanceCreationCrossing', 'chanceCreationCrossingClass', 'chanceCreationShooting', 'chanceCreationShootingClass', 'chanceCreationPositioningClass', 'defencePressure', 'defencePressureClass', 'defenceAggression', 'defenceAggressionClass', 'defenceTeamWidth', 'defenceTeamWidthClass', 'defenceDefenderLineClass'])


In [41]:
# Cache of fetch.get_team_data results keyed by team api id, so each
# team is fetched at most once per kernel session.
seen_teams = {}


def team_vector(api_id, match_date):
    """Summarize a team at a given point in time as a flat list of numbers.

    Args:
        api_id: team id passed to ``fetch.get_team_data``.
        match_date: timestamp string in the format ``is_before`` parses
            (``'%Y-%m-%d %H:%M:%S'``).

    Returns:
        A list with one value per entry of ``team_cols`` (currently none)
        followed by the 8 columns of ``attr_cols`` taken from the selected
        attribute record. A column that is absent or None is replaced by 0.

    Raises:
        ValueError: if the team has no attribute records at all.
    """
    # No column of the team record itself is used at the moment.
    team_cols = []
    # Again, removed some that are irrelevant, non-numerical or have many missing values
    attr_cols = ['buildUpPlaySpeed', 'buildUpPlayPassing', 'chanceCreationPassing',
                 'chanceCreationCrossing', 'chanceCreationShooting', 'defencePressure',
                 'defenceAggression', 'defenceTeamWidth']

    if api_id not in seen_teams:
        seen_teams[api_id] = fetch.get_team_data(api_id)
    team, attrs = seen_teams[api_id]

    if len(attrs) == 0:
        raise ValueError("no attribute records for team %r" % (api_id,))

    # Pick the most recent record dated at or before match_date. is_before
    # is inclusive, so among records with equal dates the later one in
    # `attrs` wins.
    cur_attrs = None
    for d in attrs:
        if is_before(d["date"], match_date):
            if cur_attrs is None or is_before(cur_attrs["date"], d["date"]):
                cur_attrs = d

    # When every record is dated after match_date, fall back to the first
    # one. NOTE: that record comes from after the match.
    if cur_attrs is None:
        cur_attrs = attrs[0]

    return [(team[col] if col in team and team[col] is not None else 0) for col in team_cols] + \
           [(cur_attrs[col] if col in cur_attrs and cur_attrs[col] is not None else 0) for col in attr_cols]

In [42]:
# Sanity check with a far-future date: every record dated up to 2050
# qualifies, so this returns the team's latest such attribute record.
team_vector(9987, "2050-01-23 01:23:45")

[58, 38, 30, 69, 56, 36, 57, 70]

In [43]:
# Encode one match as a pair (X, y): X collects the things known before
# the match, y is the result as a one-hot encoded vector.
def match_vector(match_dict):
    """Turn a match record into ``(features, label)`` numpy arrays.

    The feature array concatenates ``team_vector`` for the home and away
    team, then ``player_vector`` for home players 1-11 and away players
    1-11, all evaluated at the match's ``date``.

    The label is one-hot over three outcomes, in this order:
    ``[home win, draw, away win]``.

    Any exception raised by ``team_vector`` / ``player_vector`` (such as
    ``ValueError`` for missing attribute records) propagates to the caller.
    """
    team_keys = ["home_team_api_id", "away_team_api_id"]
    home_players = ["home_player_1", "home_player_2", "home_player_3", "home_player_4",
                    "home_player_5", "home_player_6", "home_player_7", "home_player_8",
                    "home_player_9", "home_player_10", "home_player_11"]
    away_players = ["away_player_1", "away_player_2", "away_player_3", "away_player_4",
                    "away_player_5", "away_player_6", "away_player_7", "away_player_8",
                    "away_player_9", "away_player_10", "away_player_11"]

    # Decide the outcome first, exactly as many goals as recorded.
    goals_home = match_dict["home_team_goal"]
    goals_away = match_dict["away_team_goal"]
    if goals_home > goals_away:
        one_hot = [1, 0, 0]
    elif goals_home == goals_away:
        one_hot = [0, 1, 0]
    else:
        one_hot = [0, 0, 1]
    y = np.array(one_hot)

    # Teams come first, then home players, then away players.
    sources = [(team_vector, key) for key in team_keys]
    sources += [(player_vector, key) for key in home_players + away_players]

    features = []
    for build_vector, key in sources:
        features.extend(build_vector(match_dict[key], match_dict["date"]))
    return (np.array(features), y)

In [33]:
# Load every match once (`matches` is reused by the dataset cell below)
# and inspect a single one end to end.
matches = fetch.get_all_matches()
cur_match = matches[10000]
print(cur_match["home_team_goal"])
print(cur_match["date"])
print(cur_match["home_player_9"])
# print(player_vector(cur_match["home_player_9"], cur_match["date"]))
# vec is the (features, one-hot label) pair; vec[0].shape is the input
# size the network must accept.
vec = match_vector(cur_match)
print(vec)
print(vec[0].shape)

4
2015-11-28 00:00:00
27314
(array([ 48.  ,  41.  ,  38.  ,  51.  ,  40.  ,  35.  ,  50.  ,  61.  ,
        55.  ,  46.  ,  47.  ,  53.  ,  51.  ,  37.  ,  48.  ,  49.  ,
       187.96, 183.  ,  80.  ,  82.  ,  15.  ,  10.  ,  13.  ,  25.  ,
        12.  ,  12.  ,  11.  ,  10.  ,  22.  ,  22.  ,  46.  ,  50.  ,
        43.  ,  80.  ,  41.  ,  21.  ,  69.  ,  32.  ,  55.  ,  14.  ,
        32.  ,  21.  ,  15.  ,  39.  ,  24.  ,  11.  ,  12.  ,  13.  ,
       175.26, 152.  ,  75.  ,  77.  ,  69.  ,  40.  ,  59.  ,  72.  ,
        61.  ,  68.  ,  59.  ,  50.  ,  64.  ,  72.  ,  75.  ,  74.  ,
        80.  ,  74.  ,  82.  ,  82.  ,  63.  ,  74.  ,  64.  ,  65.  ,
        76.  ,  77.  ,  49.  ,  60.  ,  51.  ,  77.  ,  78.  ,  80.  ,
       190.5 , 176.  ,  75.  ,  75.  ,  26.  ,  44.  ,  80.  ,  75.  ,
        42.  ,  63.  ,  39.  ,  41.  ,  73.  ,  65.  ,  65.  ,  72.  ,
        57.  ,  74.  ,  47.  ,  76.  ,  72.  ,  75.  ,  83.  ,  30.  ,
        74.  ,  72.  ,  34.  ,  37.  ,  51.  ,  

We can now turn each match into a vector of the data known right before the match, together with its outcome. Let's build a neural network that learns to predict the outcome from that vector.

In [34]:
import keras

In [69]:
from keras.models import Sequential
from keras.layers import Input, Activation, Dense
from keras.optimizers import Adam

# Small fully connected classifier over the match feature vector.
# 720 inputs = 2 teams x 8 team attributes + 22 players x 32 player
# features (see team_vector / player_vector); 3 softmax outputs match the
# one-hot outcome built by match_vector.
model = Sequential([
    Dense(32, input_shape=(720,)),
    Activation("relu"),
    Dense(32),
    Activation("relu"),
    Dense(3),
    Activation("softmax"),
])

model.compile(
    loss="categorical_crossentropy",
    optimizer=Adam(lr=1e-2),
    metrics=["accuracy"],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_28 (Dense)             (None, 32)                23072     
_________________________________________________________________
activation_27 (Activation)   (None, 32)                0         
_________________________________________________________________
dense_29 (Dense)             (None, 32)                1056      
_________________________________________________________________
activation_28 (Activation)   (None, 32)                0         
_________________________________________________________________
dense_30 (Dense)             (None, 3)                 99        
_________________________________________________________________
activation_29 (Activation)   (None, 3)                 0         
Total params: 24,227
Trainable params: 24,227
Non-trainable params: 0
_________________________________________________________________


In [47]:
# Build feature/label arrays from a slice of the matches.
# NOTE: despite its name, matches_train feeds both the training arrays and
# the held-out arrays below.
matches_train = matches[10000:25000]
N_TEST = 2000  # number of trailing usable matches held out for evaluation

xs = []
ys = []
n_skipped = 0
for match in matches_train:
    try:
        (x, y) = match_vector(match)
    except ValueError:
        # match_vector raises ValueError when a team or player has no
        # attribute records (and is_before does so for an unparsable
        # date); such matches are left out, but counted so the loss of
        # data is visible.
        n_skipped += 1
        continue
    xs.append(x)
    ys.append(y)

# Everything except the last N_TEST usable matches is used for training.
x_train = np.array(xs[:-N_TEST])
y_train = np.array(ys[:-N_TEST])
x_test = np.array(xs[-N_TEST:])
y_test = np.array(ys[-N_TEST:])

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
print("skipped %d of %d matches" % (n_skipped, len(matches_train)))

(9204, 720)
(9204, 3)
(2000, 720)
(2000, 3)


In [71]:
# Train for 20 epochs in batches of 50. x_test / y_test are passed as
# validation data, so the per-epoch validation metrics are computed on
# the held-out slice.
history = model.fit(x_train, y_train, batch_size=50, epochs=20, validation_data=(x_test, y_test))

Train on 9204 samples, validate on 2000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
