This notebook started as a copy of the base_network notebook.
Here we add, to each match and for both teams, recent-form attributes such as
"average result (win = 1, draw = 0, loss = -1) of this team over its last X home matches".

Let's try to describe the meaningful data of each match as a vector.

In [1]:
from fetcher import Fetcher
import numpy as np

# Single shared data-access object: every later cell reads players, teams
# and matches through it. It is pointed at data/database.sqlite.
fetch = Fetcher("data/database.sqlite")

In [2]:
# Peek at one player to see which attribute columns are available.
# get_player_data returns a (player, attrs) pair; attrs is indexable and its
# entries are dicts that carry a "date" key (see the keys printed below).
player, attrs = fetch.get_player_data(505942)
attrs[0].keys()

dict_keys(['id', 'player_fifa_api_id', 'player_api_id', 'date', 'overall_rating', 'potential', 'preferred_foot', 'attacking_work_rate', 'defensive_work_rate', 'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy', 'long_passing', 'ball_control', 'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina', 'strength', 'long_shots', 'aggression', 'interceptions', 'positioning', 'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes'])

In [3]:
import datetime

# is date a before (or equal to) date b?
def is_before(a, b):
    """Return True when timestamp `a` is not later than timestamp `b`.

    Both arguments are strings in 'YYYY-MM-DD HH:MM:SS' form. Equal
    timestamps count as "before". A string in any other form makes
    strptime raise ValueError.
    """
    stamp_format = '%Y-%m-%d %H:%M:%S'
    # Parse `a` first, then `b`, so a malformed `a` is reported first.
    first, second = [datetime.datetime.strptime(stamp, stamp_format) for stamp in (a, b)]
    return not (first > second)

In [4]:
# Cache of fetch.get_player_data results, keyed by player api id, so each
# player is fetched at most once.
seen_players = {}

# Summarize the player (at a given point in time)
# to a single vector
def player_vector(api_id, match_date):
    """Return a flat list of numeric features for one player as of `match_date`.

    The list holds the player's static columns (height, weight) followed by
    the 30 attribute columns of the attribute record that applies to the match.

    Parameters
    ----------
    api_id
        Player id, passed to ``fetch.get_player_data``.
    match_date : str
        Timestamp in the 'YYYY-MM-DD HH:MM:SS' form that ``is_before`` parses.

    Returns
    -------
    list
        2 + 30 values; a column that is missing or None is replaced by 0.

    Raises
    ------
    ValueError
        If the player has no attribute records at all.
    """
    player_cols = ['height', 'weight']
    # Manually removed some that look suspicious or are non-numerical
    attr_cols = ['overall_rating', 'potential',
             'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys',
             'dribbling', 'curve', 'free_kick_accuracy', 'long_passing', 'ball_control',
             'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power',
             'jumping', 'stamina', 'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
             'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle']

    if api_id not in seen_players:
        seen_players[api_id] = fetch.get_player_data(api_id)
    player, attrs = seen_players[api_id]

    if len(attrs) == 0:
        raise ValueError("no attribute records for player %r" % (api_id,))

    # Find the most recent record dated at or before match_date.
    cur_attrs = None
    for d in attrs:
        if is_before(d["date"], match_date):
            if cur_attrs is None or is_before(cur_attrs["date"], d["date"]):
                cur_attrs = d

    # Something is better than nothing: when every record is dated after the
    # match, use the earliest one (the closest in time to the match) rather
    # than whichever record happens to come first in the list.
    if cur_attrs is None:
        cur_attrs = attrs[0]
        for d in attrs:
            if not is_before(cur_attrs["date"], d["date"]):
                cur_attrs = d

    return [(player[col] if (col in player and player[col] is not None) else 0) for col in player_cols] + \
           [(cur_attrs[col] if (col in cur_attrs and cur_attrs[col] is not None) else 0) for col in attr_cols]

In [5]:
# Sanity check: feature vector for one player as of a 2009 match date
# (2 player columns followed by 30 attribute columns).
player_vector(148336, "2009-04-10 00:00:00")

[175.26,
 152,
 56,
 67,
 55,
 47,
 35,
 62,
 53,
 60,
 53,
 53,
 57,
 62,
 72,
 70,
 75,
 57,
 52,
 55,
 57,
 59,
 47,
 54,
 37,
 44,
 47,
 57,
 56,
 26,
 30,
 27]

In [6]:
# Same inspection for a team: list the columns of the team record and of its
# first attribute record.
team, attrs = fetch.get_team_data(9987)
print(team.keys())
print(attrs[0].keys())

dict_keys(['id', 'team_api_id', 'team_fifa_api_id', 'team_long_name', 'team_short_name'])
dict_keys(['id', 'team_fifa_api_id', 'team_api_id', 'date', 'buildUpPlaySpeed', 'buildUpPlaySpeedClass', 'buildUpPlayDribbling', 'buildUpPlayDribblingClass', 'buildUpPlayPassing', 'buildUpPlayPassingClass', 'buildUpPlayPositioningClass', 'chanceCreationPassing', 'chanceCreationPassingClass', 'chanceCreationCrossing', 'chanceCreationCrossingClass', 'chanceCreationShooting', 'chanceCreationShootingClass', 'chanceCreationPositioningClass', 'defencePressure', 'defencePressureClass', 'defenceAggression', 'defenceAggressionClass', 'defenceTeamWidth', 'defenceTeamWidthClass', 'defenceDefenderLineClass'])


In [7]:
# Cache of fetch.get_team_data results, keyed by team api id, so each team
# is fetched at most once.
seen_teams = {}

# Summarize the team (at a given point in time)
# to a single vector
def team_vector(api_id, match_date):
    """Return a flat list of numeric features for one team as of `match_date`.

    The list holds the columns named in ``team_cols`` (currently none)
    followed by the 8 attribute columns of the attribute record that applies
    to the match.

    Parameters
    ----------
    api_id
        Team id, passed to ``fetch.get_team_data``.
    match_date : str
        Timestamp in the 'YYYY-MM-DD HH:MM:SS' form that ``is_before`` parses.

    Returns
    -------
    list
        8 values; a column that is missing or None is replaced by 0.

    Raises
    ------
    ValueError
        If the team has no attribute records at all.
    """
    team_cols = []
    # Again, removed some that are irrelevant, non-numerical or have many missing values
    attr_cols = ['buildUpPlaySpeed', 'buildUpPlayPassing', 'chanceCreationPassing',
             'chanceCreationCrossing', 'chanceCreationShooting','defencePressure',
             'defenceAggression', 'defenceTeamWidth']

    if api_id not in seen_teams:
        seen_teams[api_id] = fetch.get_team_data(api_id)
    team, attrs = seen_teams[api_id]

    if len(attrs) == 0:
        raise ValueError("no attribute records for team %r" % (api_id,))

    # Find the most recent record dated at or before match_date.
    cur_attrs = None
    for d in attrs:
        if is_before(d["date"], match_date):
            if cur_attrs is None or is_before(cur_attrs["date"], d["date"]):
                cur_attrs = d

    # When every record is dated after the match, use the earliest one (the
    # closest in time to the match) rather than whichever record happens to
    # come first in the list.
    if cur_attrs is None:
        cur_attrs = attrs[0]
        for d in attrs:
            if not is_before(cur_attrs["date"], d["date"]):
                cur_attrs = d

    return [(team[col] if col in team and team[col] is not None else 0) for col in team_cols] + \
           [(cur_attrs[col] if col in cur_attrs and cur_attrs[col] is not None else 0) for col in attr_cols]

In [8]:
# Sanity check with a date far in the future (2050), so presumably every
# attribute record of the team qualifies and the most recent one is used.
team_vector(9987, "2050-01-23 01:23:45")

[58, 38, 30, 69, 56, 36, 57, 70]

Now comes the part that is different from the previous notebooks. Let's preprocess the list of all matches and calculate these extra values for each match.

In [9]:
matches = fetch.get_all_matches()
lookbacks = [5, 10, 20]

# Averages used when a team has no earlier home / away match on record.
NO_HISTORY_HOME_AVG = 0.5
NO_HISTORY_AWAY_AVG = 0.3

# home_matches[api_id] should be a list consisting of numbers
# 1, 0, -1 (win, draw, loss from that team's point of view): results of its
# earlier home matches in chronological order. away_matches is the same for
# away matches.
home_matches = {}
away_matches = {}

# match_avgs[match_api_id] is a list of 2 * len(lookbacks) values:
# the home team's average home result over each lookback window, followed by
# the away team's average away result over each lookback window.
match_avgs = {}

# The running histories are only meaningful if matches are visited oldest
# first, so iterate over a date-sorted copy. The 'YYYY-MM-DD HH:MM:SS' strings
# sort chronologically as plain text. `matches` itself keeps its original
# order, so index-based cells below are unaffected.
for match_dict in sorted(matches, key=lambda m: m["date"]):
    home_team = match_dict["home_team_api_id"]
    away_team = match_dict["away_team_api_id"]

    home_history = home_matches.setdefault(home_team, [])
    away_history = away_matches.setdefault(away_team, [])

    # Features are computed BEFORE this match's result is appended, so they
    # only describe matches visited earlier in the loop.
    cur_avgs = [np.mean(home_history[-lb:]) if home_history else NO_HISTORY_HOME_AVG
                for lb in lookbacks]
    cur_avgs += [np.mean(away_history[-lb:]) if away_history else NO_HISTORY_AWAY_AVG
                 for lb in lookbacks]
    match_avgs[match_dict["match_api_id"]] = cur_avgs

    home_goal = match_dict["home_team_goal"]
    away_goal = match_dict["away_team_goal"]
    if home_goal > away_goal:
        winner = 1
    elif home_goal == away_goal:
        winner = 0
    else:
        winner = -1

    # Store the result from each team's own point of view.
    home_history.append(winner)
    away_history.append(-winner)

print(match_avgs[1992095])
print(home_matches[10192][-20:])

[0.2, 0.6, 0.55, 0.6, 0.6, 0.5]
[1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1]


We can add this new data to the end of the "match vector"

In [10]:
# Let's summarize "things known before the match" as a vector X
# and "the result" as a one-hot encoded vector y
def match_vector(match_dict):
    """Turn one match into an (X, y) pair of numpy arrays.

    X is 1-D: the two team vectors (home, then away), the 22 player vectors
    (home players 1-11, then away players 1-11) and finally the precomputed
    averages stored in ``match_avgs`` for this match.
    y is a one-hot array of length 3 ordered [home win, draw, away win].

    Any ValueError raised by team_vector / player_vector propagates to the
    caller.
    """
    one_hot_rows = ([1, 0, 0], [0, 1, 0], [0, 0, 1])

    goals_home = match_dict["home_team_goal"]
    goals_away = match_dict["away_team_goal"]
    if goals_home > goals_away:
        outcome = 0
    elif goals_home == goals_away:
        outcome = 1
    else:
        outcome = 2
    y = np.array(one_hot_rows[outcome])

    team_keys = ("home_team_api_id", "away_team_api_id")
    player_keys = ["{}_player_{}".format(side, slot)
                   for side in ("home", "away")
                   for slot in range(1, 12)]

    features = []
    for key in team_keys:
        features.extend(team_vector(match_dict[key], match_dict["date"]))
    for key in player_keys:
        features.extend(player_vector(match_dict[key], match_dict["date"]))
    features.extend(match_avgs[match_dict["match_api_id"]])
    return (np.array(features), y)

In [11]:
# Spot-check a single match (index 10006 in the fetched list): print the raw
# score, the date and one player id, then the assembled (X, y) pair and the
# shape of X.
cur_match = matches[10006]
print(cur_match["home_team_goal"])
print(cur_match["away_team_goal"])
print(cur_match["date"])
print(cur_match["home_player_9"])
# print(player_vector(cur_match["home_player_9"], cur_match["date"]))
vec = match_vector(cur_match)
print(vec)
print(vec[0].shape)

1
2
2015-12-05 00:00:00
243164
(array([ 68.  ,  57.  ,  69.  ,  66.  ,  50.  ,  66.  ,  46.  ,  55.  ,
        71.  ,  40.  ,  70.  ,  44.  ,  49.  ,  65.  ,  71.  ,  40.  ,
       193.04, 196.  ,  81.  ,  81.  ,  14.  ,   8.  ,  15.  ,  32.  ,
        12.  ,  13.  ,  12.  ,  13.  ,  31.  ,  20.  ,  42.  ,  44.  ,
        44.  ,  79.  ,  47.  ,  22.  ,  73.  ,  43.  ,  76.  ,  11.  ,
        38.  ,  22.  ,   7.  ,  37.  ,  23.  ,  10.  ,  13.  ,  14.  ,
       172.72, 161.  ,  80.  ,  80.  ,  80.  ,  73.  ,  42.  ,  79.  ,
        65.  ,  87.  ,  74.  ,  67.  ,  70.  ,  84.  ,  84.  ,  76.  ,
        86.  ,  81.  ,  85.  ,  78.  ,  69.  ,  75.  ,  57.  ,  76.  ,
        75.  ,  72.  ,  78.  ,  78.  ,  66.  ,  67.  ,  64.  ,  64.  ,
       198.12, 196.  ,  84.  ,  84.  ,  45.  ,  57.  ,  93.  ,  74.  ,
        60.  ,  60.  ,  52.  ,  76.  ,  70.  ,  69.  ,  67.  ,  77.  ,
        57.  ,  83.  ,  42.  ,  92.  ,  46.  ,  62.  ,  84.  ,  82.  ,
        73.  ,  93.  ,  53.  ,  59.  ,  60.  

We can now turn each match into a vector of data known right before the match, together with the outcome. Let's build a neural network that learns to predict the outcome.

In [12]:
import keras

from keras.models import Sequential
from keras.layers import Input, Activation, Dense
from keras.optimizers import Adam

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [14]:
# Work on a fixed window of the fetched matches; the last n_test usable
# samples of that window are held out as the test set.
matches_train = matches[10000:25000]
n_test = 2000

xs = []
ys = []
skipped = 0
for i, match in enumerate(matches_train):
    if i % 100 == 0:
        print(i)

    try:
        (x, y) = match_vector(match)
    except ValueError:
        # Raised e.g. when a team or player of this match has no attribute
        # records. Such matches are left out, but counted so the loss of
        # data is visible instead of silent.
        skipped += 1
        continue
    xs.append(x)
    ys.append(y)

print("skipped %d of %d matches (ValueError while building the vector)"
      % (skipped, len(matches_train)))

x_train = np.array(xs[:-n_test])
y_train = np.array(ys[:-n_test])
x_test = np.array(xs[-n_test:])
y_test = np.array(ys[-n_test:])

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
(9204, 726)
(9204, 3)
(2000, 726)
(2000, 3)


In [15]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, so the test rows do not
# influence the scaling statistics.
ss = StandardScaler()
ss.fit(x_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [16]:
# Standardize both splits with the scaler that was fitted on x_train.
x_train_std = ss.transform(x_train)
x_test_std = ss.transform(x_test)

In [39]:
# Take the input width from the data instead of hard-coding 726, so the model
# still builds when lookbacks or the feature column lists change.
n_features = x_train_std.shape[1]

# One hidden ReLU layer of 128 units, then a 3-way softmax matching the
# one-hot [home win, draw, away win] targets.
model = Sequential()
model.add(Dense(128, input_shape=(n_features,)))
model.add(Activation("relu"))
model.add(Dense(3))
model.add(Activation("softmax"))

model.compile(loss = "categorical_crossentropy", optimizer = Adam(lr = 1e-5), metrics = ["accuracy"])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_11 (Dense)             (None, 128)               93056     
_________________________________________________________________
activation_11 (Activation)   (None, 128)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 3)                 387       
_________________________________________________________________
activation_12 (Activation)   (None, 3)                 0         
Total params: 93,443
Trainable params: 93,443
Non-trainable params: 0
_________________________________________________________________


In [40]:
# Train for 40 epochs in mini-batches of 100. validation_split = 0.1 holds out
# a tenth of the training rows for validation (8283 train / 921 validate in
# the log below).
history = model.fit(x_train_std, y_train, batch_size=100, epochs=40, validation_split = 0.1)

Train on 8283 samples, validate on 921 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [42]:
# Score the held-out test split. With metrics = ["accuracy"] set in compile,
# the result is [loss, accuracy].
model.evaluate(x_test_std, y_test)



[1.0472947301864624, 0.5175]