In [154]:
import pandas as pd
import numpy as np
from glob import glob
from random import randint

In [155]:
# This a parameter to adjust as required
MAX_MEMORY = 20

FIXED_FEATURE_COLUMNS = ["innings", "bat_position", "bat_avg", "bat_sr", "bat_arm", "bowl_avg", "bowl_sr", "bowl_type", "seam_factor", "spin_factor"]
DYNAMIC_FEATURE_COLUMNS = ["match_balls", "inn_balls", "team_wkts", "team_lead", "bat_score", "bat_balls", "bowl_balls", "bowl_runs", "bowl_wkts"]
data = pd.DataFrame(columns = FIXED_FEATURE_COLUMNS + DYNAMIC_FEATURE_COLUMNS + ["seq"])
Y = []

# Load batter-bowler sequences
path = "../data/interim/batter_bowler_sequences"
all_files = glob(path + "/*.csv")

for i,filename in enumerate(all_files):
    raw = pd.read_csv(filename)
    
    # Choose a random place in the sequence to predict from 
    seq_idx = randint(0, len(raw.index) - 1) if len(raw.index) > 1 else 0
    row = list(raw.iloc[seq_idx][FIXED_FEATURE_COLUMNS + DYNAMIC_FEATURE_COLUMNS])
    
    Y.append(raw.iloc[seq_idx]["outcome"])
    row.append(list(raw.iloc[:(seq_idx-1)]["outcome"]))
    
    data.loc[i] = row
    
# This code takes quite a while to run

KeyboardInterrupt: 

In [156]:
data

Unnamed: 0,innings,bat_position,bat_avg,bat_sr,bat_arm,bowl_avg,bowl_sr,bowl_type,seam_factor,spin_factor,match_balls,inn_balls,team_wkts,team_lead,bat_score,bat_balls,bowl_balls,bowl_runs,bowl_wkts,seq
0,1,3,38.02,44.70,Left-hand bat,26.95,53.7,rfm,28.815789,7.684211,43,43,2,18,11,20,19,8,1,"[0, 0, 0, 1, 0, 4, 0]"
1,1,3,38.02,44.70,Left-hand bat,27.76,48.4,lf,28.815789,7.684211,29,29,2,17,10,16,17,9,1,"[0, 0, 0, 0, 4, 0, 1, 0]"
2,1,11,14.00,43.71,Right-hand bat,27.76,48.4,lf,28.815789,7.684211,373,373,9,228,0,0,109,66,3,"[4, 0, 0]"
3,1,5,42.04,40.06,Right-hand bat,26.95,53.7,rfm,28.815789,7.684211,123,123,4,60,26,50,32,16,2,"[0, 0, 0, 0, 0, 3]"
4,1,5,42.04,40.06,Right-hand bat,27.76,48.4,lf,28.815789,7.684211,167,167,4,81,37,72,47,30,1,"[0, 4, 0, 0, 0, 1, 2, 0, 3, 0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10626,1,2,37.00,55.22,Right-hand bat,37.00,73.3,rfm,11.375000,21.125000,107,107,0,98,33,51,27,14,0,"[0, 1, 1, 0, 0, 1, 2, 0, 0, 0]"
10627,1,7,36.70,63.51,Right-hand bat,30.44,52.7,rf,11.375000,21.125000,567,567,5,332,16,34,93,56,0,"[2, 1, 0, 3, 3, 0, 0, 0]"
10628,1,7,36.70,63.51,Right-hand bat,29.83,37.4,rob,11.375000,21.125000,527,527,5,314,4,12,171,141,3,[]
10629,1,7,36.70,63.51,Right-hand bat,34.87,66.5,lf,11.375000,21.125000,554,554,5,329,14,26,137,60,1,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0]"


In [167]:
# Preprocess data
def one_hot(col):
    levels = list(set(col))
    encoded = np.zeros((len(col), len(levels)))
    
    for i,v in enumerate(col):
        encoded[i,levels.index(v)] = 1
    
    return encoded
    
def normalise(col):
    mmin = min(col)
    mmax = max(col)
    return [(float(i)-mmin)/(mmax-mmin) for i in col]
    

X = np.array((len(data.index, )))
CAT_COLS = ["innings", "bat_position", "bat_arm", "bowl_type"]
first = True
for c in data.columns[:-1]:
    new = one_hot(list(data[c])) if c in CAT_COLS else np.array(normalise(list(data[c])), ndmin = 2).transpose()
    if first:
        X = new
        first = False
    else:
        X = np.concatenate((X,new), axis=1)
        
# Pad sequences
padded = []
for seq in data["seq"]:
    row = [-1]*MAX_MEMORY
    
    for i,s in enumerate(reversed(seq)):
        if i > 20:
            break
        row[MAX_MEMORY-i-1] = 1 if s == "W" else 0
    
    padded.append(row)
padded = np.array(padded)
    
for i in range(0,MAX_MEMORY):
    X = np.concatenate((X, one_hot(padded[:,i])), axis=1)
 

# Encode output
Y = np.array([1 if y == "W" else 0 for y in Y])

In [170]:
Y = Y[:-1]

In [171]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding

model = Sequential()

model.add(Dense(64, input_dim=X.shape[1]))

# Recurrent layer
#model.add(LSTM(64, return_sequences=False, 
#               dropout=0.1, recurrent_dropout=0.1))

# Fully connected layer
model.add(Dense(64, activation='relu'))

# Dropout for regularization
model.add(Dropout(0.5))

# Output layer
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [172]:
model.fit(X,Y, epochs=10, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2471a7c0388>

In [141]:
# evaluate the keras model
_, accuracy = model.evaluate(X, Y)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 100.00
