In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as mp
from pylab import show
import seaborn as sns
import matplotlib.ticker as ticker
import matplotlib.ticker as plticker
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the
# libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Read the two raw CSV files (paths are relative to the notebook's working
# directory) into the `matches` and `deliveries` frames.
matches, deliveries = (
    pd.read_csv(csv_path) for csv_path in ('matches.csv', 'deliveries.csv')
)

In [None]:
# Swap every NaN in both frames for an empty string. The frames are rebound
# to the cleaned copies, so the raw NaN-bearing versions are no longer
# reachable after this cell.
matches, deliveries = (
    frame.replace(np.nan, '', regex=True) for frame in (matches, deliveries)
)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

# Build the modelling frame from an independent copy of `deliveries`,
# keeping only regular-play rows (super-over deliveries are discarded).
total_and_balls = deliveries.loc[deliveries['is_super_over'] != 1].copy()

# Encode the player-name columns as integer ids. One fit per column is
# enough; the encoder is not used for inverse mapping anywhere below.
labelencoder = LabelEncoder()
for player_col in ('batsman', 'bowler', 'non_striker'):
    total_and_balls[player_col] = labelencoder.fit_transform(total_and_balls[player_col])

# Columns that are not used as model features.
total_and_balls = total_and_balls.drop(
    ['is_super_over', 'wide_runs', 'bye_runs', 'legbye_runs', 'noball_runs',
     'penalty_runs', 'batsman_runs', 'extra_runs', 'player_dismissed',
     'dismissal_kind', 'fielder'],
    axis=1,
)

# Target column: the value of `score` on the last delivery of each innings,
# broadcast to every delivery of that innings. Grouping by (match_id, inning)
# stops first-innings rows from being labelled with the second innings'
# total, and covers every match id present instead of a hard-coded 1..636.
# NOTE(review): `score` is not created in this notebook; it is assumed to be
# a column of deliveries.csv holding the running innings total — confirm.
total_and_balls['final_score'] = (
    total_and_balls
    .groupby(['match_id', 'inning'], sort=False)['score']
    .transform('last')
)

# Fit a full-rank PCA on features + target; inspected in the next cell.
pca = PCA().fit(total_and_balls)

In [None]:
# Show the principal axes of the PCA fitted on `total_and_balls`
# (one row per component, one column per input column).
print(pca.components_)

In [None]:
# Hold out match 7: keep its rows in `eval_match` for the final projection
# plot and remove them from the frame used for training/testing.
# NOTE: the in-place drop makes this cell non-idempotent — re-running it
# leaves `eval_match` empty.
is_holdout = total_and_balls['match_id'] == 7
eval_match = total_and_balls.loc[is_holdout]
indexNames = eval_match.index
total_and_balls.drop(indexNames, inplace=True)

In [None]:
from sklearn.model_selection import train_test_split

# Last column is the target (`final_score`); everything before it is a feature.
data_matrix = total_and_balls.values
X, y = data_matrix[:, :-1], data_matrix[:, -1]

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [None]:
from sklearn.linear_model import LinearRegression
# Linear-regression baseline: 10-fold cross-validation on the training split,
# then a fit on the full training split scored against itself.
model = LinearRegression(n_jobs=-1)
cross_val_acc = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)

model.fit(X_train, y_train)
pred = model.predict(X_train)
train_acc, train_acc_mean = r2_score(y_train, pred), mean_squared_error(y_train, pred)

print(
    f'R square score of the model on train data: {train_acc}',
    f'Mean squared error of the model on train data: {train_acc_mean}',
    '',
    f'Cross Validation Accuracy of the model: {cross_val_acc.mean()}',
    '',
    sep='\n',
)

In [None]:
# Score the linear model on the held-out test split. `model` must still be
# the LinearRegression instance from the previous cell; the fitted estimator
# is kept as `lin` for the final projection plot.
lin = model.fit(X_train, y_train)
pred = lin.predict(X_test)
test_acc, test_acc_mean = r2_score(y_test, pred), mean_squared_error(y_test, pred)
print(
    f'R square score of the model on test data: {test_acc}',
    f'Mean squared error of the model on test data: {test_acc_mean}',
    sep='\n',
)

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Sweep k = 1..20 for a distance-weighted KNN regressor, recording the mean
# 10-fold cross-validation score and the R2 on the training split for each k.
neighbors = list(range(1, 21))
accuracies = []
train_accuracies = []

for neighbor in neighbors:
    model = KNeighborsRegressor(n_neighbors=neighbor, weights='distance', p=2, n_jobs=-1)
    acc = cross_val_score(model, X_train, y_train, cv=10)

    model.fit(X_train, y_train)
    pred = model.predict(X_train)
    train_acc = r2_score(y_train, pred)

    accuracies.append(acc.mean())
    train_accuracies.append(train_acc)

In [None]:
# Plot cross-validation and training scores against k, then report the k
# with the highest cross-validation score (first one wins on ties).
tck = list(range(1, 21))

fig = mp.figure(figsize=(8, 4))
mp.grid()
for series, colour, caption in (
    (accuracies, 'blue', 'Cross Validation Accuracy'),
    (train_accuracies, 'red', 'Training R2 Score'),
):
    mp.plot(neighbors, series, c=colour, label=caption)
mp.scatter(neighbors, accuracies)
mp.title('K Neighbours Regressors')
mp.xlabel("No of Neighbors")
mp.ylabel("Accuracy")
mp.xticks(tck)
mp.legend()
show()

max_n, max_acc = max(zip(neighbors, accuracies), key=lambda pair: pair[1])
print(f'Maximum Cross Validation accuracy - Neighbors: {max_n}, Accuracy: {max_acc}')

In [None]:
# Refit KNN with k fixed at 2 and score it on the test split; the fitted
# estimator is kept as `knr` for the final projection plot.
# NOTE(review): k is hard-coded rather than taken from `max_n` — confirm it
# matches the sweep result.
model = KNeighborsRegressor(n_neighbors=2, weights='distance', p=2, n_jobs=-1)
knr = model.fit(X_train, y_train)
pred = knr.predict(X_test)
test_acc, test_acc_mean = r2_score(y_test, pred), mean_squared_error(y_test, pred)
print(
    f'R square score of the model on test data: {test_acc}',
    f'Mean squared error of the model on test data: {test_acc_mean}',
    sep='\n',
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Grid over forest size and tree depth for a RandomForestRegressor.
# For every (n_estimators, max_depth) pair we record:
#   accuracies        - mean 10-fold cross-validation score on the training split
#   train_accuracies  - R2 of the fitted forest on the training split
#   outputs           - the (estimator, depth) pair, aligned with the two lists
estimators = [10,50,100,1000]
depths = [10,100]
accuracies = []
outputs = []
train_accuracies = []

for estimator in estimators:
    for depth in depths:
        model = RandomForestRegressor(n_estimators = estimator, max_depth=depth, random_state=0, n_jobs=-1)
        acc = cross_val_score(model, X_train,y_train, cv=10)
        accuracies.append(acc.mean())
        outputs.append((estimator,depth))
        pred = model.fit(X_train,y_train).predict(X_train)
        train_acc = r2_score(y_train, pred)
        # Store the score itself (the list was previously appended to itself).
        train_accuracies.append(train_acc)
        print(f'Estimators: {estimator}, Max Dept: {depth}, Training Accuracy: {train_acc}, Cross Validation Accuracy: {acc.mean()}')

# Best configuration by cross-validation score (first one wins on ties).
max_acc = max(accuracies)
index = accuracies.index(max_acc)
est, dep = outputs[index]
print(f'Maximum Cross Validation Accuracy: {max_acc}, No of estimators: {est}, Max Depth: {dep}')

In [None]:
from mpl_toolkits.mplot3d import Axes3D
# 3D view of the random-forest grid: x = number of trees, y = max depth,
# z = mean cross-validation score. `depths` is rebound here to one entry
# per grid point (it no longer holds the two grid values).
op_array = []
ests = [pair[0] for pair in outputs]
depths = [pair[1] for pair in outputs]

fig = mp.figure(figsize=(10, 5))
ax = mp.axes(projection="3d")
for draw in (ax.plot3D, ax.scatter3D):
    draw(ests, depths, accuracies)
ax.set_xlabel('No of Estimators')
ax.set_ylabel('Max Depth')
ax.set_zlabel('Accuracy')
ax.set_title('Random Forest Regressors')
mp.show()

In [None]:
# Refit a 1000-tree, depth-100 forest and score it on the test split; the
# fitted estimator is kept as `rfr` for the final projection plot.
model = RandomForestRegressor(n_estimators=1000, max_depth=100, random_state=0)
rfr = model.fit(X_train, y_train)
pred = rfr.predict(X_test)
test_acc, test_acc_mean = r2_score(y_test, pred), mean_squared_error(y_test, pred)
print(
    f'R square score of the model on test data: {test_acc}',
    f'Mean squared error of the model on test data: {test_acc_mean}',
    sep='\n',
)

In [None]:
# Display the per-feature importances of whatever estimator `model` is bound
# to; when cells are run in order this is the random forest fitted just above.
model.feature_importances_

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale every column (features and target) for the neural networks,
# then split with the same test size and seed as the unscaled split.
# NOTE(review): the scaler is fitted on all rows, including those that end
# up in the test split.
scaler = MinMaxScaler()
total_and_balls_scaled = scaler.fit(total_and_balls).transform(total_and_balls)

X_nn, y_nn = total_and_balls_scaled[:, :-1], total_and_balls_scaled[:, -1]

X_train_nn, X_test_nn, y_train_nn, y_test_nn = train_test_split(
    X_nn, y_nn, test_size=0.2, random_state=1
)

In [None]:
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import SimpleRNN

# Keras LSTM layers expect (samples, timesteps, features); each delivery is
# fed as a sequence of length 1.
X_trains = np.expand_dims(X_train_nn, axis=1)
X_tests = np.expand_dims(X_test_nn, axis=1)

# Network 1: LSTM(50) -> Dense(50, relu) -> Dense(50, relu) -> Dense(1).
regressor = Sequential([
    LSTM(50, input_shape=X_trains.shape[1:]),
    Dense(50, activation='relu'),
    Dense(50, activation='relu'),
    Dense(1),
])
regressor.compile(loss='mean_squared_error', optimizer='adam')
history1 = regressor.fit(X_trains, y_train_nn, epochs=50, batch_size=32, verbose=1)


In [None]:
# Network 2: LSTM(100) -> Dense(50, relu) -> Dense(50, relu) -> Dense(1).
regressor2 = Sequential([
    LSTM(100, input_shape=X_trains.shape[1:]),
    Dense(50, activation='relu'),
    Dense(50, activation='relu'),
    Dense(1),
])
regressor2.compile(loss='mean_squared_error', optimizer='adam')
history2 = regressor2.fit(X_trains, y_train_nn, epochs=50, batch_size=32, verbose=1)


In [None]:
# Network 3: LSTM(50) -> Dense(1), no hidden dense layers.
regressor3 = Sequential([
    LSTM(50, input_shape=X_trains.shape[1:]),
    Dense(1),
])
regressor3.compile(loss='mean_squared_error', optimizer='adam')
history3 = regressor3.fit(X_trains, y_train_nn, epochs=50, batch_size=32, verbose=1)


In [None]:
# Network 4: LSTM(100) -> Dense(1), no hidden dense layers.
regressor4 = Sequential([
    LSTM(100, input_shape=X_trains.shape[1:]),
    Dense(1),
])
regressor4.compile(loss='mean_squared_error', optimizer='adam')
history4 = regressor4.fit(X_trains, y_train_nn, epochs=50, batch_size=32, verbose=1)


In [None]:
# Network 5: two stacked LSTM(50) layers -> Dense(1). The first LSTM returns
# its full sequence so the second LSTM receives 3-D input.
regressor5 = Sequential([
    LSTM(50, input_shape=X_trains.shape[1:], return_sequences=True),
    LSTM(50),
    Dense(1),
])
regressor5.compile(loss='mean_squared_error', optimizer='adam')
history5 = regressor5.fit(X_trains, y_train_nn, epochs=50, batch_size=32, verbose=1)


In [None]:
# Network 6: two stacked LSTM(100) layers -> Dense(1). The first LSTM returns
# its full sequence so the second LSTM receives 3-D input.
regressor6 = Sequential([
    LSTM(100, input_shape=X_trains.shape[1:], return_sequences=True),
    LSTM(100),
    Dense(1),
])
regressor6.compile(loss='mean_squared_error', optimizer='adam')
history6 = regressor6.fit(X_trains, y_train_nn, epochs=50, batch_size=32, verbose=1)


In [None]:
# Network 7: LSTM(100) -> three Dense(100, relu) layers -> Dense(1).
regressor7 = Sequential([
    LSTM(100, input_shape=X_trains.shape[1:]),
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    Dense(1),
])
regressor7.compile(loss='mean_squared_error', optimizer='adam')
history7 = regressor7.fit(X_trains, y_train_nn, epochs=50, batch_size=32, verbose=1)


In [None]:
# Network 8: LSTM(200) -> three Dense(100, relu) layers -> Dense(1).
# This is the network used for the test metrics and the final projection.
regressor8 = Sequential([
    LSTM(200, input_shape=X_trains.shape[1:]),
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    Dense(1),
])
regressor8.compile(loss='mean_squared_error', optimizer='adam')
history8 = regressor8.fit(X_trains, y_train_nn, epochs=50, batch_size=32, verbose=1)


In [None]:
# Network 9: LSTM(200) -> three Dense(200, relu) layers -> Dense(1).
regressor9 = Sequential([
    LSTM(200, input_shape=X_trains.shape[1:]),
    Dense(200, activation='relu'),
    Dense(200, activation='relu'),
    Dense(200, activation='relu'),
    Dense(1),
])
regressor9.compile(loss='mean_squared_error', optimizer='adam')
history9 = regressor9.fit(X_trains, y_train_nn, epochs=50, batch_size=32, verbose=1)


In [None]:
# Network 10: LSTM(300) -> three Dense(200, relu) layers -> Dense(1).
regressor10 = Sequential([
    LSTM(300, input_shape=X_trains.shape[1:]),
    Dense(200, activation='relu'),
    Dense(200, activation='relu'),
    Dense(200, activation='relu'),
    Dense(1),
])
regressor10.compile(loss='mean_squared_error', optimizer='adam')
history10 = regressor10.fit(X_trains, y_train_nn, epochs=50, batch_size=32, verbose=1)


In [None]:
# Compare the per-epoch training loss (MSE) of the ten LSTM networks.
print(history1.history.keys())

# (training history, legend label, line colour) for each network, in order.
loss_curves = (
    (history1, 'One L50Two D50', '#1f77b4'),
    (history2, 'One L100 Two D50', '#ff7f0e'),
    (history3, 'One L50', '#2ca02c'),
    (history4, 'One L100', '#d62728'),
    (history5, 'Two L50', '#9467bd'),
    (history6, 'Two L100', '#8c564b'),
    (history7, 'One L100 Three D100', '#e377c2'),
    (history8, 'One L200 Three D100', '#7f7f7f'),
    (history9, 'One L200 Three D200', '#bcbd22'),
    (history10, 'One L300 Three D200', '#17becf'),
)

fig = mp.figure(figsize=(12, 6))
mp.grid()
for run, caption, colour in loss_curves:
    mp.plot(run.history['loss'], label=caption, c=colour)
mp.title('Training accuracy')
mp.ylabel('Mean Squared Error Loss')
mp.xlabel('No of Epochs')
mp.legend()
mp.show()

In [None]:
# Score network 8 on the scaled test split (metrics are in scaled units).
pred = regressor8.predict(X_tests)
test_acc, test_acc_mean = r2_score(y_test_nn, pred), mean_squared_error(y_test_nn, pred)
print(
    f'R square score of the model on test data: {test_acc}',
    f'Mean squared error of the model on test data: {test_acc_mean}',
    sep='\n',
)


In [None]:
# Score projection for the held-out match: predict the final total after
# every first-innings delivery with each fitted model and plot the curves.
fig = mp.figure(figsize=(12,6))

# First-innings rows of the held-out match; the last column is the target.
eval_match = eval_match.loc[eval_match['inning'] == 1]
X_eval = eval_match.values[:,:-1]
y_eval = eval_match.values[:,-1]

# x axis: 1-based count of deliveries bowled so far.
total_deliveries = len(X_eval)
balls = list(range(1, total_deliveries + 1))

# Naive baseline: runs per delivery so far, extrapolated to the number of
# deliveries in this innings. The last feature column is read as the runs
# scored so far.
runrate_proj = [
    X_eval[i][-1] / (i + 1) * total_deliveries for i in range(total_deliveries)
]

pred_lin = lin.predict(X_eval)

pred_knr = knr.predict(X_eval)

pred_rfr = rfr.predict(X_eval)

# Scale the evaluation rows with the scaler as it was fitted for training.
# Re-fitting it here (as was done before) would scale these rows by this one
# innings' own min/max and overwrite the fitted scaler. The scaler was fitted
# on features + target, so transform all columns and drop the target after.
X_evals_scaled = scaler.transform(eval_match)[:, :-1]
X_evals = X_evals_scaled.reshape((X_evals_scaled.shape[0], 1, X_evals_scaled.shape[1]))
pred_rnn_norm = regressor8.predict(X_evals)

# Map the network output back to runs using the target column's fitted
# bounds (default 0..1 feature range: x = x_scaled * range + min). This is
# one vectorised step yielding one value per delivery.
min_y = scaler.data_min_[-1]
max_y = scaler.data_max_[-1]
diff = scaler.data_range_[-1]
pred_rnn = pred_rnn_norm.ravel() * diff + min_y

mp.grid()
mp.xlabel('Deliveries Bowled')
mp.ylabel('Predicted Final Innings Total')
mp.title('Score Projection - Mumbai Indians vs Kolkata Knight Riders 2017 Season')
mp.plot(balls,pred_lin,c='yellow', label='Linear Regression')
mp.plot(balls,pred_knr,c='red', label='K Nearest Neighbor Regressor')
mp.plot(balls,pred_rfr,c='blue', label='Random Forest Regressor')
mp.plot(balls,pred_rnn,c='green', label='LSTM - RNN')
mp.plot(balls,runrate_proj,c='orange', label='Run Rate Projection')
mp.plot(balls,y_eval,c='violet', label='Actual Final Score')
mp.xticks(np.arange(0, max(balls)+1, 5))
mp.legend()
show()
