In [1]:
import math
import numpy as np
import pandas as pd

In [2]:
df_deliveries = pd.read_csv('../csv_files/deliveries.csv')
pd.options.display.max_columns = None

In [3]:
df_deliveries.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed'],
      dtype='object')

In [4]:
df_deliveries[['wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type','other_player_dismissed']] = df_deliveries[['wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type','other_player_dismissed']].fillna(0)

In [5]:
df_deliveries.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.1,England,New Zealand,JM Bairstow,DJ Malan,TA Boult,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0
1,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.2,England,New Zealand,JM Bairstow,DJ Malan,TA Boult,6,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0
2,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.3,England,New Zealand,JM Bairstow,DJ Malan,TA Boult,1,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0
3,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.4,England,New Zealand,DJ Malan,JM Bairstow,TA Boult,1,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0
4,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.5,England,New Zealand,JM Bairstow,DJ Malan,TA Boult,4,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0


In [6]:
df_deliveries.isna().sum()

match_id                  0
season                    0
start_date                0
venue                     0
innings                   0
ball                      0
batting_team              0
bowling_team              0
striker                   0
non_striker               0
bowler                    0
runs_off_bat              0
extras                    0
wides                     0
noballs                   0
byes                      0
legbyes                   0
penalty                   0
wicket_type               0
player_dismissed          0
other_wicket_type         0
other_player_dismissed    0
dtype: int64

In [7]:
partnership_runs = 0
current_innings = None

def is_wicket(row):
    return row['wicket_type'] != 0

for index, row in df_deliveries.iterrows():
    # if innings has changed
    if row['innings'] != current_innings:
        current_innings = row['innings']
        partnership_runs = 0
        
    # if wicket has fallen
    if is_wicket(row):  
        df_deliveries.at[index, 'partnership_runs'] = partnership_runs
        partnership_runs = 0
        continue
    else:
        partnership_runs += row['runs_off_bat']
        
    df_deliveries.at[index, 'partnership_runs'] = partnership_runs


In [8]:
df_deliveries.head(100)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed,partnership_runs
0,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.1,England,New Zealand,JM Bairstow,DJ Malan,TA Boult,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0
1,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.2,England,New Zealand,JM Bairstow,DJ Malan,TA Boult,6,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,6.0
2,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.3,England,New Zealand,JM Bairstow,DJ Malan,TA Boult,1,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,7.0
3,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.4,England,New Zealand,DJ Malan,JM Bairstow,TA Boult,1,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,8.0
4,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.5,England,New Zealand,JM Bairstow,DJ Malan,TA Boult,4,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,15.4,England,New Zealand,JE Root,HC Brook,JDS Neesham,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,14.0
96,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,15.5,England,New Zealand,JE Root,HC Brook,JDS Neesham,1,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,15.0
97,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,15.6,England,New Zealand,HC Brook,JE Root,JDS Neesham,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,15.0
98,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,16.1,England,New Zealand,JE Root,HC Brook,R Ravindra,1,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,16.0


In [9]:
df_deliveries = df_deliveries[df_deliveries['wicket_type'] != 0]

In [10]:
df_deliveries.head(100)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed,partnership_runs
47,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,7.4,England,New Zealand,DJ Malan,JM Bairstow,MJ Henry,0,0,0.0,0.0,0.0,0.0,0.0,caught,DJ Malan,0.0,0.0,38.0
78,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,12.5,England,New Zealand,JM Bairstow,JE Root,MJ Santner,0,0,0.0,0.0,0.0,0.0,0.0,caught,JM Bairstow,0.0,0.0,24.0
103,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,16.6,England,New Zealand,HC Brook,JE Root,R Ravindra,0,0,0.0,0.0,0.0,0.0,0.0,caught,HC Brook,0.0,0.0,30.0
130,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,21.2,England,New Zealand,MM Ali,JE Root,GD Phillips,0,0,0.0,0.0,0.0,0.0,0.0,bowled,MM Ali,0.0,0.0,23.0
202,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,33.2,England,New Zealand,JC Buttler,JE Root,MJ Henry,0,0,0.0,0.0,0.0,0.0,0.0,caught,JC Buttler,0.0,0.0,70.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3529,7,2023/24,2023-10-10,"Himachal Pradesh Cricket Association Stadium, ...",1,44.3,England,Bangladesh,HC Brook,SM Curran,Mahedi Hasan,0,0,0.0,0.0,0.0,0.0,0.0,caught,HC Brook,0.0,0.0,20.0
3542,7,2023/24,2023-10-10,"Himachal Pradesh Cricket Association Stadium, ...",1,46.4,England,Bangladesh,SM Curran,CR Woakes,Mahedi Hasan,0,0,0.0,0.0,0.0,0.0,0.0,caught,SM Curran,0.0,0.0,7.0
3553,7,2023/24,2023-10-10,"Himachal Pradesh Cricket Association Stadium, ...",1,48.3,England,Bangladesh,AU Rashid,CR Woakes,Mahedi Hasan,0,0,0.0,0.0,0.0,0.0,0.0,caught,AU Rashid,0.0,0.0,18.0
3558,7,2023/24,2023-10-10,"Himachal Pradesh Cricket Association Stadium, ...",1,49.2,England,Bangladesh,CR Woakes,MA Wood,Taskin Ahmed,0,0,0.0,0.0,0.0,0.0,0.0,caught,CR Woakes,0.0,0.0,10.0


In [11]:
df_deliveries = df_deliveries.drop(['innings','season','start_date','batting_team', 'bowling_team', 'ball','runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty','other_wicket_type','other_player_dismissed','player_dismissed','match_id'],axis='columns')

In [12]:
df_deliveries.columns

Index(['venue', 'striker', 'non_striker', 'bowler', 'wicket_type',
       'partnership_runs'],
      dtype='object')

In [13]:
cat = [col for col in df_deliveries if df_deliveries[col].dtype == 'object']
cat

['venue', 'striker', 'non_striker', 'bowler', 'wicket_type']

In [14]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [15]:
mapping = {}

for column in cat:
    df_deliveries[column] = le.fit_transform(df_deliveries[column])
    mapping[column] = dict(zip(le.classes_, le.transform(le.classes_)))

In [16]:
for column_name in cat:
    print(f'Mapping for column "{column_name}":')
    for key, value in mapping[column_name].items():
        print(f'{key}: {value}')
    print()


Mapping for column "venue":
Arun Jaitley Stadium, Delhi: 0
Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow: 1
Eden Gardens, Kolkata: 2
Himachal Pradesh Cricket Association Stadium, Dharamsala: 3
M Chinnaswamy Stadium, Bengaluru: 4
MA Chidambaram Stadium, Chepauk, Chennai: 5
Maharashtra Cricket Association Stadium, Pune: 6
Narendra Modi Stadium, Ahmedabad: 7
Rajiv Gandhi International Stadium, Uppal, Hyderabad: 8
Wankhede Stadium, Mumbai: 9

Mapping for column "striker":
A Dutt: 0
A Zampa: 1
AAP Atkinson: 2
AD Mathews: 3
AK Markram: 4
AT Carey: 5
AT Nidamanuru: 6
AU Rashid: 7
Abdullah Shafique: 8
Azmatullah Omarzai: 9
BA Stokes: 10
BFW de Leede: 11
BKG Mendis: 12
Babar Azam: 13
C Green: 14
C Karunaratne: 15
CAK Rajitha: 16
CBRLS Kumara: 17
CN Ackermann: 18
CR Woakes: 19
D Madushanka: 20
DA Miller: 21
DA Warner: 22
DJ Malan: 23
DJ Mitchell: 24
DJ Willey: 25
DM de Silva: 26
DN Wellalage: 27
DP Conway: 28
FDM Karunaratne: 29
Fakhar Zaman: 30
Fazalhaq Farooqi: 31
G Coe

In [20]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Models-------------------------
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb


In [18]:
X = df_deliveries[['venue', 'striker',
       'non_striker', 'bowler', 'wicket_type']]
y = df_deliveries['partnership_runs']

In [19]:

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30, shuffle=True)

In [20]:
# size_scaler = preprocessing.StandardScaler().fit(X_train)
# X_train_scaled = size_scaler.transform(X_train)
# X_test_scaled = size_scaler.transform(X_test)
# X_train_scaled.shape, X_test_scaled.shape

In [22]:
model_dict = {
    'LinearRegression': {"model": LinearRegression(), "params": {}},
    'RandomForestRegressor': {"model": RandomForestRegressor(),
                     "params": {'n_estimators': list(range(5, 100, 5)), 'max_depth': list(range(1, 40, 2))}},
    'XGBRegressor': {"model": XGBRegressor(), "params": {'n_estimators': list(range(10, 800, 50)), 'learning_rate': [0.001, 0.01, 0.1]}},
    'PolynomialFeatures': {"model": make_pipeline(PolynomialFeatures(), LinearRegression()),
                      "params": {'polynomialfeatures__degree': [10,2,3,4,5,6,7,8,9]}},
    'CatBoostRegressor': {"model": CatBoostRegressor(iterations=100, learning_rate=0.01), "params": {}},
    'LGBMRegressor': {"model": lgb.LGBMRegressor(learning_rate=0.01), "params": {}}
}


In [23]:
from sklearn.model_selection import cross_val_score

def eval_models_with_cv():
    model_results = pd.DataFrame()
    model_results['Train_RMSE'] = None
    model_results['Test_RMSE'] = None
    model_results['Train_MAE'] = None
    model_results['Test_MAE'] = None
    model_results['best_params'] = None
    best_test_score = math.inf

    for model_name, reg_model in model_dict.items():
        classifier = GridSearchCV(reg_model['model'], reg_model['params'], n_jobs=20, verbose=0)
        classifier.fit(X_train, y_train)
        best_model = classifier.best_estimator_

        y_train_predicted = best_model.predict(X_train)
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_predicted))
        train_mae = mean_absolute_error(y_train, y_train_predicted)

        print(model_name, train_rmse, classifier.best_params_)

        # Cross-validation
        cv_rmse = np.sqrt(-cross_val_score(best_model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)).mean()
        cv_mae = -cross_val_score(best_model, X_train, y_train, scoring='neg_mean_absolute_error', cv=5).mean()

        y_predicted = best_model.predict(X_test)
        test_rmse = np.sqrt(mean_squared_error(y_test, y_predicted))
        test_mae = mean_absolute_error(y_test, y_predicted)

        if test_rmse < best_test_score:
            best_test_score = test_rmse
            best_reg_model_ours = best_model

        model_results.loc[model_name, ['Train_RMSE', 'Test_RMSE', 'Train_MAE', 'Test_MAE', 'best_params']] = [train_rmse, test_rmse, train_mae, test_mae, classifier.best_params_]

        model_results.loc[model_name, 'CV_RMSE'] = cv_rmse
        model_results.loc[model_name, 'CV_MAE'] = cv_mae

    print("Best model: ", best_reg_model_ours)
    y_predicted = best_reg_model_ours.predict(X_test)

    return model_results, best_reg_model_ours


In [24]:
model_results,best_reg_model_ours = eval_models_with_cv()
model_results

LinearRegression 34.65908769120329 {}
RandomForestRegressor 34.588527989791736 {'max_depth': 1, 'n_estimators': 10}
XGBRegressor 34.21890709807862 {'learning_rate': 0.001, 'n_estimators': 60}
PolynomialFeatures 33.75669064632213 {'polynomialfeatures__degree': 2}
0:	learn: 35.0169557	total: 148ms	remaining: 14.6s
1:	learn: 34.9803756	total: 150ms	remaining: 7.35s
2:	learn: 34.9468335	total: 152ms	remaining: 4.91s
3:	learn: 34.9311856	total: 153ms	remaining: 3.67s
4:	learn: 34.8959176	total: 154ms	remaining: 2.93s
5:	learn: 34.8593216	total: 155ms	remaining: 2.44s
6:	learn: 34.8328061	total: 157ms	remaining: 2.08s
7:	learn: 34.7987063	total: 158ms	remaining: 1.81s
8:	learn: 34.7622622	total: 160ms	remaining: 1.61s
9:	learn: 34.7376607	total: 161ms	remaining: 1.45s
10:	learn: 34.6896536	total: 162ms	remaining: 1.31s
11:	learn: 34.6556633	total: 163ms	remaining: 1.2s
12:	learn: 34.6240881	total: 164ms	remaining: 1.1s
13:	learn: 34.5935585	total: 165ms	remaining: 1.01s
14:	learn: 34.5720133

Unnamed: 0,Train_RMSE,Test_RMSE,Train_MAE,Test_MAE,best_params,CV_RMSE,CV_MAE
LinearRegression,34.659088,35.216249,23.936419,25.297031,{},34.890614,24.674784
RandomForestRegressor,34.588528,35.139742,24.083427,25.461369,"{'max_depth': 1, 'n_estimators': 10}",35.002275,25.054476
XGBRegressor,34.218907,34.888726,23.840794,25.408988,"{'learning_rate': 0.001, 'n_estimators': 60}",34.623219,24.668728
PolynomialFeatures,33.756691,35.596403,23.106571,26.249535,{'polynomialfeatures__degree': 2},35.003618,24.533398
CatBoostRegressor,32.577972,34.814018,22.456021,25.214043,{},34.718769,24.460518
LGBMRegressor,31.472982,35.051249,21.339451,25.183157,{},34.910357,24.425512


In [25]:
print(best_reg_model_ours)

<catboost.core.CatBoostRegressor object at 0x0000020D72C43350>


In [26]:
y_predicted = best_reg_model_ours.predict(X_test)

In [27]:
classifier=best_reg_model_ours
classifier.fit(X_train,y_train)

0:	learn: 35.0169557	total: 1.07ms	remaining: 106ms
1:	learn: 34.9803756	total: 1.98ms	remaining: 97.1ms
2:	learn: 34.9468335	total: 2.54ms	remaining: 82ms
3:	learn: 34.9311856	total: 3.02ms	remaining: 72.6ms
4:	learn: 34.8959176	total: 3.58ms	remaining: 68ms
5:	learn: 34.8593216	total: 4.12ms	remaining: 64.6ms
6:	learn: 34.8328061	total: 4.72ms	remaining: 62.8ms
7:	learn: 34.7987063	total: 5.25ms	remaining: 60.4ms
8:	learn: 34.7622622	total: 5.99ms	remaining: 60.6ms
9:	learn: 34.7376607	total: 6.48ms	remaining: 58.3ms
10:	learn: 34.6896536	total: 6.96ms	remaining: 56.3ms
11:	learn: 34.6556633	total: 7.48ms	remaining: 54.8ms
12:	learn: 34.6240881	total: 8.13ms	remaining: 54.4ms
13:	learn: 34.5935585	total: 8.74ms	remaining: 53.7ms
14:	learn: 34.5720133	total: 9.23ms	remaining: 52.3ms
15:	learn: 34.5349757	total: 9.95ms	remaining: 52.3ms
16:	learn: 34.5148503	total: 10.5ms	remaining: 51.3ms
17:	learn: 34.4872757	total: 11.3ms	remaining: 51.3ms
18:	learn: 34.4666196	total: 11.8ms	remaini

<catboost.core.CatBoostRegressor at 0x20d72c43350>

In [31]:
from tensorflow import keras
import keras.layers as tfl
from keras.layers import BatchNormalization

In [32]:
inputs=tfl.Input(shape=(5,))
x=tfl.Dense(512,activation='relu')(inputs)
x=BatchNormalization()(x)
x=tfl.Dense(256,activation='relu')(x)
x=BatchNormalization()(x)
x=tfl.Dense(128,activation='relu')(x)
x=BatchNormalization()(x)
x=tfl.Dense(64,activation='relu')(x)
x=BatchNormalization()(x)
x=tfl.Dense(32,activation='relu')(x)
x=BatchNormalization()(x)
x=tfl.Dense(16,activation='relu')(x)
x=BatchNormalization()(x)
x=tfl.Dense(8,activation='relu')(x)
x=BatchNormalization()(x)
x=tfl.Dense(4,activation='relu')(x)
x=BatchNormalization()(x)
outputs=tfl.Dense(1,activation='linear')(x)
classifier_dnn=keras.Model(inputs,outputs)

classifier_dnn.compile(
    loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error']
)

classifier_dnn.fit(X_train, y_train,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x20d00aee690>

In [28]:
import pickle
pickle_out = open("../pickle_files/run_partnership.pkl","wb")
pickle.dump(classifier, pickle_out)
pickle_out.close()