### IPL 2023 Final: GT vs CSK - Interactive ML Model for Run & Wicket Prediction  

#### Data Collection & Preprocessing  
- Load match data for IPL 2023 Final  
- Handle missing values and clean the dataset  

#### Exploratory Data Analysis (EDA)  
- Visualize runs, wickets, and match trends  
- Understand key player contributions  

#### Feature Engineering  
- Create run rate and wicket-related features  
- Extract insights from past overs  

#### Model Selection & Training  
- Split data into training and test sets  
- Train a regression model for score prediction  

#### Interactive Prediction using ipywidgets  
- Build an interactive tool for predicting runs and wickets  
- Allow users to input overs, wickets, and other conditions  

#### Conclusion & Improvements  
- Analyze model performance  
- Suggest future improvements  


### Imports & Load dataset

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# Show DataFrames in full: no limit on rows, columns or cell width,
# and no wrapping of wide frames across several blocks.
for unlimited_option in ('display.max_rows', 'display.max_columns', 'max_colwidth'):
    pd.set_option(unlimited_option, None)
pd.set_option('display.expand_frame_repr', False)

In [3]:
# Load the ball-by-ball deliveries; the path is relative to the working directory.
ipl_df = pd.read_csv('IPL_ball_by_ball_2024.csv')

### Sneak peek into the data:

In [5]:
ipl_df.head(5)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,P Kumar,0,1,,,,1.0,,,,,
1,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.2,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,0,,,,,,,,,
2,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.3,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,1,1.0,,,,,,,,
3,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.4,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,0,,,,,,,,,
4,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.5,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,0,,,,,,,,,


In [6]:
ipl_df.batting_team.unique()

array(['Kolkata Knight Riders', 'Royal Challengers Bangalore',
       'Chennai Super Kings', 'Kings XI Punjab', 'Rajasthan Royals',
       'Delhi Daredevils', 'Deccan Chargers', 'Mumbai Indians',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Lucknow Super Giants', 'Gujarat Titans',
       'Royal Challengers Bengaluru'], dtype=object)

In [7]:
ipl_df.innings.unique()

array([1, 2, 3, 4, 5, 6], dtype=int64)

In [8]:
# Keep only innings 1 and 2 (the data also holds innings 3-6, presumably
# super overs - TODO confirm). .copy() makes the result an independent frame,
# so columns added to it later are not written into a slice of the loaded data.
ipl_df = ipl_df[ipl_df.innings.isin([1, 2])].copy()

In [9]:
ipl_df.innings.unique()

array([1, 2], dtype=int64)

### Additional Columns:


In [11]:
#1. total runs scored off each ball
#2. whether a wicket fell on that ball

In [12]:
# Total runs off each delivery = runs off the bat + extras.
# Plain column addition is vectorised; a row-wise apply(axis=1) computes the
# same values but calls a Python lambda once per delivery.
ipl_df['total_runs'] = ipl_df['runs_off_bat'] + ipl_df['extras']

In [13]:
ipl_df.head(20)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed,total_runs
0,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,P Kumar,0,1,,,,1.0,,,,,,1
1,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.2,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,0,,,,,,,,,,0
2,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.3,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,1,1.0,,,,,,,,,1
3,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.4,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,0,,,,,,,,,,0
4,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.5,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,0,,,,,,,,,,0
5,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.6,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,0,,,,,,,,,,0
6,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.7,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,P Kumar,0,1,,,,1.0,,,,,,1
7,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,1.1,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,Z Khan,0,0,,,,,,,,,,0
8,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,1.2,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,Z Khan,4,0,,,,,,,,,,4
9,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,1.3,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,Z Khan,4,0,,,,,,,,,,4


In [14]:
type(ipl_df.player_dismissed[11])

float

In [15]:
# Wicket flag per delivery: 1 when player_dismissed holds a value,
# 0 when it is missing (NaN).
ipl_df['isout'] = ipl_df['player_dismissed'].notna().astype(int)

### Choose Teams:

In [17]:
# t1 - bats first, t2 - bats second
# Ex: GT vs CSK final (match_id: 1370353)

t1= 'Gujarat Titans'
t2= 'Chennai Super Kings'

### Choose a match between these Teams:

In [19]:
ipl_df[(ipl_df.batting_team == t1) & (ipl_df.bowling_team == t2) & (ipl_df.innings ==1)].match_id.unique()

array([1370353, 1426297], dtype=int64)

In [20]:
matchID = 1370353 # Final match between GT and CSK

In [21]:
# All deliveries (both innings) of the chosen match.
mdf = ipl_df.loc[ipl_df['match_id'] == matchID]

In [22]:
mdf.tail(5)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed,total_runs,isout
243812,1370353,2023,2023-05-29,"Narendra Modi Stadium, Ahmedabad",2,14.2,Chennai Super Kings,Gujarat Titans,S Dube,RA Jadeja,MM Sharma,1,0,,,,,,,,,,1,0
243813,1370353,2023,2023-05-29,"Narendra Modi Stadium, Ahmedabad",2,14.3,Chennai Super Kings,Gujarat Titans,RA Jadeja,S Dube,MM Sharma,1,0,,,,,,,,,,1,0
243814,1370353,2023,2023-05-29,"Narendra Modi Stadium, Ahmedabad",2,14.4,Chennai Super Kings,Gujarat Titans,S Dube,RA Jadeja,MM Sharma,1,0,,,,,,,,,,1,0
243815,1370353,2023,2023-05-29,"Narendra Modi Stadium, Ahmedabad",2,14.5,Chennai Super Kings,Gujarat Titans,RA Jadeja,S Dube,MM Sharma,6,0,,,,,,,,,,6,0
243816,1370353,2023,2023-05-29,"Narendra Modi Stadium, Ahmedabad",2,14.6,Chennai Super Kings,Gujarat Titans,RA Jadeja,S Dube,MM Sharma,4,0,,,,,,,,,,4,0


In [23]:
# Renumber the match rows from 0. Reassigning (rather than inplace=True)
# avoids mutating a frame that was obtained by filtering ipl_df.
mdf = mdf.reset_index(drop=True)

In [24]:
mdf.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed,total_runs,isout
0,1370353,2023,2023-05-29,"Narendra Modi Stadium, Ahmedabad",1,0.1,Gujarat Titans,Chennai Super Kings,WP Saha,Shubman Gill,DL Chahar,0,0,,,,,,,,,,0,0
1,1370353,2023,2023-05-29,"Narendra Modi Stadium, Ahmedabad",1,0.2,Gujarat Titans,Chennai Super Kings,WP Saha,Shubman Gill,DL Chahar,0,0,,,,,,,,,,0,0
2,1370353,2023,2023-05-29,"Narendra Modi Stadium, Ahmedabad",1,0.3,Gujarat Titans,Chennai Super Kings,WP Saha,Shubman Gill,DL Chahar,1,0,,,,,,,,,,1,0
3,1370353,2023,2023-05-29,"Narendra Modi Stadium, Ahmedabad",1,0.4,Gujarat Titans,Chennai Super Kings,Shubman Gill,WP Saha,DL Chahar,1,0,,,,,,,,,,1,0
4,1370353,2023,2023-05-29,"Narendra Modi Stadium, Ahmedabad",1,0.5,Gujarat Titans,Chennai Super Kings,WP Saha,Shubman Gill,DL Chahar,1,0,,,,,,,,,,1,0


In [25]:
# Deliveries of the first and second innings of the selected match.
df1, df2 = (mdf.loc[mdf['innings'] == innings_no] for innings_no in (1, 2))

### Outcomes:

In [27]:
# Possible per-ball results: 0,1,2,3,4,5,6,7, 'W', 'extras'
# Only the ones listed here are used; total runs of 5 and 7 are left out.
outcomes = [0,1,2,3,4,6,'w'] 

In [28]:
ipl_df[ipl_df.batting_team == t1].total_runs.value_counts()

total_runs
1    2447
0    1662
4     698
2     387
6     270
5      15
3      14
7       1
Name: count, dtype: int64

In [29]:
ipl_df[ipl_df.batting_team == t1].isout.sum()

247

In [30]:
# How often each total-runs value occurred while each team was batting.
t1_outcomes_count, t2_outcomes_count = (
    ipl_df.loc[ipl_df['batting_team'] == team, 'total_runs'].value_counts()
    for team in (t1, t2)
)

In [31]:
# Number of deliveries flagged as a dismissal while each team was batting.
t1_outs = ipl_df.loc[ipl_df['batting_team'] == t1, 'isout'].sum()
t2_outs = ipl_df.loc[ipl_df['batting_team'] == t2, 'isout'].sum()

In [32]:
# Per-team occurrence counts, in the same order as `outcomes`.
t1_outcomes = []
t2_outcomes = []

for outcome in outcomes:
    if outcome == 'w':
        # Wickets come from the dismissal totals, not from the run counts.
        t1_outcomes.append(t1_outs)
        t2_outcomes.append(t2_outs)
    else:
        # .get(..., 0): a team that never produced this run value has no
        # entry in its value_counts(); count it as 0 instead of raising KeyError.
        t1_outcomes.append(t1_outcomes_count.get(outcome, 0))
        t2_outcomes.append(t2_outcomes_count.get(outcome, 0))
    

In [33]:
t1_outcomes, t2_outcomes

([1662, 2447, 387, 14, 698, 270, 247],
 [9676, 12105, 1955, 111, 3240, 1499, 1243])

In [34]:
# Turn the outcome counts into probabilities (each count / total count).
t1_total, t2_total = sum(t1_outcomes), sum(t2_outcomes)
t1_pb_outcomes = [count / t1_total for count in t1_outcomes]
t2_pb_outcomes = [count / t2_total for count in t2_outcomes]

In [35]:
t1_pb_outcomes

[0.2903056768558952,
 0.4274235807860262,
 0.06759825327510917,
 0.002445414847161572,
 0.12192139737991266,
 0.04716157205240175,
 0.04314410480349345]

In [36]:
# (0 --> 1)
# ex:
# dot - 34%
# ones - 34% + 39% = 73%

In [37]:
# Running totals of the outcome probabilities: entry k is the sum of the
# probabilities of outcomes 0..k, so the final entry is 1.
t1_pb_ls = np.cumsum(t1_pb_outcomes)
t2_pb_ls = np.cumsum(t2_pb_outcomes)

In [38]:
t1_pb_ls, t2_pb_ls

(array([0.29030568, 0.71772926, 0.78532751, 0.78777293, 0.90969432,
        0.9568559 , 1.        ]),
 array([0.32438231, 0.73019545, 0.79573569, 0.7994569 , 0.90807603,
        0.95832914, 1.        ]))

### Fetch probability values:

In [40]:
# GT = [0.29030568, 0.71772926, 0.78532751, 0.78777293, 0.90969432,0.9568559 , 1]
#CSK = [0.32438231, 0.73019545, 0.79573569, 0.7994569 , 0.90807603,0.95832914, 1]

In [41]:
def get_pbvalues(teamName):
    """Return the cumulative outcome-probability thresholds for a team.

    Args:
        teamName: 'GT' or 'CSK'.

    Returns:
        A 7-tuple (p_0, p_1, p_2, p_3, p_4, p_6, p_w) of cumulative
        probabilities, ordered as dot ball, 1, 2, 3, 4, 6 runs and wicket.
        The last entry is always 1.

    Raises:
        ValueError: if teamName is not one of the supported codes.
    """
    # Values copied from the cumulative sums computed earlier in the notebook.
    cumulative_pb = {
        'GT': (0.29030568, 0.71772926, 0.78532751, 0.78777293,
               0.90969432, 0.9568559, 1),
        'CSK': (0.32438231, 0.73019545, 0.79573569, 0.7994569,
                0.90807603, 0.95832914, 1),
    }

    if teamName not in cumulative_pb:
        # Previously an unknown name fell through both branches and failed
        # with an UnboundLocalError at the return statement.
        raise ValueError(
            'Unknown team ' + repr(teamName) + '; expected one of '
            + str(sorted(cumulative_pb))
        )

    return cumulative_pb[teamName]

### Runs prediction Model:

In [45]:
def predict_runs(target, curr_score, curr_wickets, curr_overs):
    """Simulate the rest of the chasing innings ball by ball.

    Each remaining ball draws a uniform random number and maps it onto an
    outcome (0, 1, 2, 3, 4, 6 runs or a wicket) using the cumulative
    probabilities returned by get_pbvalues('CSK'); the chasing side is
    hard-coded to CSK.

    Args:
        target: score to beat; the simulation stops once the predicted
            score exceeds it.
        curr_score: runs already scored.
        curr_wickets: wickets already lost.
        curr_overs: completed overs; must be an int, since the number of
            balls left is 120 - curr_overs * 6 and is passed to range().

    Returns:
        The predicted final score. The simulated wicket count is tracked
        only to end the innings and is not returned.
    """
    run_values = (0, 1, 2, 3, 4, 6)
    # Drop the last threshold (always 1): anything above the '6' threshold
    # is treated as a wicket.
    run_thresholds = get_pbvalues('CSK')[:len(run_values)]

    pred_runs = curr_score
    pred_wks = curr_wickets
    leftover_balls = 120 - curr_overs*6

    for _ in range(leftover_balls):
        # Checked before every ball so that an innings that is already over
        # (all out, or target already passed) is not simulated any further.
        # The old check fired only when the count hit exactly 10 after a
        # simulated wicket, so curr_wickets >= 10 kept on batting.
        if pred_wks >= 10 or pred_runs > target:
            break

        r_value = np.random.random()

        for runs, upper_bound in zip(run_values, run_thresholds):
            if r_value <= upper_bound:
                pred_runs += runs
                break
        else:
            # No run bucket matched: the ball is a wicket.
            pred_wks += 1

    return pred_runs



In [47]:
# predict_runs(target, curr_score, curr_wickets, curr_overs)
predict_runs(183, 0, 0, 0)

120

### Winner function:

In [50]:
def get_win(pred_runs, target):
    """Classify a predicted score against the target.

    Returns 'win' when pred_runs is above target, 'tie' when they are
    equal, and 'loss' otherwise.
    """
    if pred_runs == target:
        return 'tie'
    return 'win' if pred_runs > target else 'loss'
    

In [74]:
# Inputs: first-innings score to chase and the chasing side's current state.
# GT = 180/8
target = 180

curr_score = 72
curr_wickets = 2
curr_overs = 10

# Number of simulated innings.
iter_count = 100

runs_ls = []
results_ls = []

# Tally of simulated results; anything that is not a win or a tie is a loss.
tally = {'win': 0, 'tie': 0, 'loss': 0}

for i in range(iter_count):
    pred_runs = predict_runs(target, curr_score, curr_wickets, curr_overs)
    result_pred = get_win(pred_runs, target)
    runs_ls.append(pred_runs)
    results_ls.append(result_pred)
    tally[result_pred if result_pred in ('win', 'tie') else 'loss'] += 1

win_count, tie_count, loss_count = tally['win'], tally['tie'], tally['loss']

In [76]:
win_count, tie_count, loss_count

(1, 0, 99)

### Find out Runs at a required stage:

In [79]:
# find out runs:
def find_runs(curr_score, target, curr_wickets, at_overs):
    """Find the lowest score at `at_overs` from which the chase is won in
    at least 50 of 100 simulated innings.

    Candidate scores from curr_score up to and including target are tried
    in ascending order; each is simulated 100 times with predict_runs and
    classified with get_win.

    Args:
        curr_score: lowest candidate score.
        target: score being chased; also the highest candidate score.
        curr_wickets: wickets lost, passed through to predict_runs.
        at_overs: completed overs, passed through to predict_runs.

    Returns:
        The first candidate score with at least 50 simulated wins, or
        curr_score when no candidate reaches that mark (including when the
        candidate range is empty).
    """
    n_simulations = 100
    min_wins = 50

    for candidate_score in range(curr_score, target + 1):
        win_count = sum(
            get_win(predict_runs(target, candidate_score, curr_wickets, at_overs), target) == 'win'
            for _ in range(n_simulations)
        )
        # Candidates are tried in ascending order, so the first one that
        # qualifies is the answer; there is no need to simulate higher scores.
        if win_count >= min_wins:
            return candidate_score

    # No candidate qualified: fall back to the starting score.
    return curr_score

In [81]:
#find_runs(curr_score, target, curr_wickets, at_overs)
find_runs(72, 180, 2, 10 )

104

### Find out Wickets they can afford to lose:

In [84]:
def find_wickets(curr_score, target, curr_wickets, at_overs):
    """Find the first wicket count at which the simulated chance of winning
    drops below 45 wins out of 100.

    Wicket counts from curr_wickets up to 9 are each simulated 100 times
    with predict_runs, and the number of wins for every count is printed.

    Args:
        curr_score: runs scored, passed through to predict_runs.
        target: score being chased.
        curr_wickets: lowest wicket count to try.
        at_overs: completed overs, passed through to predict_runs.

    Returns:
        The lowest wicket count tried whose win count is below 45, or
        curr_wickets when every count has 45 or more wins (including when
        no count is tried because curr_wickets is 10 or more).
    """
    n_simulations = 100
    max_wins_exclusive = 45

    # The earlier version also called find_runs() here and discarded the
    # result; that call only added simulation time and has been removed.
    wins_by_wickets = []
    for wickets in range(curr_wickets, 10):
        win_count = sum(
            get_win(predict_runs(target, curr_score, wickets, at_overs), target) == 'win'
            for _ in range(n_simulations)
        )
        wins_by_wickets.append((wickets, win_count))
        print('wickets', wickets, 'win%:', win_count)

    # Search after the loop so that every wicket count is still printed.
    for wickets, win_count in wins_by_wickets:
        if win_count < max_wins_exclusive:
            return wickets

    return curr_wickets

In [86]:
#wickets they can afford to lose:

# find_wickets(curr_score, target, curr_wickets, at_overs)
find_wickets(72, 180, 2, 10)

wickets 2 win%: 1
wickets 3 win%: 3
wickets 4 win%: 1
wickets 5 win%: 0
wickets 6 win%: 0
wickets 7 win%: 2
wickets 8 win%: 0
wickets 9 win%: 1


2

### Function: Interactive chart

In [89]:
%matplotlib inline
from ipywidgets import interactive
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np

def find_runs_wickets(curr_wks, at_overs, target_score):
    """Plot the score/wickets position the chasing side should be in.

    Runs find_runs and find_wickets from a fixed current score of 72, then
    draws the result as a marker at `at_overs` together with the target
    line. When at_overs is 10 the simulated values are replaced by the
    fixed position 72/2.

    Args:
        curr_wks: wickets lost, passed to find_runs / find_wickets.
        at_overs: over at which the required position is evaluated.
        target_score: score being chased.
    """
    plt.figure(figsize = (16, 4))
    over_ticks = np.arange(21)

    req_value = find_runs(72, target_score, curr_wks, at_overs)
    req_wk_value = find_wickets(72, target_score, curr_wks, at_overs)

    # At the 10-over mark the fixed position overrides the simulation.
    if at_overs == 10:
        req_value, req_wk_value = 72, 2

    position_label = f'{req_value}/{req_wk_value}'

    # Large marker with the required position written inside it.
    plt.scatter(at_overs, req_value, s = 1200, color = 'red')
    plt.axhline(target_score, ls = '--', color = 'blue')
    plt.text(1, target_score + 10, f'Target Score :{target_score}',
             color = 'darkblue', fontsize = 13)
    plt.text(at_overs, req_value, position_label, color = 'white', fontsize = 12,
             horizontalalignment='center', verticalalignment='center')
    plt.text(at_overs, req_value - 30,
             f'CSK has to be at {position_label} after {at_overs} ov',
             horizontalalignment='center')

    plt.ylim(50, target_score + 50)
    plt.xticks(over_ticks)
    plt.title('Where should CSK be?', fontsize = 20)
    plt.xlabel('Overs')
    plt.ylabel('Score')
    plt.show()

# x=widgets.IntSlider(min=-10, max=30, step=1, value=10)

# find_wickets(curr_score, target, curr_wickets, at_overs)
# find_wickets(87, 150, 1, 14)
 

### Interactive Visualization

In [92]:
from ipywidgets import interactive, IntSlider, Output, VBox
from IPython.display import display


In [94]:
print('current_score = CSK: 72/2 (10 overs)')
print('')

# One slider per find_runs_wickets argument. The wickets slider now starts at
# 2: the previous initial value of 1 was below the slider's minimum and was
# clamped to 2 anyway.
interactive_plot = interactive(
    find_runs_wickets,
    curr_wks=widgets.IntSlider(min=2, max=10, step=1, value=2),
    at_overs=widgets.IntSlider(min=10, max=20, step=1, value=10),
    target_score=widgets.IntSlider(min=0, max=250, step=1, value=150),
)
# Give the last child (the plot output area) a fixed height.
output = interactive_plot.children[-1]
output.layout.height = '450px'
interactive_plot

current_score = CSK: 72/2 (10 overs)



interactive(children=(IntSlider(value=2, description='curr_wks', max=10, min=2), IntSlider(value=10, descripti…