In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
df = pd.read_csv("match_data_23-24.csv")

In [3]:
numeric_df = df.select_dtypes(include=[np.number])
np.isinf(numeric_df).any()

Match No.                             False
Innings No.                           False
Over                                  False
Runs by Batsman                       False
Total Runs (Ball)                     False
Runs Conceded by Bowler               False
Cumulative Runs by Batsman            False
Cumulative Runs Conceded by Bowler    False
Cumulative Team Runs                  False
dtype: bool

In [4]:
# Convert 'Date' to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Extract year
df['Year'] = df['Date'].dt.year

# Create a unique integer Match ID using factorize on (Year, Match No.)
df['Match ID'], _ = pd.factorize(df[['Year', 'Match No.']].apply(tuple, axis=1))

df.columns

Index(['Venue', 'Date', 'Match No.', 'Innings No.', 'Batting Team',
       'Bowling Team', 'Over', 'Striker', 'Non Striker', 'Bowler',
       'Runs by Batsman', 'Extras', 'Total Runs (Ball)',
       'Runs Conceded by Bowler', 'Cumulative Runs by Batsman',
       'Cumulative Runs Conceded by Bowler', 'Cumulative Team Runs', 'Year',
       'Match ID'],
      dtype='object')

In [5]:
df['pair'] = df[['Striker', 'Non Striker']].apply(lambda row: tuple(sorted(row)), axis=1)

df['prev_pair'] = df['pair'].shift(1)
df['prev_innings'] = df['Innings No.'].shift(1)

df['is_wicket'] = (
    (df['Innings No.'] == df['prev_innings']) &
    (df['pair'] != df['prev_pair'])
).astype(int)

print(df.columns)
df.head()

Index(['Venue', 'Date', 'Match No.', 'Innings No.', 'Batting Team',
       'Bowling Team', 'Over', 'Striker', 'Non Striker', 'Bowler',
       'Runs by Batsman', 'Extras', 'Total Runs (Ball)',
       'Runs Conceded by Bowler', 'Cumulative Runs by Batsman',
       'Cumulative Runs Conceded by Bowler', 'Cumulative Team Runs', 'Year',
       'Match ID', 'pair', 'prev_pair', 'prev_innings', 'is_wicket'],
      dtype='object')


Unnamed: 0,Venue,Date,Match No.,Innings No.,Batting Team,Bowling Team,Over,Striker,Non Striker,Bowler,...,Runs Conceded by Bowler,Cumulative Runs by Batsman,Cumulative Runs Conceded by Bowler,Cumulative Team Runs,Year,Match ID,pair,prev_pair,prev_innings,is_wicket
0,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.1,DP Conway,RD Gaikwad,Mohammed Shami,...,0,0,0,0,2023,0,"(DP Conway, RD Gaikwad)",,,0
1,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.2,DP Conway,RD Gaikwad,Mohammed Shami,...,0,0,0,1,2023,0,"(DP Conway, RD Gaikwad)","(DP Conway, RD Gaikwad)",1.0,0
2,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.3,RD Gaikwad,DP Conway,Mohammed Shami,...,0,0,0,1,2023,0,"(DP Conway, RD Gaikwad)","(DP Conway, RD Gaikwad)",1.0,0
3,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.4,RD Gaikwad,DP Conway,Mohammed Shami,...,1,1,1,2,2023,0,"(DP Conway, RD Gaikwad)","(DP Conway, RD Gaikwad)",1.0,0
4,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.5,DP Conway,RD Gaikwad,Mohammed Shami,...,0,0,1,2,2023,0,"(DP Conway, RD Gaikwad)","(DP Conway, RD Gaikwad)",1.0,0


In [6]:
df.drop(columns= ['pair', 'prev_pair', 'prev_innings'], inplace=True)
df.head()

Unnamed: 0,Venue,Date,Match No.,Innings No.,Batting Team,Bowling Team,Over,Striker,Non Striker,Bowler,Runs by Batsman,Extras,Total Runs (Ball),Runs Conceded by Bowler,Cumulative Runs by Batsman,Cumulative Runs Conceded by Bowler,Cumulative Team Runs,Year,Match ID,is_wicket
0,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.1,DP Conway,RD Gaikwad,Mohammed Shami,0,{},0,0,0,0,0,2023,0,0
1,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.2,DP Conway,RD Gaikwad,Mohammed Shami,0,{'legbyes': 1},1,0,0,0,1,2023,0,0
2,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.3,RD Gaikwad,DP Conway,Mohammed Shami,0,{},0,0,0,0,1,2023,0,0
3,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.4,RD Gaikwad,DP Conway,Mohammed Shami,1,{},1,1,1,1,2,2023,0,0
4,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.5,DP Conway,RD Gaikwad,Mohammed Shami,0,{},0,0,0,1,2,2023,0,0


In [7]:
df['Total Wickets'] = df.groupby(['Match ID', 'Innings No.'])['is_wicket'].cumsum()
df.head()

Unnamed: 0,Venue,Date,Match No.,Innings No.,Batting Team,Bowling Team,Over,Striker,Non Striker,Bowler,...,Extras,Total Runs (Ball),Runs Conceded by Bowler,Cumulative Runs by Batsman,Cumulative Runs Conceded by Bowler,Cumulative Team Runs,Year,Match ID,is_wicket,Total Wickets
0,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.1,DP Conway,RD Gaikwad,Mohammed Shami,...,{},0,0,0,0,0,2023,0,0,0
1,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.2,DP Conway,RD Gaikwad,Mohammed Shami,...,{'legbyes': 1},1,0,0,0,1,2023,0,0,0
2,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.3,RD Gaikwad,DP Conway,Mohammed Shami,...,{},0,0,0,0,1,2023,0,0,0
3,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.4,RD Gaikwad,DP Conway,Mohammed Shami,...,{},1,1,1,1,2,2023,0,0,0
4,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.5,DP Conway,RD Gaikwad,Mohammed Shami,...,{},0,0,0,1,2,2023,0,0,0


In [8]:
df.columns

Index(['Venue', 'Date', 'Match No.', 'Innings No.', 'Batting Team',
       'Bowling Team', 'Over', 'Striker', 'Non Striker', 'Bowler',
       'Runs by Batsman', 'Extras', 'Total Runs (Ball)',
       'Runs Conceded by Bowler', 'Cumulative Runs by Batsman',
       'Cumulative Runs Conceded by Bowler', 'Cumulative Team Runs', 'Year',
       'Match ID', 'is_wicket', 'Total Wickets'],
      dtype='object')

In [9]:
def is_legal(extras_dict):
    return not any(k in extras_dict for k in ['wides', 'noballs'])

df['is_legal_delivery'] = df['Extras'].apply(is_legal)
df['Team Balls Count'] = df.groupby(['Match ID', 'Innings No.'])['is_legal_delivery'].cumsum()

In [10]:
def striker_legal(extras_dict):
    return 'wides' not in extras_dict

df['striker_legal_delivery'] = df['Extras'].apply(striker_legal)
df['Striker Balls Count'] = df.groupby(['Match ID', 'Innings No.', 'Striker'])['striker_legal_delivery'].cumsum()
df.drop(columns=['striker_legal_delivery'], inplace=True)

In [11]:
df.drop(columns=['Year'])
df['Date'] = df['Date'].dt.date
df.head()

Unnamed: 0,Venue,Date,Match No.,Innings No.,Batting Team,Bowling Team,Over,Striker,Non Striker,Bowler,...,Cumulative Runs by Batsman,Cumulative Runs Conceded by Bowler,Cumulative Team Runs,Year,Match ID,is_wicket,Total Wickets,is_legal_delivery,Team Balls Count,Striker Balls Count
0,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.1,DP Conway,RD Gaikwad,Mohammed Shami,...,0,0,0,2023,0,0,0,True,1,1
1,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.2,DP Conway,RD Gaikwad,Mohammed Shami,...,0,0,1,2023,0,0,0,True,2,2
2,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.3,RD Gaikwad,DP Conway,Mohammed Shami,...,0,0,1,2023,0,0,0,True,3,1
3,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.4,RD Gaikwad,DP Conway,Mohammed Shami,...,1,1,2,2023,0,0,0,True,4,2
4,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.5,DP Conway,RD Gaikwad,Mohammed Shami,...,0,1,2,2023,0,0,0,True,5,3


In [12]:
df["Team Balls Left"] = 120 - df["Team Balls Count"]
df.head()

Unnamed: 0,Venue,Date,Match No.,Innings No.,Batting Team,Bowling Team,Over,Striker,Non Striker,Bowler,...,Cumulative Runs Conceded by Bowler,Cumulative Team Runs,Year,Match ID,is_wicket,Total Wickets,is_legal_delivery,Team Balls Count,Striker Balls Count,Team Balls Left
0,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.1,DP Conway,RD Gaikwad,Mohammed Shami,...,0,0,2023,0,0,0,True,1,1,119
1,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.2,DP Conway,RD Gaikwad,Mohammed Shami,...,0,1,2023,0,0,0,True,2,2,118
2,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.3,RD Gaikwad,DP Conway,Mohammed Shami,...,0,1,2023,0,0,0,True,3,1,117
3,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.4,RD Gaikwad,DP Conway,Mohammed Shami,...,1,2,2023,0,0,0,True,4,2,116
4,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.5,DP Conway,RD Gaikwad,Mohammed Shami,...,1,2,2023,0,0,0,True,5,3,115


In [13]:
df.columns

Index(['Venue', 'Date', 'Match No.', 'Innings No.', 'Batting Team',
       'Bowling Team', 'Over', 'Striker', 'Non Striker', 'Bowler',
       'Runs by Batsman', 'Extras', 'Total Runs (Ball)',
       'Runs Conceded by Bowler', 'Cumulative Runs by Batsman',
       'Cumulative Runs Conceded by Bowler', 'Cumulative Team Runs', 'Year',
       'Match ID', 'is_wicket', 'Total Wickets', 'is_legal_delivery',
       'Team Balls Count', 'Striker Balls Count', 'Team Balls Left'],
      dtype='object')

In [14]:
df['Batsman Runs on Previous Ball'] = df.groupby(['Match ID', 'Innings No.'])['Runs by Batsman'].shift(1)
df['Total Runs on Previous Ball'] = df.groupby(['Match ID', 'Innings No.'])['Total Runs (Ball)'].shift(1)

df['Batsman Runs on Previous Ball'] = df['Batsman Runs on Previous Ball'].fillna(0).astype(int)
df['Total Runs on Previous Ball'] = df['Total Runs on Previous Ball'].fillna(0).astype(int)

In [15]:
df.head(50)

Unnamed: 0,Venue,Date,Match No.,Innings No.,Batting Team,Bowling Team,Over,Striker,Non Striker,Bowler,...,Year,Match ID,is_wicket,Total Wickets,is_legal_delivery,Team Balls Count,Striker Balls Count,Team Balls Left,Batsman Runs on Previous Ball,Total Runs on Previous Ball
0,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.1,DP Conway,RD Gaikwad,Mohammed Shami,...,2023,0,0,0,True,1,1,119,0,0
1,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.2,DP Conway,RD Gaikwad,Mohammed Shami,...,2023,0,0,0,True,2,2,118,0,0
2,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.3,RD Gaikwad,DP Conway,Mohammed Shami,...,2023,0,0,0,True,3,1,117,0,1
3,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.4,RD Gaikwad,DP Conway,Mohammed Shami,...,2023,0,0,0,True,4,2,116,0,0
4,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.5,DP Conway,RD Gaikwad,Mohammed Shami,...,2023,0,0,0,True,5,3,115,1,1
5,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.6,DP Conway,RD Gaikwad,Mohammed Shami,...,2023,0,0,0,True,6,4,114,0,0
6,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,1.1,RD Gaikwad,DP Conway,HH Pandya,...,2023,0,0,0,True,7,3,113,0,0
7,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,1.2,RD Gaikwad,DP Conway,HH Pandya,...,2023,0,0,0,True,8,4,112,4,4
8,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,1.3,RD Gaikwad,DP Conway,HH Pandya,...,2023,0,0,0,True,9,5,111,0,0
9,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,1.4,RD Gaikwad,DP Conway,HH Pandya,...,2023,0,0,0,True,10,6,110,4,4


In [16]:
def categorize_over(over):
    over_num = int(str(over).split('.')[0])
    if over_num < 6:
        return 1  # Powerplay
    elif over_num < 16:
        return 2  # Middle
    else:
        return 3  # Death

df['Over_Phase'] = df['Over'].apply(categorize_over)

In [17]:
df['Striker Strike Rate'] = (df['Cumulative Runs by Batsman'] / df['Striker Balls Count']) * 100
df['Striker Strike Rate'] = df['Striker Strike Rate'].fillna(method='ffill').round(2)
df.head(10)

  df['Striker Strike Rate'] = df['Striker Strike Rate'].fillna(method='ffill').round(2)


Unnamed: 0,Venue,Date,Match No.,Innings No.,Batting Team,Bowling Team,Over,Striker,Non Striker,Bowler,...,is_wicket,Total Wickets,is_legal_delivery,Team Balls Count,Striker Balls Count,Team Balls Left,Batsman Runs on Previous Ball,Total Runs on Previous Ball,Over_Phase,Striker Strike Rate
0,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.1,DP Conway,RD Gaikwad,Mohammed Shami,...,0,0,True,1,1,119,0,0,1,0.0
1,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.2,DP Conway,RD Gaikwad,Mohammed Shami,...,0,0,True,2,2,118,0,0,1,0.0
2,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.3,RD Gaikwad,DP Conway,Mohammed Shami,...,0,0,True,3,1,117,0,1,1,0.0
3,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.4,RD Gaikwad,DP Conway,Mohammed Shami,...,0,0,True,4,2,116,0,0,1,50.0
4,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.5,DP Conway,RD Gaikwad,Mohammed Shami,...,0,0,True,5,3,115,1,1,1,0.0
5,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.6,DP Conway,RD Gaikwad,Mohammed Shami,...,0,0,True,6,4,114,0,0,1,0.0
6,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,1.1,RD Gaikwad,DP Conway,HH Pandya,...,0,0,True,7,3,113,0,0,1,166.67
7,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,1.2,RD Gaikwad,DP Conway,HH Pandya,...,0,0,True,8,4,112,4,4,1,125.0
8,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,1.3,RD Gaikwad,DP Conway,HH Pandya,...,0,0,True,9,5,111,0,0,1,180.0
9,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,1.4,RD Gaikwad,DP Conway,HH Pandya,...,0,0,True,10,6,110,4,4,1,166.67


In [18]:
df.rename(columns={"Total Runs (Ball)": "Total runs on that bowl"}, inplace=True)

In [19]:
df['Partnership ID'] = df.groupby(['Match ID', 'Innings No.'])['is_wicket'].cumsum()
df['Partnership Runs'] = df.groupby(['Match ID', 'Innings No.', 'Partnership ID'])['Runs by Batsman'].cumsum()
df.head()

Unnamed: 0,Venue,Date,Match No.,Innings No.,Batting Team,Bowling Team,Over,Striker,Non Striker,Bowler,...,is_legal_delivery,Team Balls Count,Striker Balls Count,Team Balls Left,Batsman Runs on Previous Ball,Total Runs on Previous Ball,Over_Phase,Striker Strike Rate,Partnership ID,Partnership Runs
0,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.1,DP Conway,RD Gaikwad,Mohammed Shami,...,True,1,1,119,0,0,1,0.0,0,0
1,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.2,DP Conway,RD Gaikwad,Mohammed Shami,...,True,2,2,118,0,0,1,0.0,0,0
2,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.3,RD Gaikwad,DP Conway,Mohammed Shami,...,True,3,1,117,0,1,1,0.0,0,0
3,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.4,RD Gaikwad,DP Conway,Mohammed Shami,...,True,4,2,116,0,0,1,50.0,0,1
4,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.5,DP Conway,RD Gaikwad,Mohammed Shami,...,True,5,3,115,1,1,1,0.0,0,1


In [20]:
df.drop(columns = ['Partnership ID'], inplace=True)

In [21]:
for i in range(len(df)):
    if df.loc[i, 'is_legal_delivery'] and df.loc[i, 'Runs by Batsman'] == 0:
        df.loc[i, 'is_dot'] = 1
    else:
        df.loc[i, 'is_dot'] = 0

df.is_dot = df.is_dot.fillna(0).astype(int)

In [22]:
df['Dot Count'] = df.groupby(['Match ID', 'Innings No.'])['is_dot'].cumsum()
df['Dot Count'] = df['Dot Count'].fillna(0).astype(int)
df

Unnamed: 0,Venue,Date,Match No.,Innings No.,Batting Team,Bowling Team,Over,Striker,Non Striker,Bowler,...,Team Balls Count,Striker Balls Count,Team Balls Left,Batsman Runs on Previous Ball,Total Runs on Previous Ball,Over_Phase,Striker Strike Rate,Partnership Runs,is_dot,Dot Count
0,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.1,DP Conway,RD Gaikwad,Mohammed Shami,...,1,1,119,0,0,1,0.00,0,1,1
1,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.2,DP Conway,RD Gaikwad,Mohammed Shami,...,2,2,118,0,0,1,0.00,0,1,2
2,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.3,RD Gaikwad,DP Conway,Mohammed Shami,...,3,1,117,0,1,1,0.00,0,1,3
3,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.4,RD Gaikwad,DP Conway,Mohammed Shami,...,4,2,116,0,0,1,50.00,1,0,3
4,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.5,DP Conway,RD Gaikwad,Mohammed Shami,...,5,3,115,1,1,1,0.00,1,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34961,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,9.5,SS Iyer,VR Iyer,AK Markram,...,59,2,61,1,1,2,250.00,8,0,24
34962,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,9.6,VR Iyer,SS Iyer,AK Markram,...,60,24,60,1,1,2,208.33,9,0,24
34963,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,10.1,VR Iyer,SS Iyer,Shahbaz Ahmed,...,61,25,59,1,1,2,204.00,10,0,24
34964,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,10.2,SS Iyer,VR Iyer,Shahbaz Ahmed,...,62,3,58,1,1,2,200.00,11,0,24


In [23]:
for i in range(len(df)):
    if df.loc[i, 'is_legal_delivery'] and (df.loc[i, 'Runs by Batsman'] == 4 or df.loc[i, 'Runs by Batsman'] == 6):
        df.loc[i, 'is_boundary'] = 1
    else:
        df.loc[i, 'is_boundary'] = 0

df.is_boundary = df.is_boundary.fillna(0).astype(int)

In [24]:
df['Boundary Count'] = df.groupby(['Match ID', 'Innings No.'])['is_boundary'].cumsum()
df['Boundary Count'] = df['Boundary Count'].fillna(0).astype(int)
df

Unnamed: 0,Venue,Date,Match No.,Innings No.,Batting Team,Bowling Team,Over,Striker,Non Striker,Bowler,...,Team Balls Left,Batsman Runs on Previous Ball,Total Runs on Previous Ball,Over_Phase,Striker Strike Rate,Partnership Runs,is_dot,Dot Count,is_boundary,Boundary Count
0,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.1,DP Conway,RD Gaikwad,Mohammed Shami,...,119,0,0,1,0.00,0,1,1,0,0
1,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.2,DP Conway,RD Gaikwad,Mohammed Shami,...,118,0,0,1,0.00,0,1,2,0,0
2,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.3,RD Gaikwad,DP Conway,Mohammed Shami,...,117,0,1,1,0.00,0,1,3,0,0
3,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.4,RD Gaikwad,DP Conway,Mohammed Shami,...,116,0,0,1,50.00,1,0,3,0,0
4,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.5,DP Conway,RD Gaikwad,Mohammed Shami,...,115,1,1,1,0.00,1,1,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34961,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,9.5,SS Iyer,VR Iyer,AK Markram,...,61,1,1,2,250.00,8,0,24,0,16
34962,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,9.6,VR Iyer,SS Iyer,AK Markram,...,60,1,1,2,208.33,9,0,24,0,16
34963,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,10.1,VR Iyer,SS Iyer,Shahbaz Ahmed,...,59,1,1,2,204.00,10,0,24,0,16
34964,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,10.2,SS Iyer,VR Iyer,Shahbaz Ahmed,...,58,1,1,2,200.00,11,0,24,0,16


In [25]:
buffer_sizes = [3, 4, 5, 6]

for size in buffer_sizes:
    runs_buffer = []
    column_name = f'Average Striker Runs {size}'

    for i in range(len(df)):
        if df.loc[i, 'Over'] == 0.0:
            runs_buffer = []
            df.loc[i, column_name] = 0.0
        else:
            if runs_buffer:
                avg = sum(runs_buffer) / len(runs_buffer)
            else:
                avg = 0.0
            df.loc[i, column_name] = avg

        runs_buffer.append(df.loc[i, 'Runs by Batsman'])

        if len(runs_buffer) > size:
            runs_buffer.pop(0)

In [26]:
buffer_sizes = [3, 6]

for size in buffer_sizes:
    runs_buffer = []

    column_name = f'Average Conceded Runs by Bowler {size}'

    for i in range(len(df)):
        if df.loc[i, 'Over'] == 0.0:
            runs_buffer = []
            df.loc[i, column_name] = 0.0
        else:
            if runs_buffer:
                avg = sum(runs_buffer) / len(runs_buffer)
            else:
                avg = 0.0
            df.loc[i, column_name] = avg

        runs_buffer.append(df.loc[i, 'Runs Conceded by Bowler'])

        if len(runs_buffer) > size:
            runs_buffer.pop(0)


In [27]:
df['Average Conceded Runs by Bowler 3'] = df['Average Conceded Runs by Bowler 3'].round(2)
df['Average Conceded Runs by Bowler 6'] = df['Average Conceded Runs by Bowler 6'].round(2)
df['Average Striker Runs 3'] = df['Average Striker Runs 3'].round(2)
df['Average Striker Runs 4'] = df['Average Striker Runs 4'].round(2)
df['Average Striker Runs 5'] = df['Average Striker Runs 5'].round(2)
df['Average Striker Runs 6'] = df['Average Striker Runs 6'].round(2)

In [28]:
df.head()

Unnamed: 0,Venue,Date,Match No.,Innings No.,Batting Team,Bowling Team,Over,Striker,Non Striker,Bowler,...,is_dot,Dot Count,is_boundary,Boundary Count,Average Striker Runs 3,Average Striker Runs 4,Average Striker Runs 5,Average Striker Runs 6,Average Conceded Runs by Bowler 3,Average Conceded Runs by Bowler 6
0,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.1,DP Conway,RD Gaikwad,Mohammed Shami,...,1,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.2,DP Conway,RD Gaikwad,Mohammed Shami,...,1,2,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.3,RD Gaikwad,DP Conway,Mohammed Shami,...,1,3,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.4,RD Gaikwad,DP Conway,Mohammed Shami,...,0,3,0,0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.5,DP Conway,RD Gaikwad,Mohammed Shami,...,1,4,0,0,0.33,0.25,0.25,0.25,0.33,0.25


In [29]:
for i in range(10796, 10914):
    df.loc[i, 'Bowling Team'] = 'Chennai Super Kings'

In [30]:
first_innings = df[df["Innings No."] == 1]
targets = first_innings.groupby("Match ID")["Cumulative Team Runs"].max().rename("Target")
df = df.merge(targets, on="Match ID", how="left")
df["Target"] = df.apply(lambda row: row["Target"] if row["Innings No."] == 2 else None, axis=1)

df

Unnamed: 0,Venue,Date,Match No.,Innings No.,Batting Team,Bowling Team,Over,Striker,Non Striker,Bowler,...,Dot Count,is_boundary,Boundary Count,Average Striker Runs 3,Average Striker Runs 4,Average Striker Runs 5,Average Striker Runs 6,Average Conceded Runs by Bowler 3,Average Conceded Runs by Bowler 6,Target
0,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.1,DP Conway,RD Gaikwad,Mohammed Shami,...,1,0,0,0.00,0.00,0.00,0.00,0.00,0.00,
1,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.2,DP Conway,RD Gaikwad,Mohammed Shami,...,2,0,0,0.00,0.00,0.00,0.00,0.00,0.00,
2,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.3,RD Gaikwad,DP Conway,Mohammed Shami,...,3,0,0,0.00,0.00,0.00,0.00,0.00,0.00,
3,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.4,RD Gaikwad,DP Conway,Mohammed Shami,...,3,0,0,0.00,0.00,0.00,0.00,0.00,0.00,
4,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.5,DP Conway,RD Gaikwad,Mohammed Shami,...,4,0,0,0.33,0.25,0.25,0.25,0.33,0.25,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34961,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,9.5,SS Iyer,VR Iyer,AK Markram,...,24,0,16,0.33,0.75,1.40,1.17,0.33,1.17,113.0
34962,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,9.6,VR Iyer,SS Iyer,AK Markram,...,24,0,16,0.67,0.50,0.80,1.33,0.67,1.33,113.0
34963,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,10.1,VR Iyer,SS Iyer,Shahbaz Ahmed,...,24,0,16,1.00,0.75,0.60,0.83,1.00,0.83,113.0
34964,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,10.2,SS Iyer,VR Iyer,Shahbaz Ahmed,...,24,0,16,1.00,1.00,0.80,0.67,1.00,0.67,113.0


In [31]:
# Your original target dictionary
Runs = {
    "Narendra Modi Stadium, Ahmedabad": 200,
    "Eden Gardens, Kolkata": 180,
    "Wankhede Stadium, Mumbai": 180,
    "Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh": 185,
    "Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow": 170,
    "Rajiv Gandhi International Stadium, Uppal, Hyderabad": 200,
    "M Chinnaswamy Stadium, Bengaluru": 210,
    "MA Chidambaram Stadium, Chepauk, Chennai": 205,
    "Arun Jaitley Stadium, Delhi": 170,
    "Barsapara Cricket Stadium, Guwahati": 195,
    "Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium, Visakhapatnam": 190,
    "Maharaja Yadavindra Singh International Cricket Stadium, Mullanpur": 175,
    "Sawai Mansingh Stadium, Jaipur": 165,
    "Himachal Pradesh Cricket Association Stadium, Dharamsala": 200
}

# Reset
venue_history = {}      # Tracks past actual scores only
match_targets = {}      # Final computed target per match

# Get unique 1st innings Match IDs in order
match_ids = df[df["Innings No."] == 1]["Match ID"].drop_duplicates()

for match_id in match_ids:
    match_df = df[(df["Match ID"] == match_id) & (df["Innings No."] == 1)]
    venue = match_df["Venue"].iloc[0]
    final_score = match_df["Cumulative Team Runs"].max()

    # Get past scores and original target
    past_scores = venue_history.get(venue, [])
    base = []

    # Always include the static target if it exists
    if venue in Runs:
        base.append(Runs[venue])

    base.extend(past_scores)

    # Determine new target
    if base:
        target = sum(base) / len(base)
        math.ceil(target)
    else:
        target = None

    # Store target and update history
    match_targets[match_id] = target
    venue_history.setdefault(venue, []).append(final_score)

# Assign to DataFrame
for match_id, target in match_targets.items():
    df.loc[(df["Match ID"] == match_id) & (df["Innings No."] == 1), "Target"] = target


In [32]:
venue_name = "MA Chidambaram Stadium, Chepauk, Chennai"
matches_by_team = df[(df["Venue"] == venue_name) & (df['Innings No.'] == 1)]
matches_by_team

Unnamed: 0,Venue,Date,Match No.,Innings No.,Batting Team,Bowling Team,Over,Striker,Non Striker,Bowler,...,Dot Count,is_boundary,Boundary Count,Average Striker Runs 3,Average Striker Runs 4,Average Striker Runs 5,Average Striker Runs 6,Average Conceded Runs by Bowler 3,Average Conceded Runs by Bowler 6,Target
1196,"MA Chidambaram Stadium, Chepauk, Chennai",2023-04-03,6,1,Chennai Super Kings,Lucknow Super Giants,0.1,RD Gaikwad,DP Conway,KR Mayers,...,0,0,0,3.33,4.00,4.4,3.67,3.67,3.83,205.000000
1197,"MA Chidambaram Stadium, Chepauk, Chennai",2023-04-03,6,1,Chennai Super Kings,Lucknow Super Giants,0.2,DP Conway,RD Gaikwad,KR Mayers,...,0,0,0,2.33,2.75,3.4,3.83,2.67,4.00,205.000000
1198,"MA Chidambaram Stadium, Chepauk, Chennai",2023-04-03,6,1,Chennai Super Kings,Lucknow Super Giants,0.3,DP Conway,RD Gaikwad,KR Mayers,...,1,0,0,2.33,1.75,2.2,2.83,2.67,3.17,205.000000
1199,"MA Chidambaram Stadium, Chepauk, Chennai",2023-04-03,6,1,Chennai Super Kings,Lucknow Super Giants,0.4,DP Conway,RD Gaikwad,KR Mayers,...,1,0,0,0.33,1.75,1.4,1.83,0.67,2.17,205.000000
1200,"MA Chidambaram Stadium, Chepauk, Chennai",2023-04-03,6,1,Chennai Super Kings,Lucknow Super Giants,0.5,RD Gaikwad,DP Conway,KR Mayers,...,1,0,0,0.33,0.50,1.6,1.33,0.67,1.67,205.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34894,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,1,Sunrisers Hyderabad,Kolkata Knight Riders,17.5,JD Unadkat,PJ Cummins,SP Narine,...,52,0,11,1.33,1.25,1.2,2.00,1.33,2.00,175.055556
34895,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,1,Sunrisers Hyderabad,Kolkata Knight Riders,17.6,B Kumar,PJ Cummins,SP Narine,...,53,0,11,1.00,1.00,1.0,1.00,1.00,1.00,175.055556
34896,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,1,Sunrisers Hyderabad,Kolkata Knight Riders,18.1,PJ Cummins,B Kumar,AD Russell,...,54,0,11,0.33,0.75,0.8,0.83,0.33,0.83,175.055556
34897,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,1,Sunrisers Hyderabad,Kolkata Knight Riders,18.2,PJ Cummins,B Kumar,AD Russell,...,55,0,11,0.00,0.25,0.6,0.67,0.00,0.67,175.055556


In [33]:
df['wicket_remaining'] = 10 - df['Total Wickets']
df['Resources Remaining'] = df['wicket_remaining'] / 10.0
df.head()

Unnamed: 0,Venue,Date,Match No.,Innings No.,Batting Team,Bowling Team,Over,Striker,Non Striker,Bowler,...,Boundary Count,Average Striker Runs 3,Average Striker Runs 4,Average Striker Runs 5,Average Striker Runs 6,Average Conceded Runs by Bowler 3,Average Conceded Runs by Bowler 6,Target,wicket_remaining,Resources Remaining
0,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.1,DP Conway,RD Gaikwad,Mohammed Shami,...,0,0.0,0.0,0.0,0.0,0.0,0.0,200.0,10,1.0
1,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.2,DP Conway,RD Gaikwad,Mohammed Shami,...,0,0.0,0.0,0.0,0.0,0.0,0.0,200.0,10,1.0
2,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.3,RD Gaikwad,DP Conway,Mohammed Shami,...,0,0.0,0.0,0.0,0.0,0.0,0.0,200.0,10,1.0
3,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.4,RD Gaikwad,DP Conway,Mohammed Shami,...,0,0.0,0.0,0.0,0.0,0.0,0.0,200.0,10,1.0
4,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.5,DP Conway,RD Gaikwad,Mohammed Shami,...,0,0.33,0.25,0.25,0.25,0.33,0.25,200.0,10,1.0


In [34]:
bowling_styles = {
    "B Kumar": "Right-arm medium",
    "PJ Cummins": "Right-arm fast",
    "T Natarajan": "Left-arm medium",
    "Shahbaz Ahmed": "Left-arm orthodox spin",
    "JD Unadkat": "Left-arm medium",
    "AK Markram": "Right-arm off-spin",
    "MA Starc": "Left-arm fast",
    "VG Arora": "Right-arm medium-fast",
    "Harshit Rana": "Right-arm fast-medium",
    "SP Narine": "Right-arm off-spin",
    "AD Russell": "Right-arm fast-medium",
    "CV Varun": "Right-arm leg-spin",
    "Abhishek Sharma": "Left-arm orthodox spin",
    "TA Boult": "Left-arm fast-medium",
    "R Ashwin": "Right-arm off-spin",
    "Sandeep Sharma": "Right-arm medium",
    "Avesh Khan": "Right-arm fast-medium",
    "YS Chahal": "Right-arm leg-spin",
    "Swapnil Singh": "Left-arm orthodox spin",
    "Mohammed Siraj": "Right-arm fast",
    "Yash Dayal": "Left-arm medium-fast",
    "LH Ferguson": "Right-arm fast",
    "KV Sharma": "Right-arm leg-spin",
    "C Green": "Right-arm medium-fast",
    "V Viyaskanth": "Right-arm leg-spin",
    "TM Head": "Right-arm off-spin",
    "Nithish Kumar Reddy": "Right-arm medium-fast",
    "Arshdeep Singh": "Left-arm medium-fast",
    "R Dhawan": "Left-arm medium",
    "HV Patel": "Right-arm medium",
    "RD Chahar": "Right-arm leg-spin",
    "Harpreet Brar": "Left-arm orthodox spin",
    "Shashank Singh": "Right-arm medium",
    "Atharva Taide": "Left-arm orthodox spin",
    "GJ Maxwell": "Right-arm off-spin",
    "TU Deshpande": "Right-arm medium-fast",
    "SN Thakur": "Right-arm medium-fast",
    "M Theekshana": "Right-arm off-spin",
    "MJ Santner": "Left-arm orthodox spin",
    "RA Jadeja": "Left-arm orthodox spin",
    "Simarjeet Singh": "Right-arm medium-fast",
    "N Thushara": "Left-arm fast-medium",
    "Arjun Tendulkar": "Left-arm medium-fast",
    "A Kamboj": "Right-arm off-spin",
    "PP Chawla": "Right-arm leg-spin",
    "N Wadhera": "Right-arm off-spin",
    "HH Pandya": "Right-arm fast-medium",
    "Naman Dhir": "Right-arm medium",
    "R Shepherd": "Right-arm fast-medium",
    "Arshad Khan": "Left-arm medium-fast",
    "MJ Henry": "Right-arm fast-medium",
    "KH Pandya": "Left-arm orthodox spin",
    "Mohsin Khan": "Left-arm medium-fast",
    "Naveen-ul-Haq": "Right-arm fast-medium",
    "Ravi Bishnoi": "Right-arm leg-spin",
    "SM Curran": "Left-arm fast-medium",
    "NT Ellis": "Right-arm fast-medium",
    "Yudhvir Singh": "Right-arm medium-fast",
    "DJ Hooda": "Right-arm off-spin",
    "I Sharma": "Right-arm fast-medium",
    "KK Ahmed": "Left-arm medium-fast",
    "AR Patel": "Left-arm orthodox spin",
    "Mukesh Kumar": "Right-arm medium-fast",
    "Kuldeep Yadav": "Left-arm chinaman",
    "T Stubbs": "Right-arm off-spin",
    "Gulbadin Naib": "Right-arm medium",
    "Rasikh Salam": "Right-arm fast-medium",
    "N Burger": "Left-arm fast-medium",
    "WG Jacks": "Right-arm off-spin",
    "JJ Bumrah": "Right-arm fast",
    "UT Yadav": "Right-arm fast",
    "S Sandeep Warrier": "Right-arm fast-medium",
    "Kartik Tyagi": "Right-arm fast",
    "Noor Ahmad": "Left-arm wrist-spin",
    "Rashid Khan": "Right-arm leg-spin",
    "MM Sharma": "Right-arm fast-medium",
    "DJ Mitchell": "Right-arm medium",
    "V Kaverappa": "Right-arm medium-fast",
    "LS Livingstone": "Right-arm off-spin",
    "K Gowtham": "Right-arm off-spin",
    "Yash Thakur": "Right-arm medium-fast",
    "A Badoni": "Right-arm off-spin",
    "R Parag": "Right-arm leg-spin",
    "M Jansen": "Left-arm fast-medium",
    "K Rabada": "Right-arm fast",
    "MP Stoinis": "Right-arm medium-fast",
    "RJ Gleeson": "Right-arm fast",
    "Vijaykumar Vyshak": "Right-arm fast-medium",
    "J Little": "Left-arm fast-medium",
    "MJ Suthar": "Left-arm orthodox spin",
    "G Coetzee": "Right-arm fast",
    "DL Chahar": "Right-arm medium",
    "Mustafizur Rahman": "Left-arm fast-medium",
    "MM Ali": "Right-arm off-spin",
    "S Dube": "Right-arm medium",
    "Mohammad Nabi": "Right-arm off-spin",
    "MP Yadav": "Right-arm leg-spin",
    "LB Williams": "Right-arm fast-medium",
    "Azmatullah Omarzai": "Right-arm medium-fast",
    "R Sai Kishore": "Left-arm orthodox spin",
    "M Pathirana": "Right-arm fast",
    "L Wood": "Left-arm fast-medium",
    "A Mishra": "Right-arm leg-spin",
    "PVD Chameera": "Right-arm fast",
    "AS Roy": "Right-arm off-spin",
    "Ramandeep Singh": "Right-arm medium-fast",
    "M Markande": "Right-arm leg-spin",
    "M Shahrukh Khan": "Right-arm off-spin",
    "A Nortje": "Right-arm fast",
    "Tilak Varma": "Right-arm off-spin",
    "Suyash Sharma": "Right-arm leg-spin",
    "Washington Sundar": "Right-arm off-spin",
    "Lalit Yadav": "Right-arm off-spin",
    "Akash Madhwal": "Right-arm medium-fast",
    "S Gopal": "Right-arm leg-spin",
    "SH Johnson": "Left-arm fast",
    "KR Sen": "Right-arm fast",
    "RJW Topley": "Left-arm fast-medium",
    "MK Lomror": "Right-arm off-spin",
    "S Joseph": "Right-arm fast",
    "Arshad Khan (2)": "Left-arm medium-fast",
    "KA Maharaj": "Left-arm orthodox spin",
    "Akash Deep": "Right-arm-fast",
    "R Ravindra": "Left-arm orthodox spin",
    "M Siddharth": "Left-arm orthodox spin",
    "DG Nalkande": "Right-arm medium-fast",
    "JA Richardson": "Right-arm fast",
    "Mayank Dagar": "Left-arm orthodox spin",
    "H Sharma": "Right-arm medium-fast",
    "Mukesh Choudhary": "Left-arm medium-fast",
    "Sikandar Raza": "Right-arm off-spin",
    "VR Iyer": "Left-arm medium-fast",
    "Sumit Kumar": "Right-arm medium",
    "MR Marsh": "Right-arm medium",
    "KT Maphaka": "Left-arm fast",
    "AS Joseph": "Right-arm fast",
    "Umran Malik": "Right-arm fast",
    "SZ Mulani": "Left-arm orthodox spin",
    "Mohammed Shami": "Right-arm fast",
    "JP Behrendorff": "Left-arm fast-medium",
    "CJ Jordan": "Right-arm fast-medium",
    "K Kartikeya": "Left-arm orthodox spin",
    "HR Shokeen": "Right-arm off-spin",
    "WD Parnell": "Left-arm fast-medium",
    "MG Bracewell": "Right-arm off-spin",
    "Vivrant Sharma": "Left-arm orthodox spin",
    "C Sakariya": "Left-arm medium-fast",
    "N Rana": "Right-arm off-spin",
    "Navdeep Saini": "Right-arm fast",
    "A Zampa": "Right-arm leg-spin",
    "GD Phillips": "Right-arm off-spin",
    "Fazalhaq Farooqi": "Left-arm fast-medium",
    "R Tewatia": "Left-arm leg-spin",
    "KM Asif": "Right-arm fast-medium",
    "P Dubey": "Right-arm leg-spin",
    "KR Mayers": "Right-arm medium-fast",
    "JE Root": "Right-arm off-spin",
    "JR Hazlewood": "Right-arm fast",
    "PWH de Silva": "Left-arm leg-spin",
    "K Yadav": "Left-arm wrist-spin",
    "M Ashwin": "Right-arm leg-spin",
    "OC McCoy": "Left-arm fast-medium",
    "JC Archer": "Right-arm fast",
    "R Goyal": "Right-arm leg-spin",
    "JO Holder": "Right-arm medium-fast",
    "Akash Singh": "Left-arm medium-fast",
    "RP Meredith": "Right-arm fast",
    "AJ Hosein": "Left-arm orthodox spin",
    "Gurnoor Brar": "Right-arm medium-fast",
    "DJ Willey": "Left-arm fast-medium",
    "D Wiese": "Right-arm fast-medium",
    "K Khejroliya": "Left-arm medium-fast",
    "J Yadav": "Right-arm off-spin",
    "MW Short": "Right-arm off-spin",
    "D Jansen": "Left-arm fast",
    "MA Wood": "Right-arm fast",
    "SSB Magala": "Right-arm fast-medium",
    "Mohit Rathee": "Right-arm leg-spin",
    "D Pretorius": "Right-arm medium-fast",
    "R Powell": "Right-arm medium-fast",
    "AU Rashid": "Right-arm leg-spin",
    "TG Southee": "Right-arm fast-medium",
    "BA Stokes": "Right-arm fast-medium",
    "RS Hangargekar": "Right-arm fast",
    "NA Saini": "Right-arm fast"
}

In [35]:
df["Bowling Style"] = df["Bowler"].map(bowling_styles)
df.head()

Unnamed: 0,Venue,Date,Match No.,Innings No.,Batting Team,Bowling Team,Over,Striker,Non Striker,Bowler,...,Average Striker Runs 3,Average Striker Runs 4,Average Striker Runs 5,Average Striker Runs 6,Average Conceded Runs by Bowler 3,Average Conceded Runs by Bowler 6,Target,wicket_remaining,Resources Remaining,Bowling Style
0,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.1,DP Conway,RD Gaikwad,Mohammed Shami,...,0.0,0.0,0.0,0.0,0.0,0.0,200.0,10,1.0,Right-arm fast
1,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.2,DP Conway,RD Gaikwad,Mohammed Shami,...,0.0,0.0,0.0,0.0,0.0,0.0,200.0,10,1.0,Right-arm fast
2,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.3,RD Gaikwad,DP Conway,Mohammed Shami,...,0.0,0.0,0.0,0.0,0.0,0.0,200.0,10,1.0,Right-arm fast
3,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.4,RD Gaikwad,DP Conway,Mohammed Shami,...,0.0,0.0,0.0,0.0,0.0,0.0,200.0,10,1.0,Right-arm fast
4,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.5,DP Conway,RD Gaikwad,Mohammed Shami,...,0.33,0.25,0.25,0.25,0.33,0.25,200.0,10,1.0,Right-arm fast


In [36]:
from sklearn.preprocessing import LabelEncoder

In [37]:
categorical_cols = [col for col in df.columns if df[col].dtype == 'object']

encoders = {}

for col in categorical_cols:
    if col not in ['Date', 'Extras']:
        encoder = LabelEncoder()
        new_col = col + " ID"
        df[new_col] = encoder.fit_transform(df[col])
        encoders[col] = encoder

df.head()

Unnamed: 0,Venue,Date,Match No.,Innings No.,Batting Team,Bowling Team,Over,Striker,Non Striker,Bowler,...,wicket_remaining,Resources Remaining,Bowling Style,Venue ID,Batting Team ID,Bowling Team ID,Striker ID,Non Striker ID,Bowler ID,Bowling Style ID
0,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.1,DP Conway,RD Gaikwad,Mohammed Shami,...,10,1.0,Right-arm fast,9,0,2,55,161,102,8
1,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.2,DP Conway,RD Gaikwad,Mohammed Shami,...,10,1.0,Right-arm fast,9,0,2,55,161,102,8
2,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.3,RD Gaikwad,DP Conway,Mohammed Shami,...,10,1.0,Right-arm fast,9,0,2,167,53,102,8
3,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.4,RD Gaikwad,DP Conway,Mohammed Shami,...,10,1.0,Right-arm fast,9,0,2,167,53,102,8
4,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.5,DP Conway,RD Gaikwad,Mohammed Shami,...,10,1.0,Right-arm fast,9,0,2,55,161,102,8


In [38]:
df["is_legal_delivery"] = df["is_legal_delivery"].map({True: 1, False: 0})
df.head()

Unnamed: 0,Venue,Date,Match No.,Innings No.,Batting Team,Bowling Team,Over,Striker,Non Striker,Bowler,...,wicket_remaining,Resources Remaining,Bowling Style,Venue ID,Batting Team ID,Bowling Team ID,Striker ID,Non Striker ID,Bowler ID,Bowling Style ID
0,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.1,DP Conway,RD Gaikwad,Mohammed Shami,...,10,1.0,Right-arm fast,9,0,2,55,161,102,8
1,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.2,DP Conway,RD Gaikwad,Mohammed Shami,...,10,1.0,Right-arm fast,9,0,2,55,161,102,8
2,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.3,RD Gaikwad,DP Conway,Mohammed Shami,...,10,1.0,Right-arm fast,9,0,2,167,53,102,8
3,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.4,RD Gaikwad,DP Conway,Mohammed Shami,...,10,1.0,Right-arm fast,9,0,2,167,53,102,8
4,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.5,DP Conway,RD Gaikwad,Mohammed Shami,...,10,1.0,Right-arm fast,9,0,2,55,161,102,8


In [39]:
df.columns

Index(['Venue', 'Date', 'Match No.', 'Innings No.', 'Batting Team',
       'Bowling Team', 'Over', 'Striker', 'Non Striker', 'Bowler',
       'Runs by Batsman', 'Extras', 'Total runs on that bowl',
       'Runs Conceded by Bowler', 'Cumulative Runs by Batsman',
       'Cumulative Runs Conceded by Bowler', 'Cumulative Team Runs', 'Year',
       'Match ID', 'is_wicket', 'Total Wickets', 'is_legal_delivery',
       'Team Balls Count', 'Striker Balls Count', 'Team Balls Left',
       'Batsman Runs on Previous Ball', 'Total Runs on Previous Ball',
       'Over_Phase', 'Striker Strike Rate', 'Partnership Runs', 'is_dot',
       'Dot Count', 'is_boundary', 'Boundary Count', 'Average Striker Runs 3',
       'Average Striker Runs 4', 'Average Striker Runs 5',
       'Average Striker Runs 6', 'Average Conceded Runs by Bowler 3',
       'Average Conceded Runs by Bowler 6', 'Target', 'wicket_remaining',
       'Resources Remaining', 'Bowling Style', 'Venue ID', 'Batting Team ID',
       'Bowling T

In [40]:
def adjust_target_and_compute_crr(row):
    target = row["Target"]
    balls = row["Team Balls Count"]
    runs = row["Cumulative Team Runs"]
    extras = row["Extras"]

    # Adjust target if this is the first ball and it's an illegal delivery
    if balls == 0 and isinstance(extras, dict):
        for k, v in extras.items():
            if k in {"wides", "noballs", "legbyes", "byes"}:  # you can adjust keys as needed
                target -= v

    # Calculate CRR per ball
    crr = runs / balls if balls > 0 else 0

    return pd.Series({"Adjusted Target": target, "Current Run Rate": crr})

df[["Target", "Current Run Rate"]] = df.apply(adjust_target_and_compute_crr, axis=1)


In [41]:
numeric_df = df.select_dtypes(include=[np.number])
np.isinf(numeric_df).any()

Match No.                             False
Innings No.                           False
Over                                  False
Runs by Batsman                       False
Total runs on that bowl               False
Runs Conceded by Bowler               False
Cumulative Runs by Batsman            False
Cumulative Runs Conceded by Bowler    False
Cumulative Team Runs                  False
Year                                  False
Match ID                              False
is_wicket                             False
Total Wickets                         False
is_legal_delivery                     False
Team Balls Count                      False
Striker Balls Count                   False
Team Balls Left                       False
Batsman Runs on Previous Ball         False
Total Runs on Previous Ball           False
Over_Phase                            False
Striker Strike Rate                   False
Partnership Runs                      False
is_dot                          

In [42]:
df

Unnamed: 0,Venue,Date,Match No.,Innings No.,Batting Team,Bowling Team,Over,Striker,Non Striker,Bowler,...,Resources Remaining,Bowling Style,Venue ID,Batting Team ID,Bowling Team ID,Striker ID,Non Striker ID,Bowler ID,Bowling Style ID,Current Run Rate
0,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.1,DP Conway,RD Gaikwad,Mohammed Shami,...,1.0,Right-arm fast,9,0,2,55,161,102,8,0.000000
1,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.2,DP Conway,RD Gaikwad,Mohammed Shami,...,1.0,Right-arm fast,9,0,2,55,161,102,8,0.500000
2,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.3,RD Gaikwad,DP Conway,Mohammed Shami,...,1.0,Right-arm fast,9,0,2,167,53,102,8,0.333333
3,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.4,RD Gaikwad,DP Conway,Mohammed Shami,...,1.0,Right-arm fast,9,0,2,167,53,102,8,0.500000
4,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.5,DP Conway,RD Gaikwad,Mohammed Shami,...,1.0,Right-arm fast,9,0,2,55,161,102,8,0.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34961,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,9.5,SS Iyer,VR Iyer,AK Markram,...,0.8,Right-arm off-spin,7,3,10,194,215,7,13,1.864407
34962,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,9.6,VR Iyer,SS Iyer,AK Markram,...,0.8,Right-arm off-spin,7,3,10,223,186,7,13,1.850000
34963,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,10.1,VR Iyer,SS Iyer,Shahbaz Ahmed,...,0.8,Left-arm orthodox spin,7,3,10,223,186,156,6,1.836066
34964,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,10.2,SS Iyer,VR Iyer,Shahbaz Ahmed,...,0.8,Left-arm orthodox spin,7,3,10,194,215,156,6,1.822581


In [43]:
df['Req. Run Rate'] = (df['Target'] - df['Cumulative Team Runs']) / df['Team Balls Left']
df['Req. Run Rate'] = df['Req. Run Rate'].clip(lower=-150, upper=150)
df.head()

Unnamed: 0,Venue,Date,Match No.,Innings No.,Batting Team,Bowling Team,Over,Striker,Non Striker,Bowler,...,Bowling Style,Venue ID,Batting Team ID,Bowling Team ID,Striker ID,Non Striker ID,Bowler ID,Bowling Style ID,Current Run Rate,Req. Run Rate
0,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.1,DP Conway,RD Gaikwad,Mohammed Shami,...,Right-arm fast,9,0,2,55,161,102,8,0.0,1.680672
1,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.2,DP Conway,RD Gaikwad,Mohammed Shami,...,Right-arm fast,9,0,2,55,161,102,8,0.5,1.686441
2,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.3,RD Gaikwad,DP Conway,Mohammed Shami,...,Right-arm fast,9,0,2,167,53,102,8,0.333333,1.700855
3,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.4,RD Gaikwad,DP Conway,Mohammed Shami,...,Right-arm fast,9,0,2,167,53,102,8,0.5,1.706897
4,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.5,DP Conway,RD Gaikwad,Mohammed Shami,...,Right-arm fast,9,0,2,55,161,102,8,0.4,1.721739


In [44]:
df["Pressure Index"] = df.apply(
    lambda row: (
        # α: Based on Required Run Rate
        (0.4 if row["Req. Run Rate"] <= 10 else
         (0.48 if row["Req. Run Rate"] <= 11 else
          (0.56 if row["Req. Run Rate"] <= 12 else
           (0.64 if row["Req. Run Rate"] <= 13 else
            (0.72 if row["Req. Run Rate"] <= 14 else
             (0.80 if row["Req. Run Rate"] <= 15 else
              (0.88 if row["Req. Run Rate"] <= 16 else
               (0.92 if row["Req. Run Rate"] <= 17 else 0.96)))))))) *
        ((row["Req. Run Rate"] / (row["Current Run Rate"] if row["Current Run Rate"] != 0 else 0.01)) *
         (row["Team Balls Count"] / 120))

        +

        # β: Based on Total Wickets
        ((0.3 if row["Total Wickets"] <= 4 else
          (0.45 if row["Total Wickets"] == 5 else
           (0.6 if row["Total Wickets"] == 6 else
            (0.75 if row["Total Wickets"] == 7 else
             (0.9 if row["Total Wickets"] >= 8 else 0.3))))) *
         (row["Total Wickets"] / 10))

        +

        # γ: Based on Dot Count
        ((0.2 if row['Dot Count'] < 20 else
          (0.3 if row['Dot Count'] < 25 else
           (0.45 if row['Dot Count'] < 30 else
            (0.6 if row['Dot Count'] < 35 else
             (0.75 if row['Dot Count'] < 40 else 0.9))))) * row['Dot Count'])
    ),
    axis=1
)


In [45]:
df

Unnamed: 0,Venue,Date,Match No.,Innings No.,Batting Team,Bowling Team,Over,Striker,Non Striker,Bowler,...,Venue ID,Batting Team ID,Bowling Team ID,Striker ID,Non Striker ID,Bowler ID,Bowling Style ID,Current Run Rate,Req. Run Rate,Pressure Index
0,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.1,DP Conway,RD Gaikwad,Mohammed Shami,...,9,0,2,55,161,102,8,0.000000,1.680672,0.760224
1,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.2,DP Conway,RD Gaikwad,Mohammed Shami,...,9,0,2,55,161,102,8,0.500000,1.686441,0.422486
2,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.3,RD Gaikwad,DP Conway,Mohammed Shami,...,9,0,2,167,53,102,8,0.333333,1.700855,0.651026
3,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.4,RD Gaikwad,DP Conway,Mohammed Shami,...,9,0,2,167,53,102,8,0.500000,1.706897,0.645517
4,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.5,DP Conway,RD Gaikwad,Mohammed Shami,...,9,0,2,55,161,102,8,0.400000,1.721739,0.871739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34961,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,9.5,SS Iyer,VR Iyer,AK Markram,...,7,3,10,194,215,7,13,1.864407,0.049180,7.265188
34962,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,9.6,VR Iyer,SS Iyer,AK Markram,...,7,3,10,223,186,7,13,1.850000,0.033333,7.263604
34963,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,10.1,VR Iyer,SS Iyer,Shahbaz Ahmed,...,7,3,10,223,186,156,6,1.836066,0.016949,7.261877
34964,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,10.2,SS Iyer,VR Iyer,Shahbaz Ahmed,...,7,3,10,194,215,156,6,1.822581,0.000000,7.260000


In [46]:
df['Previous Average Striker Runs 3'] = df.groupby(['Match ID', 'Innings No.'])['Average Striker Runs 3'].shift(1)
df['Previous Average Striker Runs 3'] = df['Previous Average Striker Runs 3'].fillna(0)

df['Previous Average Striker Runs 4'] = df.groupby(['Match ID', 'Innings No.'])['Average Striker Runs 4'].shift(1)
df['Previous Average Striker Runs 4'] = df['Previous Average Striker Runs 4'].fillna(0)

df['Previous Average Striker Runs 5'] = df.groupby(['Match ID', 'Innings No.'])['Average Striker Runs 5'].shift(1)
df['Previous Average Striker Runs 5'] = df['Previous Average Striker Runs 5'].fillna(0)

df['Previous Average Striker Runs 6'] = df.groupby(['Match ID', 'Innings No.'])['Average Striker Runs 6'].shift(1)
df['Previous Average Striker Runs 6'] = df['Previous Average Striker Runs 6'].fillna(0)

df['Previous Average Conceded Runs by Bowler 3'] = df.groupby(['Match ID', 'Innings No.'])['Average Conceded Runs by Bowler 3'].shift(1)
df['Previous Average Conceded Runs by Bowler 3'] = df['Previous Average Conceded Runs by Bowler 3'].fillna(0)

df['Previous Average Conceded Runs by Bowler 6'] = df.groupby(['Match ID', 'Innings No.'])['Average Conceded Runs by Bowler 6'].shift(1)
df['Previous Average Conceded Runs by Bowler 6'] = df['Previous Average Conceded Runs by Bowler 6'].fillna(0)

df['Previous Current Run Rate'] = df.groupby(['Match ID', 'Innings No.'])['Current Run Rate'].shift(1)
df['Previous Current Run Rate'] = df['Previous Current Run Rate'].fillna(0)

df['Previous Pressure Index'] = df.groupby(['Match ID', 'Innings No.'])['Pressure Index'].shift(1)
df['Previous Pressure Index'] = df['Previous Pressure Index'].fillna(2)

df['Previous Wickets Remaining'] = df.groupby(['Match ID', 'Innings No.'])['wicket_remaining'].shift(1)
df['Previous Wickets Remaining'] = df['Previous Wickets Remaining'].fillna(10)

df['Run Conceded by bowler on prev Bowl'] = df.groupby(['Match ID', 'Innings No.'])['Runs Conceded by Bowler'].shift(1)
df['Run Conceded by bowler on prev Bowl'] = df['Run Conceded by bowler on prev Bowl'].fillna(0)

In [47]:
legal_deliveries = df[df['is_legal_delivery'] == 1]

economy = legal_deliveries.groupby(['Match ID', 'Innings No.', 'Bowler']).agg(
    Total_Runs_Conceded=('Runs Conceded by Bowler', 'sum'),
    Legal_Balls=('is_legal_delivery', 'count')
).reset_index()

economy['Bowler Economy'] = economy['Total_Runs_Conceded'] / (economy['Legal_Balls'] / 6)

# Step 4: Merge back into df
df = df.merge(economy[['Match ID', 'Innings No.', 'Bowler', 'Bowler Economy']],
              on=['Match ID', 'Innings No.', 'Bowler'], how='left')

In [48]:
df['Bowler Economy till prev ball'] = df.groupby(['Match ID', 'Innings No.'])['Bowler Economy'].shift(1)
df['Bowler Economy till prev ball'] = df['Bowler Economy till prev ball'].fillna(0)

In [49]:
df

Unnamed: 0,Venue,Date,Match No.,Innings No.,Batting Team,Bowling Team,Over,Striker,Non Striker,Bowler,...,Previous Average Striker Runs 5,Previous Average Striker Runs 6,Previous Average Conceded Runs by Bowler 3,Previous Average Conceded Runs by Bowler 6,Previous Current Run Rate,Previous Pressure Index,Previous Wickets Remaining,Run Conceded by bowler on prev Bowl,Bowler Economy,Bowler Economy till prev ball
0,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.1,DP Conway,RD Gaikwad,Mohammed Shami,...,0.0,0.00,0.00,0.00,0.000000,2.000000,10.0,0.0,7.0,0.0
1,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.2,DP Conway,RD Gaikwad,Mohammed Shami,...,0.0,0.00,0.00,0.00,0.000000,0.760224,10.0,0.0,7.0,7.0
2,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.3,RD Gaikwad,DP Conway,Mohammed Shami,...,0.0,0.00,0.00,0.00,0.500000,0.422486,10.0,0.0,7.0,7.0
3,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.4,RD Gaikwad,DP Conway,Mohammed Shami,...,0.0,0.00,0.00,0.00,0.333333,0.651026,10.0,0.0,7.0,7.0
4,"Narendra Modi Stadium, Ahmedabad",2023-03-31,1,1,Chennai Super Kings,Gujarat Titans,0.5,DP Conway,RD Gaikwad,Mohammed Shami,...,0.0,0.00,0.00,0.00,0.500000,0.645517,10.0,1.0,7.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34961,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,9.5,SS Iyer,VR Iyer,AK Markram,...,1.2,2.00,0.67,2.00,1.879310,7.266637,8.0,1.0,5.0,5.0
34962,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,9.6,VR Iyer,SS Iyer,AK Markram,...,1.4,1.17,0.33,1.17,1.864407,7.265188,8.0,1.0,5.0,5.0
34963,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,10.1,VR Iyer,SS Iyer,Shahbaz Ahmed,...,0.8,1.33,0.67,1.33,1.850000,7.263604,8.0,1.0,8.8,5.0
34964,"MA Chidambaram Stadium, Chepauk, Chennai",2024-05-26,73,2,Kolkata Knight Riders,Sunrisers Hyderabad,10.2,SS Iyer,VR Iyer,Shahbaz Ahmed,...,0.6,0.83,1.00,0.83,1.836066,7.261877,8.0,1.0,8.8,8.8


In [50]:
numeric_df = df.select_dtypes(include=[np.number])
np.isinf(numeric_df).any()

Match No.                                     False
Innings No.                                   False
Over                                          False
Runs by Batsman                               False
Total runs on that bowl                       False
Runs Conceded by Bowler                       False
Cumulative Runs by Batsman                    False
Cumulative Runs Conceded by Bowler            False
Cumulative Team Runs                          False
Year                                          False
Match ID                                      False
is_wicket                                     False
Total Wickets                                 False
is_legal_delivery                             False
Team Balls Count                              False
Striker Balls Count                           False
Team Balls Left                               False
Batsman Runs on Previous Ball                 False
Total Runs on Previous Ball                   False
Over_Phase  

In [51]:
columns_with_nulls = df.columns[df.isnull().any()].tolist()
columns_with_nulls

[]

In [52]:
df.to_csv("Antim_Dataset.csv", index = False)