## ML Assignment: IPL Score Prediction with Run Rate Feature

### Data Preprocessing
- Loading the dataset  
- Handling missing values  
- Feature engineering (Run Rate addition)  

### Exploratory Data Analysis (EDA)
- Understanding key statistics  
- Visualizing distributions and trends  

### Feature Engineering
- Adding Run Rate  
- Creating additional relevant features  

### Model Selection & Training
- Splitting data into train and test sets  
- Choosing the right ML model  

### Model Evaluation
- Checking model performance  
- Analyzing errors  

### Predictions & Conclusion
- Making IPL score predictions  
- Key takeaways and improvements  


### Importing the Libraries


In [3]:
import pandas as pd
import numpy as np
import warnings 
import math
warnings.filterwarnings('ignore')

### Loading the data

In [5]:
df= pd.read_csv("IPL_ball_by_ball_2024.csv")

### Feature Engineering 

In [7]:
df.head(1)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,...,1,,,,1.0,,,,,


In [8]:
df.tail(1)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
260919,1426312,2024,2024-05-26,"MA Chidambaram Stadium, Chepauk, Chennai",2,10.3,Kolkata Knight Riders,Sunrisers Hyderabad,VR Iyer,SS Iyer,...,0,,,,,,,,,


#### Subsetting only playing teams

In [10]:
df['batting_team'].unique()

array(['Kolkata Knight Riders', 'Royal Challengers Bangalore',
       'Chennai Super Kings', 'Kings XI Punjab', 'Rajasthan Royals',
       'Delhi Daredevils', 'Deccan Chargers', 'Mumbai Indians',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Lucknow Super Giants', 'Gujarat Titans',
       'Royal Challengers Bengaluru'], dtype=object)

In [11]:
main_teams=['Kolkata Knight Riders',
       'Chennai Super Kings', 'Rajasthan Royals',
       'Mumbai Indians','Sunrisers Hyderabad',
       'Delhi Capitals', 'Punjab Kings',
       'Lucknow Super Giants', 'Gujarat Titans',
       'Royal Challengers Bengaluru']

In [12]:
df['batting_team'].isin(main_teams)

0         True
1         True
2         True
3         True
4         True
          ... 
260915    True
260916    True
260917    True
260918    True
260919    True
Name: batting_team, Length: 260920, dtype: bool

In [13]:
df[df['batting_team'].isin(main_teams)]

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,...,1,,,,1.0,,,,,
1,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.2,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,0,,,,,,,,,
2,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.3,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,1,1.0,,,,,,,,
3,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.4,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,0,,,,,,,,,
4,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.5,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260915,1426312,2024,2024-05-26,"MA Chidambaram Stadium, Chepauk, Chennai",2,9.5,Kolkata Knight Riders,Sunrisers Hyderabad,SS Iyer,VR Iyer,...,0,,,,,,,,,
260916,1426312,2024,2024-05-26,"MA Chidambaram Stadium, Chepauk, Chennai",2,9.6,Kolkata Knight Riders,Sunrisers Hyderabad,VR Iyer,SS Iyer,...,0,,,,,,,,,
260917,1426312,2024,2024-05-26,"MA Chidambaram Stadium, Chepauk, Chennai",2,10.1,Kolkata Knight Riders,Sunrisers Hyderabad,VR Iyer,SS Iyer,...,0,,,,,,,,,
260918,1426312,2024,2024-05-26,"MA Chidambaram Stadium, Chepauk, Chennai",2,10.2,Kolkata Knight Riders,Sunrisers Hyderabad,SS Iyer,VR Iyer,...,0,,,,,,,,,


In [14]:
df=df[df['batting_team'].isin(main_teams)]

In [15]:
df=df[df['bowling_team'].isin(main_teams)]

In [16]:
df['batting_team'].unique()

array(['Chennai Super Kings', 'Mumbai Indians', 'Kolkata Knight Riders',
       'Rajasthan Royals', 'Sunrisers Hyderabad', 'Delhi Capitals',
       'Punjab Kings', 'Lucknow Super Giants', 'Gujarat Titans',
       'Royal Challengers Bengaluru'], dtype=object)

In [17]:
df['bowling_team'].unique()

array(['Mumbai Indians', 'Chennai Super Kings', 'Kolkata Knight Riders',
       'Rajasthan Royals', 'Sunrisers Hyderabad', 'Delhi Capitals',
       'Punjab Kings', 'Gujarat Titans', 'Lucknow Super Giants',
       'Royal Challengers Bengaluru'], dtype=object)

In [18]:
df.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed'],
      dtype='object')

In [19]:
df.head(1)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
1624,335989,2007/08,2008-04-23,"MA Chidambaram Stadium, Chepauk",1,0.1,Chennai Super Kings,Mumbai Indians,PA Patel,ML Hayden,...,1,1.0,,,,,,,,


#### Subletting the data with required features

In [21]:
df.sort_values(['match_id', 'innings'], ascending=[True, True])

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
1624,335989,2007/08,2008-04-23,"MA Chidambaram Stadium, Chepauk",1,0.1,Chennai Super Kings,Mumbai Indians,PA Patel,ML Hayden,...,1,1.0,,,,,,,,
1625,335989,2007/08,2008-04-23,"MA Chidambaram Stadium, Chepauk",1,0.2,Chennai Super Kings,Mumbai Indians,PA Patel,ML Hayden,...,0,,,,,,,,,
1626,335989,2007/08,2008-04-23,"MA Chidambaram Stadium, Chepauk",1,0.3,Chennai Super Kings,Mumbai Indians,PA Patel,ML Hayden,...,0,,,,,,,,,
1627,335989,2007/08,2008-04-23,"MA Chidambaram Stadium, Chepauk",1,0.4,Chennai Super Kings,Mumbai Indians,PA Patel,ML Hayden,...,0,,,,,,,,,
1628,335989,2007/08,2008-04-23,"MA Chidambaram Stadium, Chepauk",1,0.5,Chennai Super Kings,Mumbai Indians,PA Patel,ML Hayden,...,0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260915,1426312,2024,2024-05-26,"MA Chidambaram Stadium, Chepauk, Chennai",2,9.5,Kolkata Knight Riders,Sunrisers Hyderabad,SS Iyer,VR Iyer,...,0,,,,,,,,,
260916,1426312,2024,2024-05-26,"MA Chidambaram Stadium, Chepauk, Chennai",2,9.6,Kolkata Knight Riders,Sunrisers Hyderabad,VR Iyer,SS Iyer,...,0,,,,,,,,,
260917,1426312,2024,2024-05-26,"MA Chidambaram Stadium, Chepauk, Chennai",2,10.1,Kolkata Knight Riders,Sunrisers Hyderabad,VR Iyer,SS Iyer,...,0,,,,,,,,,
260918,1426312,2024,2024-05-26,"MA Chidambaram Stadium, Chepauk, Chennai",2,10.2,Kolkata Knight Riders,Sunrisers Hyderabad,SS Iyer,VR Iyer,...,0,,,,,,,,,


In [22]:
df=df.sort_values(['match_id', 'innings'], ascending=[True, True])

In [23]:
df.head(1)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
1624,335989,2007/08,2008-04-23,"MA Chidambaram Stadium, Chepauk",1,0.1,Chennai Super Kings,Mumbai Indians,PA Patel,ML Hayden,...,1,1.0,,,,,,,,


In [24]:
df['overs']=df['ball']

In [25]:
#make string version of original column
df['ball']=df['ball'].astype(str)

In [26]:
df['ball'].str[0:-1]

1624       0.
1625       0.
1626       0.
1627       0.
1628       0.
         ... 
260915     9.
260916     9.
260917    10.
260918    10.
260919    10.
Name: ball, Length: 112334, dtype: object

In [27]:
df['over']=df['ball'].str[0:-1]

In [28]:
df['over'].str.replace('.', '')

1624       0
1625       0
1626       0
1627       0
1628       0
          ..
260915     9
260916     9
260917    10
260918    10
260919    10
Name: over, Length: 112334, dtype: object

In [29]:
df['over']=df['over'].str.replace('.', '')

In [30]:
df['ball'].str[-1]

1624      1
1625      2
1626      3
1627      4
1628      5
         ..
260915    5
260916    6
260917    1
260918    2
260919    3
Name: ball, Length: 112334, dtype: object

In [31]:
df['ball']=df['ball'].str[-1]

In [32]:
df['ball']= df['ball'].astype(int)

In [33]:
df['over']= df['over'].astype(int)

In [34]:
df.head(5)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed,overs,over
1624,335989,2007/08,2008-04-23,"MA Chidambaram Stadium, Chepauk",1,1,Chennai Super Kings,Mumbai Indians,PA Patel,ML Hayden,...,,,,,,,,,0.1,0
1625,335989,2007/08,2008-04-23,"MA Chidambaram Stadium, Chepauk",1,2,Chennai Super Kings,Mumbai Indians,PA Patel,ML Hayden,...,,,,,,,,,0.2,0
1626,335989,2007/08,2008-04-23,"MA Chidambaram Stadium, Chepauk",1,3,Chennai Super Kings,Mumbai Indians,PA Patel,ML Hayden,...,,,,,,,,,0.3,0
1627,335989,2007/08,2008-04-23,"MA Chidambaram Stadium, Chepauk",1,4,Chennai Super Kings,Mumbai Indians,PA Patel,ML Hayden,...,,,,,,,,,0.4,0
1628,335989,2007/08,2008-04-23,"MA Chidambaram Stadium, Chepauk",1,5,Chennai Super Kings,Mumbai Indians,PA Patel,ML Hayden,...,,,,,,,,,0.5,0


In [35]:
df['total_runs']=df['runs_off_bat']+df['extras']

In [36]:
df.head(1)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed,overs,over,total_runs
1624,335989,2007/08,2008-04-23,"MA Chidambaram Stadium, Chepauk",1,1,Chennai Super Kings,Mumbai Indians,PA Patel,ML Hayden,...,,,,,,,,0.1,0,1


In [37]:
df=df[['match_id', 'innings', 'batting_team', 'bowling_team', 'overs', 'over', 'ball',
       'total_runs', 'player_dismissed',]]

In [38]:
df=df.rename(columns={"match_id": "id"})

In [39]:
df

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed
1624,335989,1,Chennai Super Kings,Mumbai Indians,0.1,0,1,1,
1625,335989,1,Chennai Super Kings,Mumbai Indians,0.2,0,2,0,
1626,335989,1,Chennai Super Kings,Mumbai Indians,0.3,0,3,0,
1627,335989,1,Chennai Super Kings,Mumbai Indians,0.4,0,4,0,
1628,335989,1,Chennai Super Kings,Mumbai Indians,0.5,0,5,4,
...,...,...,...,...,...,...,...,...,...
260915,1426312,2,Kolkata Knight Riders,Sunrisers Hyderabad,9.5,9,5,1,
260916,1426312,2,Kolkata Knight Riders,Sunrisers Hyderabad,9.6,9,6,1,
260917,1426312,2,Kolkata Knight Riders,Sunrisers Hyderabad,10.1,10,1,1,
260918,1426312,2,Kolkata Knight Riders,Sunrisers Hyderabad,10.2,10,2,1,


In [40]:
df = df.replace(np.nan, 0)

In [41]:
df

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed
1624,335989,1,Chennai Super Kings,Mumbai Indians,0.1,0,1,1,0
1625,335989,1,Chennai Super Kings,Mumbai Indians,0.2,0,2,0,0
1626,335989,1,Chennai Super Kings,Mumbai Indians,0.3,0,3,0,0
1627,335989,1,Chennai Super Kings,Mumbai Indians,0.4,0,4,0,0
1628,335989,1,Chennai Super Kings,Mumbai Indians,0.5,0,5,4,0
...,...,...,...,...,...,...,...,...,...
260915,1426312,2,Kolkata Knight Riders,Sunrisers Hyderabad,9.5,9,5,1,0
260916,1426312,2,Kolkata Knight Riders,Sunrisers Hyderabad,9.6,9,6,1,0
260917,1426312,2,Kolkata Knight Riders,Sunrisers Hyderabad,10.1,10,1,1,0
260918,1426312,2,Kolkata Knight Riders,Sunrisers Hyderabad,10.2,10,2,1,0


### Total runs scored in the innings

In [43]:
df.groupby(['id', 'innings'])['total_runs'].transform('sum')

1624      208
1625      208
1626      208
1627      208
1628      208
         ... 
260915    114
260916    114
260917    114
260918    114
260919    114
Name: total_runs, Length: 112334, dtype: int64

In [44]:
df['total']=df.groupby(['id', 'innings'])['total_runs'].transform('sum')

### Runs scored till current ball

In [46]:
df.groupby(['id', 'innings'])['total_runs'].apply(lambda x: x.cumsum())

id       innings        
335989   1        1624        1
                  1625        1
                  1626        1
                  1627        1
                  1628        5
                           ... 
1426312  2        260915    110
                  260916    111
                  260917    112
                  260918    113
                  260919    114
Name: total_runs, Length: 112334, dtype: int64

In [47]:
df['total_score'] = df.groupby(['id', 'innings'])['total_runs'].transform(lambda x: x.cumsum())


In [48]:
df.head()

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score
1624,335989,1,Chennai Super Kings,Mumbai Indians,0.1,0,1,1,0,208,1
1625,335989,1,Chennai Super Kings,Mumbai Indians,0.2,0,2,0,0,208,1
1626,335989,1,Chennai Super Kings,Mumbai Indians,0.3,0,3,0,0,208,1
1627,335989,1,Chennai Super Kings,Mumbai Indians,0.4,0,4,0,0,208,1
1628,335989,1,Chennai Super Kings,Mumbai Indians,0.5,0,5,4,0,208,5


### Runs scored in previous 30 balls

In [50]:
df.groupby(['id','innings'])['total_runs'].rolling(min_periods=1, window=30).sum().reset_index()

Unnamed: 0,id,innings,level_2,total_runs
0,335989,1,1624,1.0
1,335989,1,1625,1.0
2,335989,1,1626,1.0
3,335989,1,1627,1.0
4,335989,1,1628,5.0
...,...,...,...,...
112329,1426312,2,260915,58.0
112330,1426312,2,260916,59.0
112331,1426312,2,260917,56.0
112332,1426312,2,260918,53.0


In [51]:
tmp=df.groupby(['id','innings'])['total_runs'].rolling(min_periods=1, window=30).sum().reset_index()

In [52]:
tmp[['total_runs']]

Unnamed: 0,total_runs
0,1.0
1,1.0
2,1.0
3,1.0
4,5.0
...,...
112329,58.0
112330,59.0
112331,56.0
112332,53.0


In [53]:
df['prev_30_runs']=tmp['total_runs'].to_list()

In [54]:
df.head(10)

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs
1624,335989,1,Chennai Super Kings,Mumbai Indians,0.1,0,1,1,0,208,1,1.0
1625,335989,1,Chennai Super Kings,Mumbai Indians,0.2,0,2,0,0,208,1,1.0
1626,335989,1,Chennai Super Kings,Mumbai Indians,0.3,0,3,0,0,208,1,1.0
1627,335989,1,Chennai Super Kings,Mumbai Indians,0.4,0,4,0,0,208,1,1.0
1628,335989,1,Chennai Super Kings,Mumbai Indians,0.5,0,5,4,0,208,5,5.0
1629,335989,1,Chennai Super Kings,Mumbai Indians,0.6,0,6,0,0,208,5,5.0
1630,335989,1,Chennai Super Kings,Mumbai Indians,0.7,0,7,2,0,208,7,7.0
1631,335989,1,Chennai Super Kings,Mumbai Indians,1.1,1,1,0,0,208,7,7.0
1632,335989,1,Chennai Super Kings,Mumbai Indians,1.2,1,2,1,0,208,8,8.0
1633,335989,1,Chennai Super Kings,Mumbai Indians,1.3,1,3,0,0,208,8,8.0


### Wickets fallen in previous 30 balls

In [56]:
df['player_dismissed'].unique()

array([0, 'PA Patel', 'MEK Hussey', 'SK Raina', 'ML Hayden', 'MS Dhoni',
       'L Ronchi', 'ST Jayasuriya', 'RV Uthappa', 'SM Pollock',
       'DJ Bravo', 'MA Khote', 'Harbhajan Singh', 'BB McCullum',
       'RT Ponting', 'WP Saha', 'DJ Hussey', 'SC Ganguly',
       'Mohammad Hafeez', 'AB Agarkar', 'M Kartik', 'LR Shukla', 'DB Das',
       'AM Rahane', 'MK Pandey', 'GC Smith', 'M Kaif', 'SA Asnodkar',
       'SR Watson', 'RA Jadeja', 'YK Pathan', 'SK Warne', 'Salman Butt',
       'BJ Hodge', 'Umar Gul', 'AB Dinda', 'SP Fleming', 'S Vidyut',
       'S Badrinath', 'Joginder Sharma', 'JA Morkel', 'M Muralitharan',
       'M Ntini', 'Sohail Tanvir', 'M Rawat', 'SK Trivedi', 'YV Takawale',
       'CK Kapugedera', 'SR Tendulkar', 'A Chopra', 'I Sharma',
       'Shoaib Akhtar', 'T Kohli', 'A Mukund', 'AM Nayar', 'DR Smith',
       'Kamran Akmal', 'Niraj Patel', 'S Dhawan', 'JP Duminy', 'Z Khan',
       'A Flintoff', 'JDP Oram', 'PC Valthaty', 'RJ Quiney',
       'AD Mascarenhas', 'CH Gayle',

In [57]:
df['player_dismissed']=np.where(df['player_dismissed']==0, 0, 1)

In [58]:
tmp=df.groupby(['id', 'innings'])['player_dismissed'].rolling(min_periods=1, window=30).sum().reset_index()

df['prev_30_wickets']=tmp['player_dismissed'].to_list()

In [59]:
df.head(60)

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs,prev_30_wickets
1624,335989,1,Chennai Super Kings,Mumbai Indians,0.1,0,1,1,0,208,1,1.0,0.0
1625,335989,1,Chennai Super Kings,Mumbai Indians,0.2,0,2,0,0,208,1,1.0,0.0
1626,335989,1,Chennai Super Kings,Mumbai Indians,0.3,0,3,0,0,208,1,1.0,0.0
1627,335989,1,Chennai Super Kings,Mumbai Indians,0.4,0,4,0,0,208,1,1.0,0.0
1628,335989,1,Chennai Super Kings,Mumbai Indians,0.5,0,5,4,0,208,5,5.0,0.0
1629,335989,1,Chennai Super Kings,Mumbai Indians,0.6,0,6,0,0,208,5,5.0,0.0
1630,335989,1,Chennai Super Kings,Mumbai Indians,0.7,0,7,2,0,208,7,7.0,0.0
1631,335989,1,Chennai Super Kings,Mumbai Indians,1.1,1,1,0,0,208,7,7.0,0.0
1632,335989,1,Chennai Super Kings,Mumbai Indians,1.2,1,2,1,0,208,8,8.0,0.0
1633,335989,1,Chennai Super Kings,Mumbai Indians,1.3,1,3,0,0,208,8,8.0,0.0


###  Wickets fallen till the current ball

In [61]:
df.groupby(['id', 'innings'])['player_dismissed'].apply(lambda x: x.cumsum())

id       innings        
335989   1        1624      0
                  1625      0
                  1626      0
                  1627      0
                  1628      0
                           ..
1426312  2        260915    2
                  260916    2
                  260917    2
                  260918    2
                  260919    2
Name: player_dismissed, Length: 112334, dtype: int32

In [62]:
df['total_wickets'] = df.groupby(['id', 'innings'])['player_dismissed'].transform(lambda x: x.cumsum())


### Dot balls in previous 30 balls

In [64]:
df['prev_30_dot_balls']=df['total_runs']
df['prev_30_dot_balls']=np.where(df['prev_30_dot_balls']==0, 1, 0)

tmp=df.groupby(['id', 'innings'])['prev_30_dot_balls'].rolling(min_periods=1, window=30).sum().reset_index()
df['prev_30_dot_balls']=tmp['prev_30_dot_balls'].to_list()

### Boundaries in previous 30 balls

In [66]:
df['prev_30_boundaries']=df['total_runs']
df['prev_30_boundaries']=np.where(df['prev_30_boundaries']>3, 1, 0)

tmp=df.groupby(['id', 'innings'])['prev_30_boundaries'].rolling(min_periods=1, window=30).sum().reset_index()
df['prev_30_boundaries']=tmp['prev_30_boundaries'].to_list()

### Run rate till current ball

In [68]:
df['total_balls']=df['over']*6+df['ball']

In [69]:
df.head(20)

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs,prev_30_wickets,total_wickets,prev_30_dot_balls,prev_30_boundaries,total_balls
1624,335989,1,Chennai Super Kings,Mumbai Indians,0.1,0,1,1,0,208,1,1.0,0.0,0,0.0,0.0,1
1625,335989,1,Chennai Super Kings,Mumbai Indians,0.2,0,2,0,0,208,1,1.0,0.0,0,1.0,0.0,2
1626,335989,1,Chennai Super Kings,Mumbai Indians,0.3,0,3,0,0,208,1,1.0,0.0,0,2.0,0.0,3
1627,335989,1,Chennai Super Kings,Mumbai Indians,0.4,0,4,0,0,208,1,1.0,0.0,0,3.0,0.0,4
1628,335989,1,Chennai Super Kings,Mumbai Indians,0.5,0,5,4,0,208,5,5.0,0.0,0,3.0,1.0,5
1629,335989,1,Chennai Super Kings,Mumbai Indians,0.6,0,6,0,0,208,5,5.0,0.0,0,4.0,1.0,6
1630,335989,1,Chennai Super Kings,Mumbai Indians,0.7,0,7,2,0,208,7,7.0,0.0,0,4.0,1.0,7
1631,335989,1,Chennai Super Kings,Mumbai Indians,1.1,1,1,0,0,208,7,7.0,0.0,0,5.0,1.0,7
1632,335989,1,Chennai Super Kings,Mumbai Indians,1.2,1,2,1,0,208,8,8.0,0.0,0,5.0,1.0,8
1633,335989,1,Chennai Super Kings,Mumbai Indians,1.3,1,3,0,0,208,8,8.0,0.0,0,6.0,1.0,9


In [70]:
df['run_rate']=6*(df['total_score'])/df['total_balls']

In [71]:
df.head(100)

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs,prev_30_wickets,total_wickets,prev_30_dot_balls,prev_30_boundaries,total_balls,run_rate
1624,335989,1,Chennai Super Kings,Mumbai Indians,0.1,0,1,1,0,208,1,1.0,0.0,0,0.0,0.0,1,6.000000
1625,335989,1,Chennai Super Kings,Mumbai Indians,0.2,0,2,0,0,208,1,1.0,0.0,0,1.0,0.0,2,3.000000
1626,335989,1,Chennai Super Kings,Mumbai Indians,0.3,0,3,0,0,208,1,1.0,0.0,0,2.0,0.0,3,2.000000
1627,335989,1,Chennai Super Kings,Mumbai Indians,0.4,0,4,0,0,208,1,1.0,0.0,0,3.0,0.0,4,1.500000
1628,335989,1,Chennai Super Kings,Mumbai Indians,0.5,0,5,4,0,208,5,5.0,0.0,0,3.0,1.0,5,6.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1719,335989,1,Chennai Super Kings,Mumbai Indians,15.1,15,1,1,0,208,144,55.0,1.0,3,6.0,8.0,91,9.494505
1720,335989,1,Chennai Super Kings,Mumbai Indians,15.2,15,2,4,0,208,148,58.0,1.0,3,6.0,9.0,92,9.652174
1721,335989,1,Chennai Super Kings,Mumbai Indians,15.3,15,3,0,0,208,148,57.0,1.0,3,7.0,9.0,93,9.548387
1722,335989,1,Chennai Super Kings,Mumbai Indians,15.4,15,4,4,0,208,152,57.0,1.0,3,7.0,9.0,94,9.702128


### Run rate in previous 30 balls

In [73]:
df.groupby(['id','innings'])['run_rate'].rolling(min_periods=1, window=30).mean().reset_index()

Unnamed: 0,id,innings,level_2,run_rate
0,335989,1,1624,6.000000
1,335989,1,1625,4.500000
2,335989,1,1626,3.666667
3,335989,1,1627,3.125000
4,335989,1,1628,3.700000
...,...,...,...,...
112329,1426312,2,260915,11.676370
112330,1426312,2,260916,11.699704
112331,1426312,2,260917,11.705626
112332,1426312,2,260918,11.695143


In [74]:
tmp=df.groupby(['id','innings'])['run_rate'].rolling(min_periods=1, window=30).mean().reset_index()

In [75]:
tmp['run_rate']

0          6.000000
1          4.500000
2          3.666667
3          3.125000
4          3.700000
            ...    
112329    11.676370
112330    11.699704
112331    11.705626
112332    11.695143
112333    11.657047
Name: run_rate, Length: 112334, dtype: float64

In [76]:
df['prev_30_run_rate']=tmp['run_rate'].to_list()

In [77]:
df.head(5)

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs,prev_30_wickets,total_wickets,prev_30_dot_balls,prev_30_boundaries,total_balls,run_rate,prev_30_run_rate
1624,335989,1,Chennai Super Kings,Mumbai Indians,0.1,0,1,1,0,208,1,1.0,0.0,0,0.0,0.0,1,6.0,6.0
1625,335989,1,Chennai Super Kings,Mumbai Indians,0.2,0,2,0,0,208,1,1.0,0.0,0,1.0,0.0,2,3.0,4.5
1626,335989,1,Chennai Super Kings,Mumbai Indians,0.3,0,3,0,0,208,1,1.0,0.0,0,2.0,0.0,3,2.0,3.666667
1627,335989,1,Chennai Super Kings,Mumbai Indians,0.4,0,4,0,0,208,1,1.0,0.0,0,3.0,0.0,4,1.5,3.125
1628,335989,1,Chennai Super Kings,Mumbai Indians,0.5,0,5,4,0,208,5,5.0,0.0,0,3.0,1.0,5,6.0,3.7


In [78]:
df.tail(5)

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs,prev_30_wickets,total_wickets,prev_30_dot_balls,prev_30_boundaries,total_balls,run_rate,prev_30_run_rate
260915,1426312,2,Kolkata Knight Riders,Sunrisers Hyderabad,9.5,9,5,1,0,114,110,58.0,1.0,2,8.0,9.0,59,11.186441,11.67637
260916,1426312,2,Kolkata Knight Riders,Sunrisers Hyderabad,9.6,9,6,1,0,114,111,59.0,1.0,2,7.0,9.0,60,11.1,11.699704
260917,1426312,2,Kolkata Knight Riders,Sunrisers Hyderabad,10.1,10,1,1,0,114,112,56.0,1.0,2,7.0,8.0,61,11.016393,11.705626
260918,1426312,2,Kolkata Knight Riders,Sunrisers Hyderabad,10.2,10,2,1,0,114,113,53.0,1.0,2,7.0,7.0,62,10.935484,11.695143
260919,1426312,2,Kolkata Knight Riders,Sunrisers Hyderabad,10.3,10,3,1,0,114,114,48.0,1.0,2,7.0,6.0,63,10.857143,11.657047


In [79]:
convert_dict = {'prev_30_runs':int,
               'prev_30_wickets':int,
               'prev_30_dot_balls':int,
               'prev_30_boundaries':int,
               'run_rate':int,
               'prev_30_run_rate':int,
               }
df = df.astype(convert_dict)

In [80]:
df 

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs,prev_30_wickets,total_wickets,prev_30_dot_balls,prev_30_boundaries,total_balls,run_rate,prev_30_run_rate
1624,335989,1,Chennai Super Kings,Mumbai Indians,0.1,0,1,1,0,208,1,1,0,0,0,0,1,6,6
1625,335989,1,Chennai Super Kings,Mumbai Indians,0.2,0,2,0,0,208,1,1,0,0,1,0,2,3,4
1626,335989,1,Chennai Super Kings,Mumbai Indians,0.3,0,3,0,0,208,1,1,0,0,2,0,3,2,3
1627,335989,1,Chennai Super Kings,Mumbai Indians,0.4,0,4,0,0,208,1,1,0,0,3,0,4,1,3
1628,335989,1,Chennai Super Kings,Mumbai Indians,0.5,0,5,4,0,208,5,5,0,0,3,1,5,6,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260915,1426312,2,Kolkata Knight Riders,Sunrisers Hyderabad,9.5,9,5,1,0,114,110,58,1,2,8,9,59,11,11
260916,1426312,2,Kolkata Knight Riders,Sunrisers Hyderabad,9.6,9,6,1,0,114,111,59,1,2,7,9,60,11,11
260917,1426312,2,Kolkata Knight Riders,Sunrisers Hyderabad,10.1,10,1,1,0,114,112,56,1,2,7,8,61,11,11
260918,1426312,2,Kolkata Knight Riders,Sunrisers Hyderabad,10.2,10,2,1,0,114,113,53,1,2,7,7,62,10,11


In [81]:
df.to_csv('Data_with_add_features.csv', index=None)

### Importing the Libraries

In [83]:
# Import necessary libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets
from sklearn.linear_model import LinearRegression  # For building a linear regression model
import pickle  # For saving and loading the trained model
from sklearn.metrics import mean_absolute_error  # For evaluating model performance


### Loading the data

In [85]:
df= pd.read_csv("Data_with_add_features.csv")

In [86]:
df.head(5)

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs,prev_30_wickets,total_wickets,prev_30_dot_balls,prev_30_boundaries,total_balls,run_rate,prev_30_run_rate
0,335989,1,Chennai Super Kings,Mumbai Indians,0.1,0,1,1,0,208,1,1,0,0,0,0,1,6,6
1,335989,1,Chennai Super Kings,Mumbai Indians,0.2,0,2,0,0,208,1,1,0,0,1,0,2,3,4
2,335989,1,Chennai Super Kings,Mumbai Indians,0.3,0,3,0,0,208,1,1,0,0,2,0,3,2,3
3,335989,1,Chennai Super Kings,Mumbai Indians,0.4,0,4,0,0,208,1,1,0,0,3,0,4,1,3
4,335989,1,Chennai Super Kings,Mumbai Indians,0.5,0,5,4,0,208,5,5,0,0,3,1,5,6,3


In [87]:
pd.get_dummies(data=df, columns=['batting_team', 'bowling_team'])

Unnamed: 0,id,innings,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs,...,bowling_team_Chennai Super Kings,bowling_team_Delhi Capitals,bowling_team_Gujarat Titans,bowling_team_Kolkata Knight Riders,bowling_team_Lucknow Super Giants,bowling_team_Mumbai Indians,bowling_team_Punjab Kings,bowling_team_Rajasthan Royals,bowling_team_Royal Challengers Bengaluru,bowling_team_Sunrisers Hyderabad
0,335989,1,0.1,0,1,1,0,208,1,1,...,False,False,False,False,False,True,False,False,False,False
1,335989,1,0.2,0,2,0,0,208,1,1,...,False,False,False,False,False,True,False,False,False,False
2,335989,1,0.3,0,3,0,0,208,1,1,...,False,False,False,False,False,True,False,False,False,False
3,335989,1,0.4,0,4,0,0,208,1,1,...,False,False,False,False,False,True,False,False,False,False
4,335989,1,0.5,0,5,4,0,208,5,5,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112329,1426312,2,9.5,9,5,1,0,114,110,58,...,False,False,False,False,False,False,False,False,False,True
112330,1426312,2,9.6,9,6,1,0,114,111,59,...,False,False,False,False,False,False,False,False,False,True
112331,1426312,2,10.1,10,1,1,0,114,112,56,...,False,False,False,False,False,False,False,False,False,True
112332,1426312,2,10.2,10,2,1,0,114,113,53,...,False,False,False,False,False,False,False,False,False,True


In [88]:
df=pd.get_dummies(data=df, columns=['batting_team', 'bowling_team'])

In [89]:
df.columns

Index(['id', 'innings', 'overs', 'over', 'ball', 'total_runs',
       'player_dismissed', 'total', 'total_score', 'prev_30_runs',
       'prev_30_wickets', 'total_wickets', 'prev_30_dot_balls',
       'prev_30_boundaries', 'total_balls', 'run_rate', 'prev_30_run_rate',
       'batting_team_Chennai Super Kings', 'batting_team_Delhi Capitals',
       'batting_team_Gujarat Titans', 'batting_team_Kolkata Knight Riders',
       'batting_team_Lucknow Super Giants', 'batting_team_Mumbai Indians',
       'batting_team_Punjab Kings', 'batting_team_Rajasthan Royals',
       'batting_team_Royal Challengers Bengaluru',
       'batting_team_Sunrisers Hyderabad', 'bowling_team_Chennai Super Kings',
       'bowling_team_Delhi Capitals', 'bowling_team_Gujarat Titans',
       'bowling_team_Kolkata Knight Riders',
       'bowling_team_Lucknow Super Giants', 'bowling_team_Mumbai Indians',
       'bowling_team_Punjab Kings', 'bowling_team_Rajasthan Royals',
       'bowling_team_Royal Challengers Bengaluru

In [90]:
df=df[['id','batting_team_Chennai Super Kings', 'batting_team_Delhi Capitals',
       'batting_team_Gujarat Titans', 'batting_team_Kolkata Knight Riders',
       'batting_team_Lucknow Super Giants', 'batting_team_Mumbai Indians',
       'batting_team_Punjab Kings', 'batting_team_Rajasthan Royals',
       'batting_team_Royal Challengers Bengaluru',
       'batting_team_Sunrisers Hyderabad', 'bowling_team_Chennai Super Kings',
       'bowling_team_Delhi Capitals', 'bowling_team_Gujarat Titans',
       'bowling_team_Kolkata Knight Riders',
       'bowling_team_Lucknow Super Giants', 'bowling_team_Mumbai Indians',
       'bowling_team_Punjab Kings', 'bowling_team_Rajasthan Royals',
       'bowling_team_Royal Challengers Bengaluru',
       'bowling_team_Sunrisers Hyderabad','overs', 'total_score','total_wickets', 'run_rate', 'prev_30_runs',
       'prev_30_wickets', 'prev_30_dot_balls','prev_30_boundaries', 'prev_30_run_rate','total']]

In [91]:
df.head()

Unnamed: 0,id,batting_team_Chennai Super Kings,batting_team_Delhi Capitals,batting_team_Gujarat Titans,batting_team_Kolkata Knight Riders,batting_team_Lucknow Super Giants,batting_team_Mumbai Indians,batting_team_Punjab Kings,batting_team_Rajasthan Royals,batting_team_Royal Challengers Bengaluru,...,overs,total_score,total_wickets,run_rate,prev_30_runs,prev_30_wickets,prev_30_dot_balls,prev_30_boundaries,prev_30_run_rate,total
0,335989,True,False,False,False,False,False,False,False,False,...,0.1,1,0,6,1,0,0,0,6,208
1,335989,True,False,False,False,False,False,False,False,False,...,0.2,1,0,3,1,0,1,0,4,208
2,335989,True,False,False,False,False,False,False,False,False,...,0.3,1,0,2,1,0,2,0,3,208
3,335989,True,False,False,False,False,False,False,False,False,...,0.4,1,0,1,1,0,3,0,3,208
4,335989,True,False,False,False,False,False,False,False,False,...,0.5,5,0,6,5,0,3,1,3,208


### train_test_split

In [93]:
X= df.drop(labels=['total','id'], axis=1)
y= df['total'].values

In [94]:
# split into train test sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42,stratify=y)

In [95]:
X_train= X_train.values
X_test=X_test.values
X_train=np.asarray(X_train).astype(np.float32)
X_test=np.asarray(X_test).astype(np.float32)

In [96]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(84250, 29) (28084, 29) (84250,) (28084,)


### Training LR model

In [98]:
LR_model = LinearRegression()
LR_model.fit(X_train, y_train)

In [99]:
#Creating a pickle file for the classifier
filename= 'lr-model.pkl'
pickle.dump(LR_model, open(filename, 'wb'))

In [100]:
prediction=LR_model.predict(X_test)
mean_absolute_error (y_test, prediction)

16.952388188044683

In [101]:
# Function to predict scores :

def score_prediction(Bat_team,Bowl_team,overs,total_score,total_wickets,run_rate,prev_runs_30,prev_wickets_30,prev_30_dot_balls,prev_30_boundaries,prev_30_run_rate):
     
    temp_array = list()
    
    if Bat_team == 'CSK':
        temp_array = temp_array + [1,0,0,0,0,0,0,0,0,0]
    elif Bat_team == 'DC':
        temp_array = temp_array + [0,1,0,0,0,0,0,0,0,0]
    elif Bat_team == 'GT':
        temp_array = temp_array + [0,0,1,0,0,0,0,0,0,0]
    elif Bat_team == 'KKR':
        temp_array = temp_array + [0,0,0,1,0,0,0,0,0,0]
    elif Bat_team == 'LSG':
        temp_array = temp_array + [0,0,0,0,1,0,0,0,0,0]
    elif Bat_team == 'MI':
        temp_array = temp_array + [0,0,0,0,0,1,0,0,0,0]
    elif Bat_team == 'PK':
        temp_array = temp_array + [0,0,0,0,0,0,1,0,0,0]
    elif Bat_team == 'RR':
        temp_array = temp_array + [0,0,0,0,0,0,0,1,0,0]
    elif Bat_team == 'RCB':
        temp_array = temp_array + [0,0,0,0,0,0,0,0,1,0]
    elif Bat_team == 'SRH':
        temp_array = temp_array + [0,0,0,0,0,0,0,0,0,1]
   
        
    if Bowl_team == 'CSK':
        temp_array = temp_array + [1,0,0,0,0,0,0,0,0,0]
    elif Bowl_team == 'DC':
        temp_array = temp_array + [0,1,0,0,0,0,0,0,0,0]
    elif Bowl_team == 'GT':
        temp_array = temp_array + [0,0,1,0,0,0,0,0,0,0]
    elif Bowl_team == 'KKR':
        temp_array = temp_array + [0,0,0,1,0,0,0,0,0,0]
    elif Bowl_team == 'LSG':
        temp_array = temp_array + [0,0,0,0,1,0,0,0,0,0]
    elif Bowl_team == 'MI':
        temp_array = temp_array + [0,0,0,0,0,1,0,0,0,0]
    elif Bowl_team == 'PK':
        temp_array = temp_array + [0,0,0,0,0,0,1,0,0,0]
    elif Bowl_team == 'RR':
        temp_array = temp_array + [0,0,0,0,0,0,0,1,0,0]
    elif Bowl_team == 'RCB':
        temp_array = temp_array + [0,0,0,0,0,0,0,0,1,0]
    elif Bowl_team == 'SRH':
        temp_array = temp_array + [0,0,0,0,0,0,0,0,0,1]
    temp_array = temp_array + [overs,total_score,total_wickets,run_rate,prev_runs_30,prev_wickets_30,prev_30_dot_balls,prev_30_boundaries,prev_30_run_rate]
    data = np.array([temp_array])
    my_prediction = int(LR_model.predict(data))
    
    print('Predicted score: ',my_prediction)
    print('Predicted score range: ',my_prediction -20, 'to' ,my_prediction + 20)


In [102]:
Bat_team = 'RR' 

Bowl_team = 'SRH' 

overs = 10.3

total_score = 101 # current score

total_wickets = 2 # current wicket 

run_rate = 10

prev_runs_30 = 30 # runs in prev 30 balls

prev_wickets_30 = 1 # wickets in prev 30 balls

prev_30_dot_balls = 5 # dots in prev 30 balls

prev_30_boundaries = 7 # boundaries in prev 30 balls

prev_30_run_rate = 8

score_prediction(Bat_team,Bowl_team,overs,total_score,total_wickets,run_rate,prev_runs_30,prev_wickets_30,prev_30_dot_balls,prev_30_boundaries,prev_30_run_rate)

Predicted score:  181
Predicted score range:  161 to 201
