# Assignment - Add one or more additional features to the existing ML model & find out its accuracy

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pickle
from sklearn.metrics import mean_absolute_error  ,r2_score

In [2]:
df= pd.read_csv("Data.csv")

In [3]:
df.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.1,Australia,Sri Lanka,AJ Finch,M Klinger,...,0,,,,,,,,,
1,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.2,Australia,Sri Lanka,AJ Finch,M Klinger,...,0,,,,,,,,,
2,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.3,Australia,Sri Lanka,AJ Finch,M Klinger,...,0,,,,,,,,,
3,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.4,Australia,Sri Lanka,M Klinger,AJ Finch,...,0,,,,,,,,,
4,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.5,Australia,Sri Lanka,M Klinger,AJ Finch,...,0,,,,,,,,,


## Feature Engineering

In [4]:
df['batting_team'].unique()

array(['Australia', 'Sri Lanka', 'Hong Kong', 'Ireland', 'Zimbabwe',
       'India', 'Bangladesh', 'New Zealand', 'South Africa', 'England',
       'West Indies', 'Afghanistan', 'Pakistan', 'United Arab Emirates',
       'Scotland', 'Oman', 'Netherlands', 'Papua New Guinea',
       'ICC World XI', 'Nepal', 'Philippines', 'Vanuatu',
       'United States of America', 'Germany', 'Italy', 'Ghana', 'Namibia',
       'Uganda', 'Botswana', 'Kenya', 'Nigeria', 'Guernsey', 'Denmark',
       'Norway', 'Jersey', 'Thailand', 'Malaysia', 'Maldives',
       'Singapore', 'Qatar', 'Kuwait', 'Bermuda', 'Canada',
       'Cayman Islands', 'Portugal', 'Spain', 'Gibraltar', 'Bhutan',
       'Saudi Arabia', 'Bahrain', 'Iran', 'Belgium', 'Luxembourg',
       'Czech Republic', 'Isle of Man', 'Bulgaria', 'Romania'],
      dtype=object)

## Subsetting only consistent teams

In [5]:
consistent_teams=['England','Australia','West Indies','India','Bangladesh','New Zealand','Pakistan','South Africa','Sri Lanka']

In [6]:
df['batting_team'].isin(consistent_teams)

0         True
1         True
2         True
3         True
4         True
          ... 
221728    True
221729    True
221730    True
221731    True
221732    True
Name: batting_team, Length: 221733, dtype: bool

In [7]:
df=df[df['batting_team'].isin(consistent_teams)]

In [8]:
df=df[df['bowling_team'].isin(consistent_teams)]

In [9]:
df['batting_team'].unique()

array(['Australia', 'Sri Lanka', 'Bangladesh', 'New Zealand',
       'South Africa', 'England', 'West Indies', 'India', 'Pakistan'],
      dtype=object)

In [10]:
df['bowling_team'].unique()

array(['Sri Lanka', 'Australia', 'New Zealand', 'Bangladesh',
       'South Africa', 'England', 'West Indies', 'India', 'Pakistan'],
      dtype=object)

In [11]:
df.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed'],
      dtype='object')

In [12]:
df.head(1)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,1001349,2016/17,2017-02-17,Melbourne Cricket Ground,1,0.1,Australia,Sri Lanka,AJ Finch,M Klinger,...,0,,,,,,,,,


## Subsetting the data with required features

In [13]:
df.sort_values(['match_id', 'innings'], ascending=[True, True])

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
101048,211028,2005,2005-06-13,The Rose Bowl,1,0.1,England,Australia,ME Trescothick,GO Jones,...,0,,,,,,,,,
101049,211028,2005,2005-06-13,The Rose Bowl,1,0.2,England,Australia,ME Trescothick,GO Jones,...,0,,,,,,,,,
101050,211028,2005,2005-06-13,The Rose Bowl,1,0.3,England,Australia,GO Jones,ME Trescothick,...,0,,,,,,,,,
101051,211028,2005,2005-06-13,The Rose Bowl,1,0.4,England,Australia,GO Jones,ME Trescothick,...,0,,,,,,,,,
101052,211028,2005,2005-06-13,The Rose Bowl,1,0.5,England,Australia,GO Jones,ME Trescothick,...,0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101043,1237124,2020/21,2020-12-01,Newlands,2,17.1,England,South Africa,JC Buttler,DJ Malan,...,0,,,,,,,,,
101044,1237124,2020/21,2020-12-01,Newlands,2,17.2,England,South Africa,DJ Malan,JC Buttler,...,0,,,,,,,,,
101045,1237124,2020/21,2020-12-01,Newlands,2,17.3,England,South Africa,DJ Malan,JC Buttler,...,1,1.0,,,,,,,,
101046,1237124,2020/21,2020-12-01,Newlands,2,17.4,England,South Africa,DJ Malan,JC Buttler,...,0,,,,,,,,,


In [14]:
df=df.sort_values(['match_id', 'innings'], ascending=[True, True])

In [15]:
df.head(2)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
101048,211028,2005,2005-06-13,The Rose Bowl,1,0.1,England,Australia,ME Trescothick,GO Jones,...,0,,,,,,,,,
101049,211028,2005,2005-06-13,The Rose Bowl,1,0.2,England,Australia,ME Trescothick,GO Jones,...,0,,,,,,,,,


In [16]:
df['overs']=df['ball']

In [17]:
df['ball'] = df['ball'].astype(str)

In [18]:
df['ball'].str[0:-1]

101048     0.
101049     0.
101050     0.
101051     0.
101052     0.
         ... 
101043    17.
101044    17.
101045    17.
101046    17.
101047    17.
Name: ball, Length: 119582, dtype: object

In [19]:
df['over']=df['ball'].str[0:-1]

In [20]:
df['over'].str.replace('.', '')

  df['over'].str.replace('.', '')


101048     0
101049     0
101050     0
101051     0
101052     0
          ..
101043    17
101044    17
101045    17
101046    17
101047    17
Name: over, Length: 119582, dtype: object

In [21]:
df['over']=df['over'].str.replace('.', '')

  df['over']=df['over'].str.replace('.', '')


In [22]:
df['ball'].str[-1:]

101048    1
101049    2
101050    3
101051    4
101052    5
         ..
101043    1
101044    2
101045    3
101046    4
101047    5
Name: ball, Length: 119582, dtype: object

In [23]:
df['ball']=df['ball'].str[-1:]

In [24]:
df.head(5)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed,overs,over
101048,211028,2005,2005-06-13,The Rose Bowl,1,1,England,Australia,ME Trescothick,GO Jones,...,,,,,,,,,0.1,0
101049,211028,2005,2005-06-13,The Rose Bowl,1,2,England,Australia,ME Trescothick,GO Jones,...,,,,,,,,,0.2,0
101050,211028,2005,2005-06-13,The Rose Bowl,1,3,England,Australia,GO Jones,ME Trescothick,...,,,,,,,,,0.3,0
101051,211028,2005,2005-06-13,The Rose Bowl,1,4,England,Australia,GO Jones,ME Trescothick,...,,,,,,,,,0.4,0
101052,211028,2005,2005-06-13,The Rose Bowl,1,5,England,Australia,GO Jones,ME Trescothick,...,,,,,,,,,0.5,0


In [25]:
df['total_runs']=df['runs_off_bat']+df['extras']

In [26]:
df.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed,overs,over,total_runs
101048,211028,2005,2005-06-13,The Rose Bowl,1,1,England,Australia,ME Trescothick,GO Jones,...,,,,,,,,0.1,0,0
101049,211028,2005,2005-06-13,The Rose Bowl,1,2,England,Australia,ME Trescothick,GO Jones,...,,,,,,,,0.2,0,1
101050,211028,2005,2005-06-13,The Rose Bowl,1,3,England,Australia,GO Jones,ME Trescothick,...,,,,,,,,0.3,0,0
101051,211028,2005,2005-06-13,The Rose Bowl,1,4,England,Australia,GO Jones,ME Trescothick,...,,,,,,,,0.4,0,0
101052,211028,2005,2005-06-13,The Rose Bowl,1,5,England,Australia,GO Jones,ME Trescothick,...,,,,,,,,0.5,0,0


In [27]:
df=df[['match_id','innings','batting_team','bowling_team','overs','over','ball','total_runs','player_dismissed']]

In [28]:
df.head()

Unnamed: 0,match_id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed
101048,211028,1,England,Australia,0.1,0,1,0,
101049,211028,1,England,Australia,0.2,0,2,1,
101050,211028,1,England,Australia,0.3,0,3,0,
101051,211028,1,England,Australia,0.4,0,4,0,
101052,211028,1,England,Australia,0.5,0,5,0,


In [29]:
df=df.rename(columns={"match_id": "id"})

In [30]:
df = df.replace(np.nan, 0)

In [31]:
df.head()

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed
101048,211028,1,England,Australia,0.1,0,1,0,0
101049,211028,1,England,Australia,0.2,0,2,1,0
101050,211028,1,England,Australia,0.3,0,3,0,0
101051,211028,1,England,Australia,0.4,0,4,0,0
101052,211028,1,England,Australia,0.5,0,5,0,0


## Total runs scored in innings

In [32]:
df.groupby(['id','innings'])['total_runs'].transform('sum')

101048    179
101049    179
101050    179
101051    179
101052    179
         ... 
101043    192
101044    192
101045    192
101046    192
101047    192
Name: total_runs, Length: 119582, dtype: int64

In [33]:
df['total']=df.groupby(['id','innings'])['total_runs'].transform('sum')

## Runs Scored till current ball

In [34]:
df.groupby(['id', 'innings'])['total_runs'].apply(lambda x: x.cumsum())

101048      0
101049      1
101050      1
101051      1
101052      1
         ... 
101043    180
101044    186
101045    187
101046    191
101047    192
Name: total_runs, Length: 119582, dtype: int64

In [35]:
df['total_score']=df.groupby(['id', 'innings'])['total_runs'].apply(lambda x: x.cumsum())

In [36]:
df.head()

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score
101048,211028,1,England,Australia,0.1,0,1,0,0,179,0
101049,211028,1,England,Australia,0.2,0,2,1,0,179,1
101050,211028,1,England,Australia,0.3,0,3,0,0,179,1
101051,211028,1,England,Australia,0.4,0,4,0,0,179,1
101052,211028,1,England,Australia,0.5,0,5,0,0,179,1


## Runs scored in previous 30 balls

In [37]:
df.groupby(['id','innings'])['total_runs'].rolling(min_periods=1, window=30).sum().reset_index()

Unnamed: 0,id,innings,level_2,total_runs
0,211028,1,101048,0.0
1,211028,1,101049,1.0
2,211028,1,101050,1.0
3,211028,1,101051,1.0
4,211028,1,101052,1.0
...,...,...,...,...
119577,1237124,2,101043,52.0
119578,1237124,2,101044,58.0
119579,1237124,2,101045,58.0
119580,1237124,2,101046,61.0


In [38]:
tmp=df.groupby(['id','innings'])['total_runs'].rolling(min_periods=1, window=30).sum().reset_index()

In [39]:
tmp[['total_runs']]

Unnamed: 0,total_runs
0,0.0
1,1.0
2,1.0
3,1.0
4,1.0
...,...
119577,52.0
119578,58.0
119579,58.0
119580,61.0


In [40]:
df['prev_30_runs']=tmp['total_runs'].to_list()

In [41]:
df.head(30)

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs
101048,211028,1,England,Australia,0.1,0,1,0,0,179,0,0.0
101049,211028,1,England,Australia,0.2,0,2,1,0,179,1,1.0
101050,211028,1,England,Australia,0.3,0,3,0,0,179,1,1.0
101051,211028,1,England,Australia,0.4,0,4,0,0,179,1,1.0
101052,211028,1,England,Australia,0.5,0,5,0,0,179,1,1.0
101053,211028,1,England,Australia,0.6,0,6,1,0,179,2,2.0
101054,211028,1,England,Australia,0.7,0,7,2,0,179,4,4.0
101055,211028,1,England,Australia,1.1,1,1,0,0,179,4,4.0
101056,211028,1,England,Australia,1.2,1,2,0,0,179,4,4.0
101057,211028,1,England,Australia,1.3,1,3,1,0,179,5,5.0


## Wickets fallen in previous 30 balls

In [42]:
df['player_dismissed'].unique()

array([0, 'GO Jones', 'A Flintoff', 'KP Pietersen', 'MP Vaughan',
       'ME Trescothick', 'AJ Strauss', 'VS Solanki', 'PD Collingwood',
       'AC Gilchrist', 'ML Hayden', 'MJ Clarke', 'A Symonds',
       'MEK Hussey', 'RT Ponting', 'DR Martyn', 'JN Gillespie', 'B Lee',
       'GD McGrath', 'SM Katich', 'SP Fleming', 'MS Sinclair',
       'BB McCullum', 'CD McMillan', 'CL Cairns', 'HJH Marshall',
       'AR Adams', 'SB Styris', 'JW Wilson', 'KD Mills', 'GC Smith',
       'JH Kallis', 'HH Gibbs', 'JM Kemp', 'SM Pollock', 'MV Boucher',
       'AG Prince', 'CK Langeveldt', 'JA Morkel', 'M Ntini', 'NJ Astle',
       'IR Bell', 'JWM Dalrymple', 'CMW Read', 'Shoaib Malik',
       'Younis Khan', 'Shahid Afridi', 'Mohammad Yousuf',
       'Mohammad Hafeez', 'WU Tharanga', 'DPMD Jayawardene',
       'ST Jayasuriya', 'TM Dilshan', 'RP Arnold', 'KC Sangakkara',
       'MF Maharoof', 'CK Kapugedera', 'SL Malinga', 'CRD Fernando',
       'JR Hopes', 'HH Dippenaar', 'AJ Hall', 'J Botha', 'GJP Kruge

In [43]:
df['player_dismissed']=np.where(df['player_dismissed']==0, 0, 1)

In [44]:
tmp=df.groupby(['id','innings'])['player_dismissed'].rolling(min_periods=1, window=30).sum().reset_index()

df['prev_30_wickets']=tmp['player_dismissed'].to_list()

In [45]:
df.head(60)

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs,prev_30_wickets
101048,211028,1,England,Australia,0.1,0,1,0,0,179,0,0.0,0.0
101049,211028,1,England,Australia,0.2,0,2,1,0,179,1,1.0,0.0
101050,211028,1,England,Australia,0.3,0,3,0,0,179,1,1.0,0.0
101051,211028,1,England,Australia,0.4,0,4,0,0,179,1,1.0,0.0
101052,211028,1,England,Australia,0.5,0,5,0,0,179,1,1.0,0.0
101053,211028,1,England,Australia,0.6,0,6,1,0,179,2,2.0,0.0
101054,211028,1,England,Australia,0.7,0,7,2,0,179,4,4.0,0.0
101055,211028,1,England,Australia,1.1,1,1,0,0,179,4,4.0,0.0
101056,211028,1,England,Australia,1.2,1,2,0,0,179,4,4.0,0.0
101057,211028,1,England,Australia,1.3,1,3,1,0,179,5,5.0,0.0


# Wickets fallen till the current ball

In [46]:
df.groupby(['id', 'innings'])['player_dismissed'].apply(lambda x: x.cumsum())

101048    0
101049    0
101050    0
101051    0
101052    0
         ..
101043    1
101044    1
101045    1
101046    1
101047    1
Name: player_dismissed, Length: 119582, dtype: int32

In [47]:
df['total_wickets']=df.groupby(['id', 'innings'])['player_dismissed'].apply(lambda x: x.cumsum())

# Dot balls in previous 30 balls

In [48]:
df['prev_30_dot_balls']=df['total_runs']
df['prev_30_dot_balls']=np.where(df['prev_30_dot_balls']==0, 1, 0)

tmp=df.groupby(['id','innings'])['prev_30_dot_balls'].rolling(min_periods=1, window=30).sum().reset_index()
df['prev_30_dot_balls']=tmp['prev_30_dot_balls'].to_list()

# Boundaries in previous 30 balls

In [49]:
df['prev_30_boundaries']=df['total_runs']
df['prev_30_boundaries']=np.where(df['prev_30_boundaries']>3, 1, 0)

tmp=df.groupby(['id','innings'])['prev_30_boundaries'].rolling(min_periods=1, window=30).sum().reset_index()
df['prev_30_boundaries']=tmp['prev_30_boundaries'].to_list()

In [50]:
df.head(5)

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs,prev_30_wickets,total_wickets,prev_30_dot_balls,prev_30_boundaries
101048,211028,1,England,Australia,0.1,0,1,0,0,179,0,0.0,0.0,0,1.0,0.0
101049,211028,1,England,Australia,0.2,0,2,1,0,179,1,1.0,0.0,0,1.0,0.0
101050,211028,1,England,Australia,0.3,0,3,0,0,179,1,1.0,0.0,0,2.0,0.0
101051,211028,1,England,Australia,0.4,0,4,0,0,179,1,1.0,0.0,0,3.0,0.0
101052,211028,1,England,Australia,0.5,0,5,0,0,179,1,1.0,0.0,0,4.0,0.0


In [51]:
df.dtypes

id                      int64
innings                 int64
batting_team           object
bowling_team           object
overs                 float64
over                   object
ball                   object
total_runs              int64
player_dismissed        int32
total                   int64
total_score             int64
prev_30_runs          float64
prev_30_wickets       float64
total_wickets           int32
prev_30_dot_balls     float64
prev_30_boundaries    float64
dtype: object

In [52]:
convert_dict = {'prev_30_runs': int, 
                'prev_30_wickets': int,
                'prev_30_dot_balls':int,
                'prev_30_boundaries':int,
                'over' :int,
                'ball' : int
               } 
  
df = df.astype(convert_dict) 

In [53]:
df.dtypes

id                      int64
innings                 int64
batting_team           object
bowling_team           object
overs                 float64
over                    int32
ball                    int32
total_runs              int64
player_dismissed        int32
total                   int64
total_score             int64
prev_30_runs            int32
prev_30_wickets         int32
total_wickets           int32
prev_30_dot_balls       int32
prev_30_boundaries      int32
dtype: object

## Run rate till current ball

In [54]:
df['total_balls']=df['over']*6+df['ball']

In [55]:
df.head()

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs,prev_30_wickets,total_wickets,prev_30_dot_balls,prev_30_boundaries,total_balls
101048,211028,1,England,Australia,0.1,0,1,0,0,179,0,0,0,0,1,0,1
101049,211028,1,England,Australia,0.2,0,2,1,0,179,1,1,0,0,1,0,2
101050,211028,1,England,Australia,0.3,0,3,0,0,179,1,1,0,0,2,0,3
101051,211028,1,England,Australia,0.4,0,4,0,0,179,1,1,0,0,3,0,4
101052,211028,1,England,Australia,0.5,0,5,0,0,179,1,1,0,0,4,0,5


In [56]:
df['run_rate']=6*(df['total_score'])/df['total_balls']

## Run rate in previous 30 balls

In [57]:
df.groupby(['id','innings'])['run_rate'].rolling(min_periods=1, window=30).mean().reset_index()

Unnamed: 0,id,innings,level_2,run_rate
0,211028,1,101048,0.000000
1,211028,1,101049,1.500000
2,211028,1,101050,1.666667
3,211028,1,101051,1.625000
4,211028,1,101052,1.540000
...,...,...,...,...
119577,1237124,2,101043,10.453850
119578,1237124,2,101044,10.474700
119579,1237124,2,101045,10.495826
119580,1237124,2,101046,10.522870


In [58]:
tmp=df.groupby(['id','innings'])['run_rate'].rolling(min_periods=1, window=30).mean().reset_index()

In [59]:
tmp['run_rate']

0          0.000000
1          1.500000
2          1.666667
3          1.625000
4          1.540000
            ...    
119577    10.453850
119578    10.474700
119579    10.495826
119580    10.522870
119581    10.552634
Name: run_rate, Length: 119582, dtype: float64

In [60]:
df['prev_30_run_rate']=tmp['run_rate'].to_list()

In [61]:
df.head()

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs,prev_30_wickets,total_wickets,prev_30_dot_balls,prev_30_boundaries,total_balls,run_rate,prev_30_run_rate
101048,211028,1,England,Australia,0.1,0,1,0,0,179,0,0,0,0,1,0,1,0.0,0.0
101049,211028,1,England,Australia,0.2,0,2,1,0,179,1,1,0,0,1,0,2,3.0,1.5
101050,211028,1,England,Australia,0.3,0,3,0,0,179,1,1,0,0,2,0,3,2.0,1.666667
101051,211028,1,England,Australia,0.4,0,4,0,0,179,1,1,0,0,3,0,4,1.5,1.625
101052,211028,1,England,Australia,0.5,0,5,0,0,179,1,1,0,0,4,0,5,1.2,1.54


In [62]:
df.dtypes

id                      int64
innings                 int64
batting_team           object
bowling_team           object
overs                 float64
over                    int32
ball                    int32
total_runs              int64
player_dismissed        int32
total                   int64
total_score             int64
prev_30_runs            int32
prev_30_wickets         int32
total_wickets           int32
prev_30_dot_balls       int32
prev_30_boundaries      int32
total_balls             int32
run_rate              float64
prev_30_run_rate      float64
dtype: object

In [63]:
convert_dict = {'run_rate':int,
               'prev_30_run_rate':int,
               }
df = df.astype(convert_dict)

In [64]:
df.dtypes

id                      int64
innings                 int64
batting_team           object
bowling_team           object
overs                 float64
over                    int32
ball                    int32
total_runs              int64
player_dismissed        int32
total                   int64
total_score             int64
prev_30_runs            int32
prev_30_wickets         int32
total_wickets           int32
prev_30_dot_balls       int32
prev_30_boundaries      int32
total_balls             int32
run_rate                int32
prev_30_run_rate        int32
dtype: object

In [65]:
df.head()

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs,prev_30_wickets,total_wickets,prev_30_dot_balls,prev_30_boundaries,total_balls,run_rate,prev_30_run_rate
101048,211028,1,England,Australia,0.1,0,1,0,0,179,0,0,0,0,1,0,1,0,0
101049,211028,1,England,Australia,0.2,0,2,1,0,179,1,1,0,0,1,0,2,3,1
101050,211028,1,England,Australia,0.3,0,3,0,0,179,1,1,0,0,2,0,3,2,1
101051,211028,1,England,Australia,0.4,0,4,0,0,179,1,1,0,0,3,0,4,1,1
101052,211028,1,England,Australia,0.5,0,5,0,0,179,1,1,0,0,4,0,5,1,1


In [66]:
df.to_csv('AllT20I_data_with_imp_features', index=None)

In [67]:
df_20i = pd.read_csv('AllT20I_data_with_imp_features')

In [68]:
df_20i.head()

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,prev_30_runs,prev_30_wickets,total_wickets,prev_30_dot_balls,prev_30_boundaries,total_balls,run_rate,prev_30_run_rate
0,211028,1,England,Australia,0.1,0,1,0,0,179,0,0,0,0,1,0,1,0,0
1,211028,1,England,Australia,0.2,0,2,1,0,179,1,1,0,0,1,0,2,3,1
2,211028,1,England,Australia,0.3,0,3,0,0,179,1,1,0,0,2,0,3,2,1
3,211028,1,England,Australia,0.4,0,4,0,0,179,1,1,0,0,3,0,4,1,1
4,211028,1,England,Australia,0.5,0,5,0,0,179,1,1,0,0,4,0,5,1,1


In [69]:
df_20i=pd.get_dummies(data=df_20i, columns=['batting_team', 'bowling_team'])

In [70]:
df_20i.columns

Index(['id', 'innings', 'overs', 'over', 'ball', 'total_runs',
       'player_dismissed', 'total', 'total_score', 'prev_30_runs',
       'prev_30_wickets', 'total_wickets', 'prev_30_dot_balls',
       'prev_30_boundaries', 'total_balls', 'run_rate', 'prev_30_run_rate',
       'batting_team_Australia', 'batting_team_Bangladesh',
       'batting_team_England', 'batting_team_India',
       'batting_team_New Zealand', 'batting_team_Pakistan',
       'batting_team_South Africa', 'batting_team_Sri Lanka',
       'batting_team_West Indies', 'bowling_team_Australia',
       'bowling_team_Bangladesh', 'bowling_team_England', 'bowling_team_India',
       'bowling_team_New Zealand', 'bowling_team_Pakistan',
       'bowling_team_South Africa', 'bowling_team_Sri Lanka',
       'bowling_team_West Indies'],
      dtype='object')

In [71]:
df_20i=df_20i[['id','batting_team_Australia','batting_team_Bangladesh', 'batting_team_England', 'batting_team_India','batting_team_New Zealand','batting_team_Pakistan', 
       'batting_team_South Africa', 'batting_team_Sri Lanka','batting_team_West Indies','bowling_team_Australia', 'bowling_team_Bangladesh', 'bowling_team_England',
        'bowling_team_India', 'bowling_team_New Zealand', 'bowling_team_Pakistan','bowling_team_South Africa', 'bowling_team_Sri Lanka','bowling_team_West Indies',
            'overs','total_score','total_wickets','prev_30_runs','prev_30_wickets','prev_30_dot_balls','prev_30_boundaries','total_balls','prev_30_run_rate','total']]

In [72]:
df_20i.head()

Unnamed: 0,id,batting_team_Australia,batting_team_Bangladesh,batting_team_England,batting_team_India,batting_team_New Zealand,batting_team_Pakistan,batting_team_South Africa,batting_team_Sri Lanka,batting_team_West Indies,...,overs,total_score,total_wickets,prev_30_runs,prev_30_wickets,prev_30_dot_balls,prev_30_boundaries,total_balls,prev_30_run_rate,total
0,211028,0,0,1,0,0,0,0,0,0,...,0.1,0,0,0,0,1,0,1,0,179
1,211028,0,0,1,0,0,0,0,0,0,...,0.2,1,0,1,0,1,0,2,1,179
2,211028,0,0,1,0,0,0,0,0,0,...,0.3,1,0,1,0,2,0,3,1,179
3,211028,0,0,1,0,0,0,0,0,0,...,0.4,1,0,1,0,3,0,4,1,179
4,211028,0,0,1,0,0,0,0,0,0,...,0.5,1,0,1,0,4,0,5,1,179


## Train_Test_Split 

In [73]:
X = df_20i.drop(labels=['total','id'], axis=1)
y = df_20i['total'].values

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,random_state=42,stratify=y)

In [75]:
X_train=X_train.values
X_test=X_test.values
X_train=np.asarray(X_train).astype(np.float32)
X_test=np.asarray(X_test).astype(np.float32)

In [76]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(95665, 27) (23917, 27) (95665,) (23917,)


# Linear Regression

In [77]:
LR = LinearRegression()
LR.fit(X_train,y_train)

LinearRegression()

In [78]:
filename = 'score_lr-model.pkl'
pickle.dump(LR, open(filename, 'wb'))

In [79]:
# Prediction on X_test 
pred=LR.predict(X_test)
mean_absolute_error (y_test, pred)

18.369715372615023

In [80]:
# Training Accuracy
pred_train = LR.predict(X_train)
accuracy_lr = r2_score(y_train,pred_train)
accuracy_lr

0.4396091164661825

In [81]:
# Testing Accuracy
pred_test = LR.predict(X_test)
accuracy_lm_test = r2_score(y_test,pred_test)
accuracy_lm_test

0.43248298574145294

In [82]:
def score_prediction(Bat_team,Bowl_team,overs,total_score,total_wickets,run_rate,prev_runs_30,prev_wickets_30,prev_30_dot_balls,prev_30_boundaries,prev_30_run_rate):
     
    temp_array = list()
    
    if Bat_team == 'AUS':
        temp_array = temp_array + [1,0,0,0,0,0,0,0,0]
    elif Bat_team == 'BAN':
        temp_array = temp_array + [0,1,0,0,0,0,0,0,0]
    elif Bat_team == 'ENG':
        temp_array = temp_array + [0,0,1,0,0,0,0,0,0]
    elif Bat_team == 'IND':
        temp_array = temp_array + [0,0,0,1,0,0,0,0,0]
    elif Bat_team == 'NZ':
        temp_array = temp_array + [0,0,0,0,1,0,0,0,0]
    elif Bat_team == 'PAK':
        temp_array = temp_array + [0,0,0,0,0,1,0,0,0]
    elif Bat_team == 'SA':
        temp_array = temp_array + [0,0,0,0,0,0,1,0,0]
    elif Bat_team == 'SL':
        temp_array = temp_array + [0,0,0,0,0,0,0,1,0]
    elif Bat_team == 'WI':
        temp_array = temp_array + [0,0,0,0,0,0,0,0,1]
        
    if Bowl_team == 'AUS':
        temp_array = temp_array + [1,0,0,0,0,0,0,0,0]
    elif Bowl_team == 'BAN':
        temp_array = temp_array + [0,1,0,0,0,0,0,0,0]
    elif Bowl_team == 'ENG':
        temp_array = temp_array + [0,0,1,0,0,0,0,0,0]
    elif Bowl_team == 'IND':
        temp_array = temp_array + [0,0,0,1,0,0,0,0,0]
    elif Bowl_team == 'NZ':
        temp_array = temp_array + [0,0,0,0,1,0,0,0,0]
    elif Bowl_team == 'PAK':
        temp_array = temp_array + [0,0,0,0,0,1,0,0,0]
    elif Bowl_team == 'SA':
        temp_array = temp_array + [0,0,0,0,0,0,1,0,0]
    elif Bowl_team == 'SL':
        temp_array = temp_array + [0,0,0,0,0,0,0,1,0]
    elif Bowl_team == 'WI':
        temp_array = temp_array + [0,0,0,0,0,0,0,0,1]
        
    temp_array = temp_array + [overs,total_score,total_wickets,run_rate,prev_runs_30,prev_wickets_30,prev_30_dot_balls,prev_30_boundaries,prev_30_run_rate]
    data = np.array([temp_array])
    my_prediction = int(LR.predict(data))
    
    print('Score Prediction with Linear Regression')
    print('Predicted score: ',my_prediction)
    print('Predicted score range: ',my_prediction -18, 'to' ,my_prediction + 18)


In [83]:
Bat_team = 'IND' 

Bowl_team = 'PAK' 

overs = 8.3

total_score = 99 # current score

total_wickets = 2 # current wicket 

run_rate = 12

prev_runs_30 = 70 # runs in prev 30 balls

prev_wickets_30 = 1 # wickets in prev 30 balls

prev_30_dot_balls = 3 # dots in prev 30 balls

prev_30_boundaries = 11 # boundaries in prev 30 balls

prev_30_run_rate = 11


score_prediction(Bat_team,Bowl_team,overs,total_score,total_wickets,run_rate,prev_runs_30,prev_wickets_30,prev_30_dot_balls,prev_30_boundaries,prev_30_run_rate)

Score Prediction with Linear Regression
Predicted score:  136
Predicted score range:  118 to 154


# Trying with another Regressor Models

# Decision Tree

In [84]:
from sklearn.tree import DecisionTreeRegressor
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, y_train)

DecisionTreeRegressor()

In [85]:
pred_train = dt_model.predict(X_train)
accuracy_dt = r2_score(y_train,pred_train)
accuracy_dt

0.9829371038839297

In [86]:
pred_test = dt_model.predict(X_test)
accuracy_dt_test = r2_score(y_test,pred_test)
accuracy_dt_test

0.714949766905262

In [87]:
def dt_score_prediction(Bat_team,Bowl_team,overs,total_score,total_wickets,run_rate,prev_runs_30,prev_wickets_30,prev_30_dot_balls,prev_30_boundaries,prev_30_run_rate):
     
    temp_array = list()
    
    if Bat_team == 'AUS':
        temp_array = temp_array + [1,0,0,0,0,0,0,0,0]
    elif Bat_team == 'BAN':
        temp_array = temp_array + [0,1,0,0,0,0,0,0,0]
    elif Bat_team == 'ENG':
        temp_array = temp_array + [0,0,1,0,0,0,0,0,0]
    elif Bat_team == 'IND':
        temp_array = temp_array + [0,0,0,1,0,0,0,0,0]
    elif Bat_team == 'NZ':
        temp_array = temp_array + [0,0,0,0,1,0,0,0,0]
    elif Bat_team == 'PAK':
        temp_array = temp_array + [0,0,0,0,0,1,0,0,0]
    elif Bat_team == 'SA':
        temp_array = temp_array + [0,0,0,0,0,0,1,0,0]
    elif Bat_team == 'SL':
        temp_array = temp_array + [0,0,0,0,0,0,0,1,0]
    elif Bat_team == 'WI':
        temp_array = temp_array + [0,0,0,0,0,0,0,0,1]
        
    if Bowl_team == 'AUS':
        temp_array = temp_array + [1,0,0,0,0,0,0,0,0]
    elif Bowl_team == 'BAN':
        temp_array = temp_array + [0,1,0,0,0,0,0,0,0]
    elif Bowl_team == 'ENG':
        temp_array = temp_array + [0,0,1,0,0,0,0,0,0]
    elif Bowl_team == 'IND':
        temp_array = temp_array + [0,0,0,1,0,0,0,0,0]
    elif Bowl_team == 'NZ':
        temp_array = temp_array + [0,0,0,0,1,0,0,0,0]
    elif Bowl_team == 'PAK':
        temp_array = temp_array + [0,0,0,0,0,1,0,0,0]
    elif Bowl_team == 'SA':
        temp_array = temp_array + [0,0,0,0,0,0,1,0,0]
    elif Bowl_team == 'SL':
        temp_array = temp_array + [0,0,0,0,0,0,0,1,0]
    elif Bowl_team == 'WI':
        temp_array = temp_array + [0,0,0,0,0,0,0,0,1]
        
    temp_array = temp_array + [overs,total_score,total_wickets,run_rate,prev_runs_30,prev_wickets_30,prev_30_dot_balls,prev_30_boundaries,prev_30_run_rate]
    data = np.array([temp_array])
    my_prediction = int(dt_model.predict(data))
    
    print('Score Prediction with Decision Tree')
    print('Predicted score : ',my_prediction)
    print('Predicted score range: ',my_prediction -18, 'to' ,my_prediction + 18)


In [88]:
Bat_team = 'IND' 

Bowl_team = 'PAK' 

overs = 8.3

total_score = 99 # current score

total_wickets = 2 # current wicket 

run_rate = 12

prev_runs_30 = 70 # runs in prev 30 balls

prev_wickets_30 = 1 # wickets in prev 30 balls

prev_30_dot_balls = 3 # dots in prev 30 balls

prev_30_boundaries = 11 # boundaries in prev 30 balls

prev_30_run_rate = 11


dt_score_prediction(Bat_team,Bowl_team,overs,total_score,total_wickets,run_rate,prev_runs_30,prev_wickets_30,prev_30_dot_balls,prev_30_boundaries,prev_30_run_rate)

Score Prediction with Decision Tree
Predicted score :  186
Predicted score range:  168 to 204


# Random Forest

In [89]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X_train,y_train)

RandomForestRegressor()

In [90]:
pred_train = rf.predict(X_train)
accuracy_rf = r2_score(y_train,pred_train)
accuracy_rf

0.9659944226921713

In [91]:
pred_test = rf.predict(X_test)
accuracy_rf_test = r2_score(y_test,pred_test)
accuracy_rf_test

0.8453983403711608

In [92]:
def rf_score_prediction(Bat_team,Bowl_team,overs,total_score,total_wickets,run_rate,prev_runs_30,prev_wickets_30,prev_30_dot_balls,prev_30_boundaries,prev_30_run_rate):
     
    temp_array = list()
    
    if Bat_team == 'AUS':
        temp_array = temp_array + [1,0,0,0,0,0,0,0,0]
    elif Bat_team == 'BAN':
        temp_array = temp_array + [0,1,0,0,0,0,0,0,0]
    elif Bat_team == 'ENG':
        temp_array = temp_array + [0,0,1,0,0,0,0,0,0]
    elif Bat_team == 'IND':
        temp_array = temp_array + [0,0,0,1,0,0,0,0,0]
    elif Bat_team == 'NZ':
        temp_array = temp_array + [0,0,0,0,1,0,0,0,0]
    elif Bat_team == 'PAK':
        temp_array = temp_array + [0,0,0,0,0,1,0,0,0]
    elif Bat_team == 'SA':
        temp_array = temp_array + [0,0,0,0,0,0,1,0,0]
    elif Bat_team == 'SL':
        temp_array = temp_array + [0,0,0,0,0,0,0,1,0]
    elif Bat_team == 'WI':
        temp_array = temp_array + [0,0,0,0,0,0,0,0,1]
        
    if Bowl_team == 'AUS':
        temp_array = temp_array + [1,0,0,0,0,0,0,0,0]
    elif Bowl_team == 'BAN':
        temp_array = temp_array + [0,1,0,0,0,0,0,0,0]
    elif Bowl_team == 'ENG':
        temp_array = temp_array + [0,0,1,0,0,0,0,0,0]
    elif Bowl_team == 'IND':
        temp_array = temp_array + [0,0,0,1,0,0,0,0,0]
    elif Bowl_team == 'NZ':
        temp_array = temp_array + [0,0,0,0,1,0,0,0,0]
    elif Bowl_team == 'PAK':
        temp_array = temp_array + [0,0,0,0,0,1,0,0,0]
    elif Bowl_team == 'SA':
        temp_array = temp_array + [0,0,0,0,0,0,1,0,0]
    elif Bowl_team == 'SL':
        temp_array = temp_array + [0,0,0,0,0,0,0,1,0]
    elif Bowl_team == 'WI':
        temp_array = temp_array + [0,0,0,0,0,0,0,0,1]
        
    temp_array = temp_array + [overs,total_score,total_wickets,run_rate,prev_runs_30,prev_wickets_30,prev_30_dot_balls,prev_30_boundaries,prev_30_run_rate]
    data = np.array([temp_array])
    my_prediction = int(rf.predict(data))
    
    print('Score Prediction with Random Forest')
    print('Predicted score: ',my_prediction)
    print('Predicted score range: ',my_prediction -18, 'to' ,my_prediction + 18)


In [93]:
Bat_team = 'IND' 

Bowl_team = 'PAK' 

overs = 8.3

total_score = 99 # current score

total_wickets = 2 # current wicket 

run_rate = 12

prev_runs_30 = 70 # runs in prev 30 balls

prev_wickets_30 = 1 # wickets in prev 30 balls

prev_30_dot_balls = 3 # dots in prev 30 balls

prev_30_boundaries = 11 # boundaries in prev 30 balls

prev_30_run_rate = 11


rf_score_prediction(Bat_team,Bowl_team,overs,total_score,total_wickets,run_rate,prev_runs_30,prev_wickets_30,prev_30_dot_balls,prev_30_boundaries,prev_30_run_rate)

Score Prediction with Random Forest
Predicted score:  170
Predicted score range:  152 to 188


# XGBoost Regressor

In [94]:
#!pip install xgboost

In [95]:
from xgboost import XGBRegressor

In [96]:
xgb = XGBRegressor()
xgb.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.300000012, max_bin=256,
             max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
             max_depth=6, max_leaves=0, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [97]:
pred_train = xgb.predict(X_train)
accuracy_rf = r2_score(y_train,pred_train)
accuracy_rf

0.6692801355694464

In [98]:
pred_test = xgb.predict(X_test)
accuracy_rf_test = r2_score(y_test,pred_test)
accuracy_rf_test

0.6275373818038108

In [99]:
def xg_score_prediction(Bat_team,Bowl_team,overs,total_score,total_wickets,run_rate,prev_runs_30,prev_wickets_30,prev_30_dot_balls,prev_30_boundaries,prev_30_run_rate):
     
    temp_array = list()
    
    if Bat_team == 'AUS':
        temp_array = temp_array + [1,0,0,0,0,0,0,0,0]
    elif Bat_team == 'BAN':
        temp_array = temp_array + [0,1,0,0,0,0,0,0,0]
    elif Bat_team == 'ENG':
        temp_array = temp_array + [0,0,1,0,0,0,0,0,0]
    elif Bat_team == 'IND':
        temp_array = temp_array + [0,0,0,1,0,0,0,0,0]
    elif Bat_team == 'NZ':
        temp_array = temp_array + [0,0,0,0,1,0,0,0,0]
    elif Bat_team == 'PAK':
        temp_array = temp_array + [0,0,0,0,0,1,0,0,0]
    elif Bat_team == 'SA':
        temp_array = temp_array + [0,0,0,0,0,0,1,0,0]
    elif Bat_team == 'SL':
        temp_array = temp_array + [0,0,0,0,0,0,0,1,0]
    elif Bat_team == 'WI':
        temp_array = temp_array + [0,0,0,0,0,0,0,0,1]
        
    if Bowl_team == 'AUS':
        temp_array = temp_array + [1,0,0,0,0,0,0,0,0]
    elif Bowl_team == 'BAN':
        temp_array = temp_array + [0,1,0,0,0,0,0,0,0]
    elif Bowl_team == 'ENG':
        temp_array = temp_array + [0,0,1,0,0,0,0,0,0]
    elif Bowl_team == 'IND':
        temp_array = temp_array + [0,0,0,1,0,0,0,0,0]
    elif Bowl_team == 'NZ':
        temp_array = temp_array + [0,0,0,0,1,0,0,0,0]
    elif Bowl_team == 'PAK':
        temp_array = temp_array + [0,0,0,0,0,1,0,0,0]
    elif Bowl_team == 'SA':
        temp_array = temp_array + [0,0,0,0,0,0,1,0,0]
    elif Bowl_team == 'SL':
        temp_array = temp_array + [0,0,0,0,0,0,0,1,0]
    elif Bowl_team == 'WI':
        temp_array = temp_array + [0,0,0,0,0,0,0,0,1]
        
    temp_array = temp_array + [overs,total_score,total_wickets,run_rate,prev_runs_30,prev_wickets_30,prev_30_dot_balls,prev_30_boundaries,prev_30_run_rate]
    data = np.array([temp_array])
    my_prediction = int(xgb.predict(data))
    
    print('Score Prediction with XGBoost')
    print('Predicted score: ',my_prediction)
    print('Predicted score range: ',my_prediction -18, 'to' ,my_prediction + 18)


In [100]:
Bat_team = 'IND' 

Bowl_team = 'PAK' 

overs = 8.3

total_score = 99 # current score

total_wickets = 2 # current wicket 

run_rate = 12

prev_runs_30 = 70 # runs in prev 30 balls

prev_wickets_30 = 1 # wickets in prev 30 balls

prev_30_dot_balls = 3 # dots in prev 30 balls

prev_30_boundaries = 11 # boundaries in prev 30 balls

prev_30_run_rate = 11


xg_score_prediction(Bat_team,Bowl_team,overs,total_score,total_wickets,run_rate,prev_runs_30,prev_wickets_30,prev_30_dot_balls,prev_30_boundaries,prev_30_run_rate)

Score Prediction with XGBoost
Predicted score:  181
Predicted score range:  163 to 199
