## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Render floats with six decimal places in pandas output; missing values are shown as "NaN".
pd.set_option('display.float_format', lambda x: "{:.6f}".format(x) if not pd.isna(x) else "NaN")

## Importing the dataset over here

In [2]:
# Load the batting table; the relative path means the CSV must sit in the working directory.
data=pd.read_csv("t20_batting.csv")

In [3]:
# Preview the first rows of the freshly loaded table.
data.head()

Unnamed: 0,id,span,matches,innings,not_out,runs,high_score,average_score,ball_faced,strike_rate,100s,50,0s,4s,6s
0,420402,2013-2022,2,2.0,0.0,33.0,24,16.5,45.0,73.33,0.0,0.0,0.0,1.0,1.0
1,921509,2019-2022,16,16.0,3.0,381.0,64*,29.3,361.0,105.54,0.0,1.0,0.0,30.0,10.0
2,568136,2016-2022,22,3.0,3.0,27.0,24*,,24.0,112.5,0.0,0.0,0.0,4.0,0.0
3,352048,2012-2022,55,45.0,13.0,615.0,56*,19.21,509.0,120.82,0.0,1.0,3.0,45.0,27.0
4,440970,2013-2022,6,4.0,2.0,48.0,36,24.0,55.0,87.27,0.0,0.0,0.0,5.0,0.0


In [4]:
# Size of the raw table as (rows, columns).
data.shape

(327, 15)

In this use case, the goal is to predict the `runs` column.

## Taking care of duplicate observations if present over here

In [5]:
# Remove fully duplicated rows; `data` itself is modified (inplace=True), nothing is returned.
data.drop_duplicates(inplace=True)

## Taking care of missing values if present over here

In [6]:
# Number of missing entries in each column.
data.isna().sum()

id                0
span             17
matches           0
innings          10
not_out          10
runs             10
high_score       10
average_score    26
ball_faced       10
strike_rate      10
100s             10
50               10
0s               10
4s               10
6s               10
dtype: int64

In [7]:
# Collect every column that contains at least one missing value.
# The threshold is > 0 (not > 1) so a column with a single null is still reported.
# Null counts are computed once rather than once per column.
null_counts=data.isnull().sum()
missing_values=[feature for feature in data.columns if null_counts[feature]>0]
for feature in missing_values:
  print(feature)

span
innings
not_out
runs
high_score
average_score
ball_faced
strike_rate
100s
50
0s
4s
6s


In [8]:
# Show only the columns that contain missing values.
data.loc[:, missing_values]

Unnamed: 0,span,innings,not_out,runs,high_score,average_score,ball_faced,strike_rate,100s,50,0s,4s,6s
0,2013-2022,2.000000,0.000000,33.000000,24,16.500000,45.000000,73.330000,0.000000,0.000000,0.000000,1.000000,1.000000
1,2019-2022,16.000000,3.000000,381.000000,64*,29.300000,361.000000,105.540000,0.000000,1.000000,0.000000,30.000000,10.000000
2,2016-2022,3.000000,3.000000,27.000000,24*,,24.000000,112.500000,0.000000,0.000000,0.000000,4.000000,0.000000
3,2012-2022,45.000000,13.000000,615.000000,56*,19.210000,509.000000,120.820000,0.000000,1.000000,3.000000,45.000000,27.000000
4,2013-2022,4.000000,2.000000,48.000000,36,24.000000,55.000000,87.270000,0.000000,0.000000,0.000000,5.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
322,2020-2022,9.000000,4.000000,198.000000,44*,39.600000,133.000000,148.870000,0.000000,0.000000,0.000000,7.000000,17.000000
323,2021-2021,2.000000,1.000000,3.000000,3,3.000000,9.000000,33.330000,0.000000,0.000000,0.000000,0.000000,0.000000
324,2018-2022,17.000000,5.000000,162.000000,27*,13.500000,116.000000,139.650000,0.000000,0.000000,2.000000,12.000000,10.000000
325,2009-2022,8.000000,2.000000,51.000000,31*,8.500000,54.000000,94.440000,0.000000,0.000000,0.000000,4.000000,2.000000


## Filtering only the numerical values over here

In [9]:
# Any column whose dtype is not object ('O') is treated as numeric here.
numeric_features = [name for name, kind in data.dtypes.items() if kind != 'O']
for name in numeric_features:
  print(name)

id
matches
innings
not_out
runs
average_score
ball_faced
strike_rate
100s
50
0s
4s
6s


In [10]:
# Show only the non-object columns.
data.loc[:, numeric_features]

Unnamed: 0,id,matches,innings,not_out,runs,average_score,ball_faced,strike_rate,100s,50,0s,4s,6s
0,420402,2,2.000000,0.000000,33.000000,16.500000,45.000000,73.330000,0.000000,0.000000,0.000000,1.000000,1.000000
1,921509,16,16.000000,3.000000,381.000000,29.300000,361.000000,105.540000,0.000000,1.000000,0.000000,30.000000,10.000000
2,568136,22,3.000000,3.000000,27.000000,,24.000000,112.500000,0.000000,0.000000,0.000000,4.000000,0.000000
3,352048,55,45.000000,13.000000,615.000000,19.210000,509.000000,120.820000,0.000000,1.000000,3.000000,45.000000,27.000000
4,440970,6,4.000000,2.000000,48.000000,24.000000,55.000000,87.270000,0.000000,0.000000,0.000000,5.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
322,677077,23,9.000000,4.000000,198.000000,39.600000,133.000000,148.870000,0.000000,0.000000,0.000000,7.000000,17.000000
323,1206110,6,2.000000,1.000000,3.000000,3.000000,9.000000,33.330000,0.000000,0.000000,0.000000,0.000000,0.000000
324,820691,24,17.000000,5.000000,162.000000,13.500000,116.000000,139.650000,0.000000,0.000000,2.000000,12.000000,10.000000
325,248920,12,8.000000,2.000000,51.000000,8.500000,54.000000,94.440000,0.000000,0.000000,0.000000,4.000000,2.000000


## Filtering only the categorical features over here

In [11]:
# Object-dtype ('O') columns are treated as categorical.
cat_features = [name for name, kind in data.dtypes.items() if kind == 'O']
for name in cat_features:
  print(name)

span
high_score


In [12]:
# Show only the object-dtype columns.
data.loc[:, cat_features]

Unnamed: 0,span,high_score
0,2013-2022,24
1,2019-2022,64*
2,2016-2022,24*
3,2012-2022,56*
4,2013-2022,36
...,...,...
322,2020-2022,44*
323,2021-2021,3
324,2018-2022,27*
325,2009-2022,31*


## Observation: `high_score` is not really a categorical variable (it is a number, sometimes followed by `*`), so we need to preprocess it

In [13]:
# Some high scores carry a trailing "*" (e.g. "64*", presumably the not-out marker);
# remove it so the column can be parsed as a number in the next step.
# regex=False makes the literal match explicit: a bare "*" is not a valid regular
# expression, and the default of `regex` differs between pandas versions.
data['high_score']=data['high_score'].str.replace("*","",regex=False)

In [14]:
# Convert the cleaned strings to numbers; an unparseable value raises instead of passing silently.
data['high_score'] = pd.to_numeric(data['high_score'], errors='raise')

In [15]:
# Recompute the object-dtype columns now that high_score has been converted to numbers.
cat_features = [name for name, kind in data.dtypes.items() if kind == 'O']
for name in cat_features:
  print(name)

span


In [16]:
# Show the remaining object-dtype columns.
data.loc[:, cat_features]

Unnamed: 0,span
0,2013-2022
1,2019-2022
2,2016-2022
3,2012-2022
4,2013-2022
...,...
322,2020-2022
323,2021-2021
324,2018-2022
325,2009-2022


## Dropping all rows that contain missing values

In [17]:
# Drop every row that has a missing value in any column; `data` is modified in place.
data.dropna(inplace=True)

In [18]:
# Remove the id column from the table (in place).
data.drop(columns="id", inplace=True)

## The `id` column is only an identifier and carries no predictive information, so we drop it

In [19]:
# Frequency of each distinct span string among the remaining rows.
data['span'].value_counts()

span
2016-2022    20
2019-2022    20
2018-2022    17
2015-2022    17
2017-2022    16
2022-2022    16
2014-2022    14
2021-2022    12
2021-2023    11
2011-2022    11
2019-2023    10
2015-2023     9
2012-2022     9
2009-2022     8
2010-2022     8
2020-2022     8
2021-2021     6
2013-2022     6
2017-2023     5
2019-2021     5
2018-2021     4
2016-2023     4
2008-2022     3
2019-2019     3
2017-2021     3
2020-2023     3
2006-2022     3
2010-2023     3
2006-2020     2
2012-2021     2
2022-2023     2
2017-2017     2
2023-2023     2
2011-2021     2
2014-2023     2
2007-2022     2
2016-2021     2
2012-2016     1
2020-2020     1
2010-2021     1
2012-2019     1
2006-2023     1
2007-2020     1
2018-2018     1
2013-2019     1
2007-2010     1
2020-2021     1
2016-2016     1
2007-2009     1
2006-2014     1
2009-2021     1
2011-2019     1
2014-2017     1
2018-2023     1
2012-2014     1
2011-2012     1
Name: count, dtype: int64

In [20]:
# Span strings in the order they should be numbered; this order mirrors the
# data['span'].value_counts() listing (most frequent span first).
span_labels = [
    '2016-2022', '2019-2022', '2018-2022', '2015-2022', '2017-2022', '2022-2022', '2014-2022', '2021-2022',
    '2021-2023', '2011-2022', '2019-2023', '2015-2023', '2012-2022', '2009-2022', '2010-2022', '2020-2022',
    '2021-2021', '2013-2022', '2017-2023', '2019-2021', '2018-2021', '2016-2023', '2008-2022', '2019-2019',
    '2017-2021', '2020-2023', '2006-2022', '2010-2023', '2006-2020', '2012-2021', '2022-2023', '2017-2017',
    '2023-2023', '2011-2021', '2014-2023', '2007-2022', '2016-2021', '2012-2016', '2020-2020', '2010-2021',
    '2012-2019', '2006-2023', '2007-2020', '2018-2018', '2013-2019', '2007-2010', '2020-2021', '2016-2016',
    '2007-2009', '2006-2014', '2009-2021', '2011-2019', '2014-2017', '2018-2023', '2012-2014', '2011-2012',
]

# Each span is assigned its position in the list as an integer code (0, 1, 2, ...).
span_mapping = {label: code for code, label in enumerate(span_labels)}
print(span_mapping)

{'2016-2022': 0, '2019-2022': 1, '2018-2022': 2, '2015-2022': 3, '2017-2022': 4, '2022-2022': 5, '2014-2022': 6, '2021-2022': 7, '2021-2023': 8, '2011-2022': 9, '2019-2023': 10, '2015-2023': 11, '2012-2022': 12, '2009-2022': 13, '2010-2022': 14, '2020-2022': 15, '2021-2021': 16, '2013-2022': 17, '2017-2023': 18, '2019-2021': 19, '2018-2021': 20, '2016-2023': 21, '2008-2022': 22, '2019-2019': 23, '2017-2021': 24, '2020-2023': 25, '2006-2022': 26, '2010-2023': 27, '2006-2020': 28, '2012-2021': 29, '2022-2023': 30, '2017-2017': 31, '2023-2023': 32, '2011-2021': 33, '2014-2023': 34, '2007-2022': 35, '2016-2021': 36, '2012-2016': 37, '2020-2020': 38, '2010-2021': 39, '2012-2019': 40, '2006-2023': 41, '2007-2020': 42, '2018-2018': 43, '2013-2019': 44, '2007-2010': 45, '2020-2021': 46, '2016-2016': 47, '2007-2009': 48, '2006-2014': 49, '2009-2021': 50, '2011-2019': 51, '2014-2017': 52, '2018-2023': 53, '2012-2014': 54, '2011-2012': 55}


In [21]:
# Encode the spans into a temporary Series first so the result can be checked before
# the column is overwritten: Series.map() silently produces NaN for any value that is
# not a key of span_mapping (which also happens if this cell is run a second time on
# already-encoded data).
encoded_span=data['span'].map(span_mapping)
# Values that were present before mapping but have no code afterwards.
unmapped_spans=data.loc[encoded_span.isna() & data['span'].notna(),'span'].unique()
if len(unmapped_spans)>0:
    raise ValueError(f"span values missing from span_mapping: {list(unmapped_spans)}")
data['span']=encoded_span

## Creating the features and labels over here

In [22]:
# Columns that remain after cleaning and encoding.
data.columns

Index(['span', 'matches', 'innings', 'not_out', 'runs', 'high_score',
       'average_score', 'ball_faced', 'strike_rate', '100s', '50', '0s', '4s',
       '6s'],
      dtype='object')

In [23]:
# Feature matrix: every column except the target. Target vector: the 'runs' column.
# NOTE(review): in the rows shown, strike_rate equals 100 * runs / ball_faced, so the
# features appear to determine the target almost exactly - confirm this is intended.
feature_frame = data.drop(columns=['runs'])
X = feature_frame.to_numpy()
y = data['runs'].to_numpy()

In [24]:
# Inspect the feature matrix built in the previous cell.
X

array([[17.,  2.,  2., ...,  0.,  1.,  1.],
       [ 1., 16., 16., ...,  0., 30., 10.],
       [12., 55., 45., ...,  3., 45., 27.],
       ...,
       [ 2., 24., 17., ...,  2., 12., 10.],
       [13., 12.,  8., ...,  0.,  4.,  2.],
       [ 1., 38., 17., ...,  2.,  9.,  5.]])

## Splitting the dataset into training set and testing set over here

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

## Training the model on the training set over here

In [26]:
from sklearn.ensemble import RandomForestRegressor
# Seed the forest: its bootstrap sampling and feature sampling are random, so without
# random_state every run trains a different model and reports different metrics.
# The seed matches the one used for train_test_split.
regressor=RandomForestRegressor(random_state=0)
regressor.fit(X_train,y_train)

## Predicting the results on the testing dataset over here

In [27]:
y_pred = regressor.predict(X_test)
# Print arrays with two digits of precision (this changes NumPy's global print settings).
np.set_printoptions(precision=2)
# Two columns side by side: actual runs (left) and predicted runs (right).
print(np.column_stack((y_test, y_pred)))

[[2.16e+02 2.21e+02]
 [8.90e+02 8.63e+02]
 [2.24e+03 2.17e+03]
 [2.15e+03 1.92e+03]
 [2.00e+00 5.15e+00]
 [3.85e+02 3.99e+02]
 [6.82e+02 6.61e+02]
 [2.20e+02 2.25e+02]
 [1.04e+03 9.13e+02]
 [8.15e+02 1.01e+03]
 [1.00e+00 4.15e+00]
 [3.28e+02 3.09e+02]
 [1.39e+02 1.35e+02]
 [3.85e+02 3.81e+02]
 [8.00e+00 1.10e+01]
 [1.80e+01 1.76e+01]
 [1.04e+03 1.05e+03]
 [1.13e+02 1.16e+02]
 [1.20e+01 1.45e+01]
 [2.70e+01 3.35e+01]
 [5.30e+02 5.77e+02]
 [2.91e+02 3.10e+02]
 [1.37e+03 1.37e+03]
 [1.52e+03 1.43e+03]
 [2.89e+03 3.00e+03]
 [7.97e+02 8.16e+02]
 [0.00e+00 2.51e+00]
 [2.41e+02 2.40e+02]
 [2.90e+01 3.08e+01]
 [2.41e+02 2.34e+02]
 [3.95e+02 4.01e+02]
 [1.00e+01 1.01e+01]
 [1.49e+02 1.77e+02]
 [9.40e+01 9.62e+01]
 [2.64e+03 2.64e+03]
 [4.30e+01 4.08e+01]
 [1.07e+02 1.25e+02]
 [1.33e+03 1.32e+03]
 [1.48e+03 1.40e+03]
 [8.03e+02 6.84e+02]
 [9.77e+02 1.04e+03]
 [3.62e+02 3.89e+02]
 [1.91e+03 1.87e+03]
 [3.68e+02 3.58e+02]
 [8.26e+02 8.48e+02]
 [5.00e+00 6.41e+00]
 [6.35e+02 6.70e+02]
 [8.70e+01 7.

In [28]:
from sklearn.metrics import r2_score

# Coefficient of determination on the held-out rows.
r2_score(y_true=y_test, y_pred=y_pred)

0.9927224874912746

In [30]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the held-out rows, and its square root (same unit as runs).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

61.30989376015388

In [32]:
# Root mean squared error computed from mse.
rmse

61.30989376015388

In [31]:
# Mean squared error on the test set.
mse

3758.9030728813555

In [33]:
# Put the true and predicted runs in one table, one row per test observation.
comparison_columns = {'Actual': y_test, 'Predicted': y_pred}
actual_vs_predicted = pd.DataFrame(comparison_columns)
print(actual_vs_predicted)

        Actual   Predicted
0   216.000000  220.870000
1   890.000000  862.790000
2  2243.000000 2174.420000
3  2147.000000 1918.500000
4     2.000000    5.150000
5   385.000000  398.830000
6   682.000000  661.000000
7   220.000000  225.250000
8  1044.000000  912.930000
9   815.000000 1005.670000
10    1.000000    4.150000
11  328.000000  308.630000
12  139.000000  135.350000
13  385.000000  381.310000
14    8.000000   11.030000
15   18.000000   17.620000
16 1044.000000 1045.460000
17  113.000000  116.390000
18   12.000000   14.470000
19   27.000000   33.510000
20  530.000000  576.550000
21  291.000000  310.220000
22 1369.000000 1373.300000
23 1522.000000 1428.090000
24 2894.000000 3001.040000
25  797.000000  816.300000
26    0.000000    2.510000
27  241.000000  240.370000
28   29.000000   30.780000
29  241.000000  233.970000
30  395.000000  401.430000
31   10.000000   10.080000
32  149.000000  176.920000
33   94.000000   96.160000
34 2635.000000 2644.430000
35   43.000000   40.780000
3

In [34]:
# Size of each prediction error, ignoring its sign.
residuals = actual_vs_predicted['Actual'] - actual_vs_predicted['Predicted']
actual_vs_predicted['Absolute Difference'] = residuals.abs()

In [35]:
# Display the comparison table, now including the 'Absolute Difference' column.
actual_vs_predicted

Unnamed: 0,Actual,Predicted,Absolute Difference
0,216.0,220.87,4.87
1,890.0,862.79,27.21
2,2243.0,2174.42,68.58
3,2147.0,1918.5,228.5
4,2.0,5.15,3.15
5,385.0,398.83,13.83
6,682.0,661.0,21.0
7,220.0,225.25,5.25
8,1044.0,912.93,131.07
9,815.0,1005.67,190.67
