## Importing the essential libraries over here

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Importing the dataset over here

In [14]:
data=pd.read_csv("TEST_batting.csv")

In [15]:
data.head()

Unnamed: 0,id,span,matches,innings,not_out,runs,high_score,average_score,ball_faced,strike_rate,100s,50,0s,4s,6s
0,420402,2018-2021,5,8,1,172,48*,24.57,412,41.74,0,0,1,17,2
1,921509,2019-2021,4,8,0,356,87,44.5,769,46.29,0,3,0,41,4
2,440970,2018-2021,5,10,4,353,200*,58.83,803,43.96,1,1,0,43,1
3,703323,2019-2019,3,6,1,110,65*,22.0,253,43.47,0,1,0,13,2
4,25913,2018-2019,3,6,0,33,24,5.5,68,48.52,0,0,3,4,1


## Taking care of missing values if present over here

In [16]:
data.isnull().sum()

id               0
span             7
matches          0
innings          0
not_out          0
runs             0
high_score       0
average_score    2
ball_faced       0
strike_rate      0
100s             0
50               0
0s               0
4s               0
6s               0
dtype: int64

In [17]:
missing_values=[feature for feature in data.columns if data[feature].isnull().sum()>1]
for feature in missing_values:
  print(feature)

span
average_score


## Dropping all the missing values over here

In [18]:
data.dropna(inplace=True)

In [19]:
data.isnull().sum()

id               0
span             0
matches          0
innings          0
not_out          0
runs             0
high_score       0
average_score    0
ball_faced       0
strike_rate      0
100s             0
50               0
0s               0
4s               0
6s               0
dtype: int64

## Taking care of duplicate observations if there will be any over here

In [20]:
data.duplicated().sum()

0

## Filtering all the numerical features over here using list comprehension

In [21]:
numerical_features=[feature for feature in data.columns if data[feature].dtype!='O']
for feature in numerical_features:
  print(feature)

id
matches
innings
not_out
runs
average_score
ball_faced
strike_rate
100s
50
0s
4s
6s


In [22]:
data[numerical_features]

Unnamed: 0,id,matches,innings,not_out,runs,average_score,ball_faced,strike_rate,100s,50,0s,4s,6s
0,420402,5,8,1,172,24.57,412,41.74,0,0,1,17,2
1,921509,4,8,0,356,44.50,769,46.29,0,3,0,41,4
2,440970,5,10,4,353,58.83,803,43.96,1,1,0,43,1
3,703323,3,6,1,110,22.00,253,43.47,0,1,0,13,2
4,25913,3,6,0,33,5.50,68,48.52,0,0,3,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
252,914467,2,3,1,53,26.50,103,51.45,0,0,0,7,1
253,450101,5,8,1,200,28.57,478,41.84,0,2,0,24,2
254,230553,75,118,24,1101,11.71,2892,38.07,0,0,19,132,13
255,1199304,10,15,8,41,5.85,182,22.52,0,0,2,4,1


## First of all we need to clean one column called as high_score over here

In [25]:
data['high_score']=data['high_score'].str.replace("*","")

In [28]:
data['high_score']=data['high_score'].astype("int64")

## Filtering all the categorical features over here

In [29]:
cat_features=[feature for feature in data.columns if data[feature].dtype=="O"]
for feature in cat_features:
  print(feature)

span


In [30]:
data[cat_features]

Unnamed: 0,span
0,2018-2021
1,2019-2021
2,2018-2021
3,2019-2019
4,2018-2019
...,...
252,2022-2022
253,2017-2023
254,2009-2023
255,2021-2022


## Encoding the categorical feature into the numerical values over here

In [31]:
span_mapping={}
for span,index in enumerate(data['span'].unique()):
  span_mapping[index]=span
data['span']=data['span'].map(span_mapping)

In [32]:
print(span_mapping)

{'2018-2021': 0, '2019-2021': 1, '2019-2019': 2, '2018-2019': 3, '2018-2018': 4, '2017-2023': 5, '2011-2022': 6, '2022-2023': 7, '2011-2016': 8, '2021-2023': 9, '2016-2022': 10, '2015-2023': 11, '2009-2016': 12, '2020-2022': 13, '2014-2023': 14, '2014-2017': 15, '2020-2020': 16, '2019-2023': 17, '2016-2023': 18, '2018-2023': 19, '2013-2022': 20, '2019-2022': 21, '2015-2022': 22, '2009-2021': 23, '2022-2022': 24, '2017-2022': 25, '2005-2022': 26, '2016-2018': 27, '2017-2021': 28, '2011-2021': 29, '2013-2018': 30, '2011-2023': 31, '2023-2023': 32, '2021-2021': 33, '2013-2014': 34, '2018-2022': 35, '2021-2022': 36, '2012-2023': 37, '2004-2018': 38, '2013-2023': 39, '2020-2023': 40, '2017-2018': 41, '2015-2021': 42, '2014-2022': 43, '2020-2021': 44, '2012-2022': 45, '2010-2022': 46, '2009-2022': 47, '2014-2021': 48, '2003-2022': 49, '2007-2022': 50, '2016-2017': 51, '2015-2016': 52, '2014-2015': 53, '2014-2019': 54, '2013-2017': 55, '2010-2017': 56, '2019-2020': 57, '2010-2023': 58, '2008-

## We need to treat the id as well as a categorical feature over here

In [33]:
data['id']

0       420402
1       921509
2       440970
3       703323
4        25913
        ...   
252     914467
253     450101
254     230553
255    1199304
256     248920
Name: id, Length: 248, dtype: int64

In [34]:
id_mapping={}
for id,index in enumerate(data['id'].unique()):
  id_mapping[index]=id
data['id']=data['id'].map(id_mapping)

In [35]:
data

Unnamed: 0,id,span,matches,innings,not_out,runs,high_score,average_score,ball_faced,strike_rate,100s,50,0s,4s,6s
0,0,0,5,8,1,172,48,24.57,412,41.74,0,0,1,17,2
1,1,1,4,8,0,356,87,44.50,769,46.29,0,3,0,41,4
2,2,0,5,10,4,353,200,58.83,803,43.96,1,1,0,43,1
3,3,2,3,6,1,110,65,22.00,253,43.47,0,1,0,13,2
4,4,3,3,6,0,33,24,5.50,68,48.52,0,0,3,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252,243,24,2,3,1,53,43,26.50,103,51.45,0,0,0,7,1
253,244,5,5,8,1,200,58,28.57,478,41.84,0,2,0,24,2
254,245,68,75,118,24,1101,41,11.71,2892,38.07,0,0,19,132,13
255,246,36,10,15,8,41,13,5.85,182,22.52,0,0,2,4,1


## Creating the features and labels over here

In [36]:
data['strike rate']=data['strike_rate']

In [37]:
data.drop("strike_rate",axis=1,inplace=True)

In [40]:
data

Unnamed: 0,id,span,matches,innings,not_out,runs,high_score,average_score,ball_faced,100s,50,0s,4s,6s,strike rate
0,0,0,5,8,1,172,48,24.57,412,0,0,1,17,2,41.74
1,1,1,4,8,0,356,87,44.50,769,0,3,0,41,4,46.29
2,2,0,5,10,4,353,200,58.83,803,1,1,0,43,1,43.96
3,3,2,3,6,1,110,65,22.00,253,0,1,0,13,2,43.47
4,4,3,3,6,0,33,24,5.50,68,0,0,3,4,1,48.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252,243,24,2,3,1,53,43,26.50,103,0,0,0,7,1,51.45
253,244,5,5,8,1,200,58,28.57,478,0,2,0,24,2,41.84
254,245,68,75,118,24,1101,41,11.71,2892,0,0,19,132,13,38.07
255,246,36,10,15,8,41,13,5.85,182,0,0,2,4,1,22.52


In [38]:
X=data.iloc[:,:-1].values
y=data.iloc[:,-1].values

## Splitting the dataset into training set and testing set to avoid the problem of overfitting over here

In [42]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

## Experinment with different algorithms at the end selecting the one giving the maximum accuracy with respect to the performance metrics over here

In [44]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [45]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import BayesianRidge
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

regressors = {
    "Random Forest": RandomForestRegressor(),
    "Support Vector Machine": SVR(kernel='linear'),
    "Linear Regression": LinearRegression(),
    "K-Nearest Neighbors": KNeighborsRegressor(),
    # "Decision Tree": DecisionTreeRegressor(),
    "Bayesian Ridge": BayesianRidge(),
    "XGBoost": XGBRegressor(),
    "CatBoost": CatBoostRegressor(),
    "Ridge": Ridge(),
    "Lasso": Lasso()
}

results = {}
for name, regressor in regressors.items():
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    results[name] = mse

best_regressor = min(results, key=results.get)
best_mse = results[best_regressor]

print("Best Regressor:", best_regressor)
print("Mean Squared Error:", best_mse)

Learning rate set to 0.0317
0:	learn: 14.8562871	total: 48.2ms	remaining: 48.2s
1:	learn: 14.7357862	total: 51.9ms	remaining: 25.9s
2:	learn: 14.6492019	total: 53.6ms	remaining: 17.8s
3:	learn: 14.5216778	total: 55.3ms	remaining: 13.8s
4:	learn: 14.4037208	total: 56.9ms	remaining: 11.3s
5:	learn: 14.2937707	total: 58.6ms	remaining: 9.71s
6:	learn: 14.2119209	total: 60.3ms	remaining: 8.55s
7:	learn: 14.1066910	total: 61.9ms	remaining: 7.68s
8:	learn: 14.0093556	total: 63.6ms	remaining: 7s
9:	learn: 13.9291110	total: 65.2ms	remaining: 6.45s
10:	learn: 13.8452879	total: 66.9ms	remaining: 6.01s
11:	learn: 13.7512658	total: 68.5ms	remaining: 5.64s
12:	learn: 13.6798982	total: 70.1ms	remaining: 5.32s
13:	learn: 13.5856102	total: 71.8ms	remaining: 5.05s
14:	learn: 13.5218612	total: 73.4ms	remaining: 4.82s
15:	learn: 13.4529880	total: 75ms	remaining: 4.61s
16:	learn: 13.3887017	total: 76.6ms	remaining: 4.43s
17:	learn: 13.3070478	total: 78.2ms	remaining: 4.26s
18:	learn: 13.2133719	total: 79.8

  model = cd_fast.enet_coordinate_descent(


## Training the model on the training set with the best algorithm over here

In [46]:
regressor=XGBRegressor()
regressor.fit(X_train,y_train)

## Evaluating the performance of the model on the testing set over here

In [47]:
y_pred=regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1),y_test.reshape(len(y_test),1)),1))

[[ 51.98  33.33]
 [ 48.07  51.42]
 [ 62.08  73.63]
 [ 57.8   69.2 ]
 [ 47.81  48.3 ]
 [ 40.24  50.32]
 [ 22.04  34.28]
 [ 45.99  54.75]
 [ 74.87 100.  ]
 [ 49.16  39.44]
 [ 49.92  43.8 ]
 [ 52.93  50.35]
 [ 56.42  50.6 ]
 [ 48.79  46.17]
 [ 49.06  46.58]
 [ 51.83  71.18]
 [ 42.29  38.95]
 [ 47.54  60.35]
 [ 45.61  44.06]
 [ 54.76  55.2 ]
 [ 54.84  41.66]
 [ 33.02  41.02]
 [ 38.72  35.29]
 [ 51.45  50.05]
 [ 51.96  47.63]
 [ 44.32  40.74]
 [ 54.01  54.97]
 [ 57.05  56.25]
 [ 52.79  54.44]
 [ 51.96  47.87]
 [ 46.31  44.98]
 [ 53.52  79.69]
 [ 46.13  47.05]
 [ 45.6   46.59]
 [ 45.05  46.24]
 [ 92.27 105.88]
 [ 56.81  52.21]
 [ 64.03  77.77]
 [ 53.23  58.94]
 [ 34.33  36.13]
 [ 55.82  81.35]
 [ 54.17  66.28]
 [ 43.54  42.63]
 [ 48.96  41.47]
 [ 48.78  51.46]
 [ 49.72  57.62]
 [ 57.46  55.44]
 [ 55.19  47.73]
 [ 65.4   73.88]
 [ 41.93  49.81]]


In [50]:
actual_vs_predicted = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(actual_vs_predicted)

    Actual  Predicted
0    33.33  51.984371
1    51.42  48.070675
2    73.63  62.081814
3    69.20  57.799168
4    48.30  47.811543
5    50.32  40.239033
6    34.28  22.044365
7    54.75  45.992661
8   100.00  74.871445
9    39.44  49.163517
10   43.80  49.917625
11   50.35  52.926132
12   50.60  56.416313
13   46.17  48.786373
14   46.58  49.056347
15   71.18  51.834503
16   38.95  42.286850
17   60.35  47.541122
18   44.06  45.610947
19   55.20  54.755348
20   41.66  54.837936
21   41.02  33.017811
22   35.29  38.723019
23   50.05  51.450539
24   47.63  51.961987
25   40.74  44.316048
26   54.97  54.005417
27   56.25  57.054897
28   54.44  52.790421
29   47.87  51.964596
30   44.98  46.305492
31   79.69  53.524269
32   47.05  46.126293
33   46.59  45.595604
34   46.24  45.054527
35  105.88  92.270782
36   52.21  56.805916
37   77.77  64.026382
38   58.94  53.230244
39   36.13  34.326073
40   81.35  55.816486
41   66.28  54.174801
42   42.63  43.541035
43   41.47  48.955181
44   51.46

In [51]:
actual_vs_predicted['Absolute Difference'] = abs(actual_vs_predicted['Actual'] - actual_vs_predicted['Predicted'])


In [52]:
actual_vs_predicted

Unnamed: 0,Actual,Predicted,Absolute Difference
0,33.33,51.984371,18.654371
1,51.42,48.070675,3.349325
2,73.63,62.081814,11.548186
3,69.2,57.799168,11.400832
4,48.3,47.811543,0.488457
5,50.32,40.239033,10.080967
6,34.28,22.044365,12.235635
7,54.75,45.992661,8.757339
8,100.0,74.871445,25.128555
9,39.44,49.163517,9.723517
