In [1]:
# Standard library
import pickle

# Third-party: numerics, data frames, plotting
# pyplot uses its conventional `plt` alias; binding it to the name `pandas`
# would make any later `pandas.<something>` call silently hit matplotlib.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Third-party: preprocessing, modelling and evaluation
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from xgboost import XGBRegressor


In [3]:
# Show the installed XGBoost version so the environment used for the results is recorded.
import xgboost
xgboost.__version__

'1.4.2'

In [6]:
#Reading the dataset
# NOTE(review): unpickling can execute arbitrary code, so only load
# 'prepared_dataset.pkl' when it comes from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('prepared_dataset.pkl','rb') as dataset_file:
    df_t20 = pickle.load(dataset_file)

In [7]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df_t20.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72573 entries, 30357 to 37978
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   batting_team   72573 non-null  object 
 1   bowling_team   72573 non-null  object 
 2   current_score  72573 non-null  int64  
 3   wicket_left    72573 non-null  int32  
 4   curr_rr        72573 non-null  float64
 5   city           72573 non-null  object 
 6   ball_left      72573 non-null  int64  
 7   last_five      72573 non-null  float64
 8   runs_x         72573 non-null  int64  
dtypes: float64(2), int32(1), int64(3), object(3)
memory usage: 5.3+ MB


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df_t20.describe()

Unnamed: 0,current_score,wicket_left,curr_rr,ball_left,last_five,runs_x
count,72573.0,72573.0,72573.0,72573.0,72573.0,72573.0
mean,90.66409,6.647197,7.309305,46.060422,37.037714,154.090089
std,41.115685,2.041537,1.806423,26.534785,11.977767,34.862817
min,8.0,0.0,1.6,0.0,3.0,39.0
25%,58.0,5.0,6.107143,23.0,28.0,132.0
50%,86.0,7.0,7.252174,46.0,36.0,153.0
75%,118.0,8.0,8.465753,69.0,44.0,178.0
max,263.0,10.0,16.6,98.0,89.0,263.0


In [9]:
# Separate the regression target (the 'runs_x' column) from the predictors:
# y holds the target series, X holds every remaining column.
y = df_t20['runs_x']
X = df_t20.drop('runs_x', axis=1)

In [10]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition reproducible between runs.
# NOTE(review): this is a row-level random split. If several rows describe the
# same innings, rows from one innings can land on both sides of the split and
# inflate the test metrics — confirm whether a group-wise split is needed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [11]:
# Sanity-check the partition sizes: shapes of (X_train, X_test, y_train, y_test), in that order.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((58058, 8), (14515, 8), (58058,), (14515,))

In [12]:
# Display the training features for a visual check of the predictor columns.
# NOTE(review): X_train.head() would give a shorter, equally informative preview.
X_train

Unnamed: 0,batting_team,bowling_team,current_score,wicket_left,curr_rr,city,ball_left,last_five
17784,Kenya,Ghana,95,7,6.129032,Kampala,27,39.0
15191,West Indies,India,62,4,4.325581,Kolkata,34,15.0
3044,England,Australia,71,6,6.173913,Melbourne,51,26.0
48128,Australia,West Indies,110,6,7.500000,London,32,50.0
4137,United Arab Emirates,Afghanistan,71,8,9.906977,Abu Dhabi,77,39.0
...,...,...,...,...,...,...,...,...
38385,United Arab Emirates,Saudi Arabia,102,7,8.052632,Al Amarat,44,39.0
86586,England,Pakistan,62,8,7.018868,Dubai,67,30.0
10526,India,Ireland,185,9,10.277778,Dublin,12,63.0
26659,Oman,Ireland,130,6,8.041237,Al Amarat,23,49.0


In [13]:
# One-hot encode the string-valued columns so the model receives purely numeric
# input; all remaining columns are passed through unchanged. drop='first'
# omits one indicator column per encoded feature.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# confirm against the installed version before upgrading.
categorical_features = ['batting_team','bowling_team','city']
dense_one_hot = OneHotEncoder(sparse=False,drop='first')
colm_transf = ColumnTransformer(
    transformers=[('colm_transf', dense_one_hot, categorical_features)],
    remainder='passthrough',
)

In [14]:
# Chain preprocessing and the regressor so a single fit/predict call runs every
# stage in order:
#   step1 - the column transformer `colm_transf`
#   step2 - standardisation of the transformer's output
#   step3 - the XGBoost regressor
xgb_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.2,
    max_depth=12,
    random_state=1,
)
pipeline_steps = [
    ('step1', colm_transf),
    ('step2', StandardScaler()),
    ('step3', xgb_model),
]
pipe = Pipeline(steps=pipeline_steps)

In [15]:
# Fit the whole pipeline on the training split, then predict the held-out rows.
# Pipeline.fit returns the fitted pipeline itself, so the calls can be chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

In [16]:
# Coefficient of determination (R^2) of the predictions on the held-out split.
test_r2 = r2_score(y_test, y_pred)
print(test_r2)

0.9904099450119112


<font color='red'>The higher the r2_score, the better the model fits the data (a score of 1.0 is a perfect fit).</font>

In [17]:
# Mean absolute error of the test-set predictions, in the same units as the target (runs_x).
mean_absolute_error(y_test,y_pred)

1.6488866103668687

In [18]:
# Persist the fitted pipeline to 'pipe.pkl' for later reuse.
# The context manager flushes and closes the file even if pickling fails part-way.
# NOTE(review): loading this file later presumably needs compatible
# scikit-learn / xgboost versions — confirm and pin them where it is consumed.
with open('pipe.pkl','wb') as model_file:
    pickle.dump(pipe, model_file)