In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("t20i_info.csv")

Checking the dataset

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
0,0,2,Australia,Sri Lanka,0.1,0,0,,Melbourne Cricket Ground
1,1,2,Australia,Sri Lanka,0.2,0,0,,Melbourne Cricket Ground
2,2,2,Australia,Sri Lanka,0.3,1,0,,Melbourne Cricket Ground
3,3,2,Australia,Sri Lanka,0.4,2,0,,Melbourne Cricket Ground
4,4,2,Australia,Sri Lanka,0.5,0,0,,Melbourne Cricket Ground


Checking if there is any null value present in the dataset or not

In [4]:
df.isnull().sum()

Unnamed: 0             0
match_id               0
batting_team           0
bowling_team           0
ball                   0
runs                   0
player_dismissed       0
city                8548
venue                  0
dtype: int64

In [5]:
df.shape

(63888, 9)

Needed features for score prediction:
Batting team,
bowling team,
city,
current score,
balls left,
wicket left,
current run rate,
last five/six-> powerplay

Extracting cricket city name from cricket stadium

In [6]:
df[df['city'].isnull()]['venue'][0].split(" ")[0]

'Melbourne'

Filling the null value in city with extracted value

In [7]:
df['city'] = df['city'].fillna(df['venue'].apply(lambda x: x.split(' ')[0]))

Printing the updated dataset with city names

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
0,0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,Melbourne Cricket Ground
1,1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,Melbourne Cricket Ground
2,2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,Melbourne Cricket Ground
3,3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,Melbourne Cricket Ground
4,4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,Melbourne Cricket Ground


In [9]:
df.tail()

Unnamed: 0.1,Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
63883,121,964,Sri Lanka,Australia,19.3,1,0,Colombo,R Premadasa Stadium
63884,122,964,Sri Lanka,Australia,19.4,0,0,Colombo,R Premadasa Stadium
63885,123,964,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,R Premadasa Stadium
63886,124,964,Sri Lanka,Australia,19.6,2,0,Colombo,R Premadasa Stadium
63887,125,964,Sri Lanka,Australia,19.7,1,0,Colombo,R Premadasa Stadium


Chcking shape of dataset

In [10]:
df.shape

(63888, 9)

Checking if any null value present in dataset or not

In [11]:
df.isnull().sum()

Unnamed: 0          0
match_id            0
batting_team        0
bowling_team        0
ball                0
runs                0
player_dismissed    0
city                0
venue               0
dtype: int64

Checking the scores of how well matches are played in different cities

In [12]:
df['city'].value_counts()

city
Colombo          4086
Mirpur           3420
Johannesburg     3331
Dubai            2969
Auckland         2532
                 ... 
Nairobi           123
Potchefstroom     122
Dharamsala        122
Ahmedabad         121
Carrara            64
Name: count, Length: 86, dtype: int64

Considering the cities for prediction where matches played are greater than 600

In [13]:
eligible_cities = df['city'].value_counts()[df['city'].value_counts() > 600].index.tolist()

In [14]:
eligible_cities

['Colombo',
 'Mirpur',
 'Johannesburg',
 'Dubai',
 'Auckland',
 'Cape Town',
 'London',
 'Pallekele',
 'Barbados',
 'Sydney',
 'Melbourne',
 'Durban',
 'St Lucia',
 'Wellington',
 'Lauderhill',
 'Hamilton',
 'Centurion',
 'Manchester',
 'Abu Dhabi',
 'Mumbai',
 'Nottingham',
 'Southampton',
 'Mount Maunganui',
 'Chittagong',
 'Kolkata',
 'Lahore',
 'Delhi',
 'Nagpur',
 'Chandigarh',
 'Adelaide',
 'Bangalore',
 'St Kitts',
 'Cardiff',
 'Christchurch',
 'Trinidad']

Adding the new list of eligible cities into the dataset only

In [15]:
df = df[df['city'].isin(eligible_cities)]

In [16]:
df.head()

Unnamed: 0.1,Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
0,0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,Melbourne Cricket Ground
1,1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,Melbourne Cricket Ground
2,2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,Melbourne Cricket Ground
3,3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,Melbourne Cricket Ground
4,4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,Melbourne Cricket Ground


In [17]:
df.tail()

Unnamed: 0.1,Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
63883,121,964,Sri Lanka,Australia,19.3,1,0,Colombo,R Premadasa Stadium
63884,122,964,Sri Lanka,Australia,19.4,0,0,Colombo,R Premadasa Stadium
63885,123,964,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,R Premadasa Stadium
63886,124,964,Sri Lanka,Australia,19.6,2,0,Colombo,R Premadasa Stadium
63887,125,964,Sri Lanka,Australia,19.7,1,0,Colombo,R Premadasa Stadium


In [18]:
df.shape

(50501, 9)

Now calculating current score

In [19]:
df['current_score'] = df.groupby('match_id')['runs'].cumsum()

In [20]:
df.head()

Unnamed: 0.1,Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue,current_score
0,0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,Melbourne Cricket Ground,0
1,1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,Melbourne Cricket Ground,0
2,2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,Melbourne Cricket Ground,1
3,3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,Melbourne Cricket Ground,3
4,4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,Melbourne Cricket Ground,3


In [21]:
df.tail()

Unnamed: 0.1,Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue,current_score
63883,121,964,Sri Lanka,Australia,19.3,1,0,Colombo,R Premadasa Stadium,125
63884,122,964,Sri Lanka,Australia,19.4,0,0,Colombo,R Premadasa Stadium,125
63885,123,964,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,R Premadasa Stadium,125
63886,124,964,Sri Lanka,Australia,19.6,2,0,Colombo,R Premadasa Stadium,127
63887,125,964,Sri Lanka,Australia,19.7,1,0,Colombo,R Premadasa Stadium,128


Calculating the over number and ball number

In [22]:
df['over'] = df['ball'].apply(lambda x : str(x).split(".")[0])
df['ball_no'] = df['ball'].apply(lambda x : str(x).split(".")[1])

In [23]:
df.head()

Unnamed: 0.1,Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue,current_score,over,ball_no
0,0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,Melbourne Cricket Ground,0,0,1
1,1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,Melbourne Cricket Ground,0,0,2
2,2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,Melbourne Cricket Ground,1,0,3
3,3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,Melbourne Cricket Ground,3,0,4
4,4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,Melbourne Cricket Ground,3,0,5


Calculating no of ball bowled

In [24]:
df['ball_bowled'] = (df['over'].astype(int)*6 + df['ball_no'].astype(int))

In [25]:
df.head()

Unnamed: 0.1,Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue,current_score,over,ball_no,ball_bowled
0,0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,Melbourne Cricket Ground,0,0,1,1
1,1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,Melbourne Cricket Ground,0,0,2,2
2,2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,Melbourne Cricket Ground,1,0,3,3
3,3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,Melbourne Cricket Ground,3,0,4,4
4,4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,Melbourne Cricket Ground,3,0,5,5


In [26]:
df.tail()

Unnamed: 0.1,Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue,current_score,over,ball_no,ball_bowled
63883,121,964,Sri Lanka,Australia,19.3,1,0,Colombo,R Premadasa Stadium,125,19,3,117
63884,122,964,Sri Lanka,Australia,19.4,0,0,Colombo,R Premadasa Stadium,125,19,4,118
63885,123,964,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,R Premadasa Stadium,125,19,5,119
63886,124,964,Sri Lanka,Australia,19.6,2,0,Colombo,R Premadasa Stadium,127,19,6,120
63887,125,964,Sri Lanka,Australia,19.7,1,0,Colombo,R Premadasa Stadium,128,19,7,121


Handling no balls

In [27]:
df['balls_left'] = 120 - df['ball_bowled']

In [28]:
df.tail()

Unnamed: 0.1,Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue,current_score,over,ball_no,ball_bowled,balls_left
63883,121,964,Sri Lanka,Australia,19.3,1,0,Colombo,R Premadasa Stadium,125,19,3,117,3
63884,122,964,Sri Lanka,Australia,19.4,0,0,Colombo,R Premadasa Stadium,125,19,4,118,2
63885,123,964,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,R Premadasa Stadium,125,19,5,119,1
63886,124,964,Sri Lanka,Australia,19.6,2,0,Colombo,R Premadasa Stadium,127,19,6,120,0
63887,125,964,Sri Lanka,Australia,19.7,1,0,Colombo,R Premadasa Stadium,128,19,7,121,-1


In [29]:
df['balls_left'] = df['balls_left'].apply(lambda x:0 if x<0 else x)

In [30]:
df.tail()

Unnamed: 0.1,Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue,current_score,over,ball_no,ball_bowled,balls_left
63883,121,964,Sri Lanka,Australia,19.3,1,0,Colombo,R Premadasa Stadium,125,19,3,117,3
63884,122,964,Sri Lanka,Australia,19.4,0,0,Colombo,R Premadasa Stadium,125,19,4,118,2
63885,123,964,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,R Premadasa Stadium,125,19,5,119,1
63886,124,964,Sri Lanka,Australia,19.6,2,0,Colombo,R Premadasa Stadium,127,19,6,120,0
63887,125,964,Sri Lanka,Australia,19.7,1,0,Colombo,R Premadasa Stadium,128,19,7,121,0


Finding no of players dismissed after certain over for calculating wickets left

In [31]:
df['player_dismissed'] = df['player_dismissed'].apply(lambda x:1 if x!='0' else '0')

Converting player dismissed into int from string

In [32]:
df['player_dismissed'] = df['player_dismissed'].astype(int)

In [33]:
df['player_dismissed'] = df.groupby('match_id')['player_dismissed'].cumsum()

Calculating wickets left

In [34]:
df['wicket_left'] = 10 -df['player_dismissed']

In [35]:
df

Unnamed: 0.1,Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue,current_score,over,ball_no,ball_bowled,balls_left,wicket_left
0,0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,Melbourne Cricket Ground,0,0,1,1,119,10
1,1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,Melbourne Cricket Ground,0,0,2,2,118,10
2,2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,Melbourne Cricket Ground,1,0,3,3,117,10
3,3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,Melbourne Cricket Ground,3,0,4,4,116,10
4,4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,Melbourne Cricket Ground,3,0,5,5,115,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63883,121,964,Sri Lanka,Australia,19.3,1,8,Colombo,R Premadasa Stadium,125,19,3,117,3,2
63884,122,964,Sri Lanka,Australia,19.4,0,8,Colombo,R Premadasa Stadium,125,19,4,118,2,2
63885,123,964,Sri Lanka,Australia,19.5,0,9,Colombo,R Premadasa Stadium,125,19,5,119,1,1
63886,124,964,Sri Lanka,Australia,19.6,2,9,Colombo,R Premadasa Stadium,127,19,6,120,0,1


Finding current run rate

In [36]:
df['current_run_rate'] = (df['current_score']*6) / df['ball_bowled']

In [37]:
df.tail()

Unnamed: 0.1,Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue,current_score,over,ball_no,ball_bowled,balls_left,wicket_left,current_run_rate
63883,121,964,Sri Lanka,Australia,19.3,1,8,Colombo,R Premadasa Stadium,125,19,3,117,3,2,6.410256
63884,122,964,Sri Lanka,Australia,19.4,0,8,Colombo,R Premadasa Stadium,125,19,4,118,2,2,6.355932
63885,123,964,Sri Lanka,Australia,19.5,0,9,Colombo,R Premadasa Stadium,125,19,5,119,1,1,6.302521
63886,124,964,Sri Lanka,Australia,19.6,2,9,Colombo,R Premadasa Stadium,127,19,6,120,0,1,6.35
63887,125,964,Sri Lanka,Australia,19.7,1,9,Colombo,R Premadasa Stadium,128,19,7,121,0,1,6.347107


Calulating last six

In [38]:
groups = df.groupby('match_id')
match_id = df['match_id'].unique()
last_six=[]
for id in match_id:
  last_six.extend(groups.get_group(id).rolling(window = 36)['runs'].sum().values.tolist())

In [39]:
df['last_six'] = last_six

In [40]:
df.head()

Unnamed: 0.1,Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue,current_score,over,ball_no,ball_bowled,balls_left,wicket_left,current_run_rate,last_six
0,0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,Melbourne Cricket Ground,0,0,1,1,119,10,0.0,
1,1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,Melbourne Cricket Ground,0,0,2,2,118,10,0.0,
2,2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,Melbourne Cricket Ground,1,0,3,3,117,10,2.0,
3,3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,Melbourne Cricket Ground,3,0,4,4,116,10,4.5,
4,4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,Melbourne Cricket Ground,3,0,5,5,115,10,3.6,


In [41]:
last_six

[nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 47.0,
 47.0,
 48.0,
 48.0,
 46.0,
 47.0,
 45.0,
 47.0,
 46.0,
 47.0,
 48.0,
 45.0,
 47.0,
 47.0,
 47.0,
 50.0,
 51.0,
 47.0,
 47.0,
 48.0,
 48.0,
 47.0,
 46.0,
 42.0,
 47.0,
 45.0,
 42.0,
 39.0,
 35.0,
 36.0,
 35.0,
 34.0,
 39.0,
 41.0,
 41.0,
 41.0,
 41.0,
 42.0,
 41.0,
 41.0,
 42.0,
 41.0,
 41.0,
 40.0,
 40.0,
 40.0,
 40.0,
 41.0,
 38.0,
 38.0,
 38.0,
 36.0,
 38.0,
 39.0,
 45.0,
 47.0,
 48.0,
 48.0,
 49.0,
 51.0,
 46.0,
 47.0,
 48.0,
 53.0,
 55.0,
 57.0,
 58.0,
 59.0,
 54.0,
 53.0,
 54.0,
 55.0,
 55.0,
 55.0,
 56.0,
 56.0,
 57.0,
 59.0,
 59.0,
 60.0,
 60.0,
 59.0,
 59.0,
 61.0,
 60.0,
 61.0,
 61.0,
 61.0,
 60.0,
 63.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan

In [42]:
final_df = df.groupby('match_id').sum()['runs'].reset_index().merge(df, on='match_id')

In [43]:
final_df

Unnamed: 0.1,match_id,runs_x,Unnamed: 0,batting_team,bowling_team,ball,runs_y,player_dismissed,city,venue,current_score,over,ball_no,ball_bowled,balls_left,wicket_left,current_run_rate,last_six
0,2,168,0,Australia,Sri Lanka,0.1,0,0,Melbourne,Melbourne Cricket Ground,0,0,1,1,119,10,0.000000,
1,2,168,1,Australia,Sri Lanka,0.2,0,0,Melbourne,Melbourne Cricket Ground,0,0,2,2,118,10,0.000000,
2,2,168,2,Australia,Sri Lanka,0.3,1,0,Melbourne,Melbourne Cricket Ground,1,0,3,3,117,10,2.000000,
3,2,168,3,Australia,Sri Lanka,0.4,2,0,Melbourne,Melbourne Cricket Ground,3,0,4,4,116,10,4.500000,
4,2,168,4,Australia,Sri Lanka,0.5,0,0,Melbourne,Melbourne Cricket Ground,3,0,5,5,115,10,3.600000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50496,964,128,121,Sri Lanka,Australia,19.3,1,8,Colombo,R Premadasa Stadium,125,19,3,117,3,2,6.410256,39.0
50497,964,128,122,Sri Lanka,Australia,19.4,0,8,Colombo,R Premadasa Stadium,125,19,4,118,2,2,6.355932,39.0
50498,964,128,123,Sri Lanka,Australia,19.5,0,9,Colombo,R Premadasa Stadium,125,19,5,119,1,1,6.302521,38.0
50499,964,128,124,Sri Lanka,Australia,19.6,2,9,Colombo,R Premadasa Stadium,127,19,6,120,0,1,6.350000,39.0


In [44]:
final_df.columns

Index(['match_id', 'runs_x', 'Unnamed: 0', 'batting_team', 'bowling_team',
       'ball', 'runs_y', 'player_dismissed', 'city', 'venue', 'current_score',
       'over', 'ball_no', 'ball_bowled', 'balls_left', 'wicket_left',
       'current_run_rate', 'last_six'],
      dtype='object')

In [45]:
final_df = final_df[['batting_team', 'bowling_team', 'city', 'current_score', 'balls_left', 'wicket_left', 'current_run_rate', 'last_six', 'runs_x']]

Removing Nan value from dataset

In [46]:
final_df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.dropna(inplace=True)


In [47]:
final_df

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wicket_left,current_run_rate,last_six,runs_x
35,Australia,Sri Lanka,Melbourne,47,84,10,7.833333,47.0,168
36,Australia,Sri Lanka,Melbourne,47,83,10,7.621622,47.0,168
37,Australia,Sri Lanka,Melbourne,48,83,10,7.783784,48.0,168
38,Australia,Sri Lanka,Melbourne,49,82,10,7.736842,48.0,168
39,Australia,Sri Lanka,Melbourne,49,81,10,7.538462,46.0,168
...,...,...,...,...,...,...,...,...,...
50496,Sri Lanka,Australia,Colombo,125,3,2,6.410256,39.0,128
50497,Sri Lanka,Australia,Colombo,125,2,2,6.355932,39.0,128
50498,Sri Lanka,Australia,Colombo,125,1,1,6.302521,38.0,128
50499,Sri Lanka,Australia,Colombo,127,0,1,6.350000,39.0,128


In [48]:
final_df.isnull().sum()

batting_team        0
bowling_team        0
city                0
current_score       0
balls_left          0
wicket_left         0
current_run_rate    0
last_six            0
runs_x              0
dtype: int64

In [49]:
final_df.shape

(35999, 9)

Dividing datset into dependent and independent features

In [50]:
X = final_df.drop(columns=['runs_x'])
y = final_df['runs_x']

In [51]:
from sklearn.model_selection import train_test_split

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
X_train

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wicket_left,current_run_rate,last_six
31438,New Zealand,Bangladesh,Pallekele,108,43,9,8.415584,63.0
36893,India,Pakistan,Bangalore,102,38,8,7.463415,50.0
49166,New Zealand,England,Delhi,132,25,7,8.336842,42.0
35087,India,England,Mumbai,92,49,6,7.774648,32.0
25264,West Indies,India,Barbados,65,62,10,6.724138,44.0
...,...,...,...,...,...,...,...,...
23656,Australia,Pakistan,Melbourne,68,67,7,7.698113,45.0
8785,Pakistan,New Zealand,Abu Dhabi,89,41,7,6.759494,49.0
15850,India,Pakistan,Durban,68,55,6,6.276923,40.0
1210,West Indies,India,Lauderhill,205,25,7,12.947368,67.0


In [54]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error

In [55]:
transformer = ColumnTransformer([
    ('transformer', OneHotEncoder(sparse_output=False, drop='first'),['batting_team','bowling_team','city'])],remainder='passthrough')

In [56]:
pipe = Pipeline(steps=[
    ('step1', transformer),
    ('step2', StandardScaler()),
    ('step3', XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=12, random_state=1))
])

In [57]:
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)

In [58]:
r2_score(y_test, y_pred)

0.9886222388952028

In [59]:
mean_absolute_error(y_test, y_pred)

1.524569467968411

In [60]:
import pickle
pickle.dump(pipe,open('pipe.pkl', 'wb') )