In [1]:
import pandas as pd
import numpy as np
import pickle

In [3]:
# Load the pickled ODI dataset. The context manager guarantees the file handle
# is closed even if unpickling fails.
# NOTE(review): pickle.load executes arbitrary code embedded in the file; only
# load 'ODI_2.pkl' if it comes from a trusted source.
with open('ODI_2.pkl', 'rb') as data_file:
    df = pickle.load(data_file)

In [4]:
# Rows in which India appears as either the batting or the bowling side.
df_1 = df.loc[df['batting_team'].eq('India') | df['bowling_team'].eq('India')]

In [5]:
# Rows where both the batting and the bowling side are one of these two teams.
head_to_head_teams = ['India', 'South Africa']
df_2 = df.loc[
    df['batting_team'].isin(head_to_head_teams)
    & df['bowling_team'].isin(head_to_head_teams)
]

In [6]:
# Rows recorded at the 'Boland Park' venue.
df_3 = df.loc[df['venue'].eq('Boland Park')]

In [7]:
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import xgboost as xgb
from xgboost import XGBRegressor

In [8]:
# Target: the 'total_runs' column of the India / South Africa subset.
y = df_2['total_runs']
# Features: every remaining column except 'bowling_team'
# (presumably redundant once only two teams remain — verify).
X = df_2.drop(columns=['total_runs', 'bowling_team'])

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): rows are shuffled individually. If df_2 holds several rows per
# match, rows from the same match can end up in both train and test and inflate
# the test score — confirm whether a group-wise split is needed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# One-hot encode the two categorical columns (dropping the first level of each)
# and pass every other column through unchanged.
# NOTE(review): handle_unknown is left at its default, so a team or venue that
# was not seen during fit will raise at transform time — confirm this is intended.
ct = ColumnTransformer(
    transformers=[
        (
            'ohe',
            OneHotEncoder(drop='first', sparse_output=False),
            ['batting_team', 'venue'],
        ),
    ],
    remainder='passthrough',
)

In [11]:
# Linear-regression pipeline: encode categoricals, standardise, then fit OLS.
# Pipeline.fit fits its steps in place (no cloning when caching is disabled), so
# this pipeline gets its own unfitted copy of `ct`; sharing the same instance
# across pipelines would let a later fit overwrite this pipeline's encoder state.
pipe_lr = Pipeline([
    ('ct', clone(ct)),
    ('ss', StandardScaler()),
    ('lr', LinearRegression())
])

In [12]:
# Random-forest pipeline: encode categoricals, standardise, then fit the forest.
# Pipeline.fit fits its steps in place (no cloning when caching is disabled), so
# this pipeline gets its own unfitted copy of `ct` rather than sharing one
# instance with the other pipelines.
pipe_rfr = Pipeline([
    ('ct', clone(ct)),
    ('ss', StandardScaler()),
    # Seeded so cross-validation and test scores are reproducible across runs.
    ('rfr', RandomForestRegressor(random_state=42))
])

In [13]:
# XGBoost pipeline: encode categoricals, standardise, then fit the boosted trees.
# Pipeline.fit fits its steps in place (no cloning when caching is disabled), so
# this pipeline gets its own unfitted copy of `ct`; this is the pipeline that is
# pickled later, so its encoder must not be refit by another pipeline's fit call.
pipe_xgbr = Pipeline([
    ('ct', clone(ct)),
    ('ss', StandardScaler()),
    ('xgbr', XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=12, random_state=1))
])

In [14]:
# Mean R² of the XGBoost pipeline over 10 folds of the full feature set;
# error_score='raise' surfaces any fold failure instead of recording NaN.
cross_val_score(
    pipe_xgbr,
    X,
    y,
    cv=10,
    scoring='r2',
    n_jobs=-1,
    verbose=2,
    error_score='raise',
).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   42.6s finished


0.9937723053479897

In [15]:
# Mean R² of the random-forest pipeline over 10 folds of the full feature set;
# error_score='raise' surfaces any fold failure instead of recording NaN.
cross_val_score(
    pipe_rfr,
    X,
    y,
    cv=10,
    scoring='r2',
    n_jobs=-1,
    verbose=2,
    error_score='raise',
).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   51.0s finished


0.9924585100843604

In [16]:
# Mean R² of the linear-regression pipeline over 10 folds of the full feature set;
# error_score='raise' surfaces any fold failure instead of recording NaN.
cross_val_score(
    pipe_lr,
    X,
    y,
    cv=10,
    scoring='r2',
    n_jobs=-1,
    verbose=2,
    error_score='raise',
).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.5s finished


0.7781726685328335

In [17]:
# Fit the XGBoost pipeline on the training split only.
pipe_xgbr.fit(X=X_train, y=y_train)

In [18]:
# Fit the random-forest pipeline on the training split only.
pipe_rfr.fit(X=X_train, y=y_train)

In [19]:
# Fit the linear-regression pipeline on the training split only.
pipe_lr.fit(X=X_train, y=y_train)

In [20]:
# R² of the fitted XGBoost pipeline on the held-out and the training rows;
# a large gap between the two indicates overfitting.
xgbr_test_pred = pipe_xgbr.predict(X_test)
xgbr_train_pred = pipe_xgbr.predict(X_train)
print('Test score: ', r2_score(y_test, xgbr_test_pred))
print('Train score: ', r2_score(y_train, xgbr_train_pred))

Test score:  0.9929830198325387
Train score:  0.9999999978003563


In [21]:
# R² of the fitted linear-regression pipeline on the held-out and the training rows.
lr_test_pred = pipe_lr.predict(X_test)
lr_train_pred = pipe_lr.predict(X_train)
print('Test score: ', r2_score(y_test, lr_test_pred))
print('Train score: ', r2_score(y_train, lr_train_pred))

Test score:  0.77356373997832
Train score:  0.780751797961274


In [22]:
# R² of the fitted random-forest pipeline on the held-out and the training rows.
rfr_test_pred = pipe_rfr.predict(X_test)
rfr_train_pred = pipe_rfr.predict(X_train)
print('Test score: ', r2_score(y_test, rfr_test_pred))
print('Train score: ', r2_score(y_train, rfr_train_pred))

Test score:  0.9895738983222703
Train score:  0.9988209371803062


In [23]:
# pickle.dump(pipe_rfr, open('pipe_rfr.pkl', 'wb'))

In [24]:
# Persist the fitted XGBoost pipeline (encoder + scaler + model) to disk.
# The context manager flushes and closes the file deterministically, so the
# pickle on disk is complete as soon as this cell finishes.
with open('pipe_xgbr.pkl', 'wb') as model_file:
    pickle.dump(pipe_xgbr, model_file)

In [25]:
# `xgb` is imported with the other modelling imports at the top of the notebook.
# Record the xgboost version used to produce the pickled pipeline; the pickle
# should be loaded with a compatible xgboost build.
print(xgb.__version__)


2.0.3
