In [53]:
import pandas as pd
import numpy as np
import pickle

In [11]:
# Load the dataset from the pickle file in the working directory.
# Presumably this yields a pandas DataFrame (later cells index it by column
# name and filter it with boolean masks) — TODO confirm.
# The context manager guarantees the file handle is closed once unpickling
# finishes, instead of leaving it open until garbage collection.
# NOTE(security): pickle.load can execute arbitrary code; only load files that
# come from a trusted source.
with open('ODI_2.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file)

In [42]:
# Rows in which India is either the batting side or the bowling side.
india_involved = df['batting_team'].eq('India') | df['bowling_team'].eq('India')
df_1 = df[india_involved]

In [43]:
# Rows where both the batting team and the bowling team are one of
# India / South Africa.
ind_sa_teams = ['India', 'South Africa']
batting_in_pair = df['batting_team'].isin(ind_sa_teams)
bowling_in_pair = df['bowling_team'].isin(ind_sa_teams)
df_2 = df[batting_in_pair & bowling_in_pair]

In [44]:
# Rows whose venue is exactly 'Boland Park'.
at_boland_park = df['venue'].eq('Boland Park')
df_3 = df[at_boland_park]

In [45]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score,mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [61]:
# Split the India / South Africa subset into the regression target
# ('total_runs') and the predictor columns (everything else).
y = df_2['total_runs']
X = df_2.drop(columns='total_runs')

In [64]:
# 80/20 hold-out split; the fixed seed makes the partition repeatable.
# NOTE(review): if df_2 holds several rows per innings (not visible here), a
# random row-wise split can put rows of the same innings in both sets and
# inflate the test score — confirm, and consider a group-aware split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [48]:
# Column-wise preprocessing shared by all model pipelines:
#   * one-hot encode the three categorical columns, dropping the first category
#     of each feature (drop='first') and returning a dense array;
#   * pass every other column through unchanged (remainder='passthrough').
# handle_unknown='ignore' stops transform() from raising when it meets a
# category (e.g. a venue) that was absent from the data the encoder was fitted
# on, which can happen inside a CV fold or at prediction time. Such values are
# encoded as all zeros for that feature (scikit-learn emits a warning).
ct = ColumnTransformer(
    transformers=[
        (
            'ohe',
            OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'),
            ['batting_team', 'venue', 'bowling_team'],
        ),
    ],
    remainder='passthrough',
)

In [49]:
# Linear-regression pipeline: column transformer -> standard scaling -> OLS.
pipe_lr = Pipeline(steps=[
    ('ct', ct),
    ('ss', StandardScaler()),
    ('lr', LinearRegression()),
])

In [50]:
# Random-forest pipeline: column transformer -> standard scaling -> forest.
# random_state is pinned so that re-running the notebook rebuilds the same
# forest; without it the bootstrap sampling differs on every run and the
# reported scores cannot be reproduced exactly.
pipe_rfr = Pipeline(steps=[
    ('ct', ct),
    ('ss', StandardScaler()),
    ('rfr', RandomForestRegressor(random_state=42)),
])

In [51]:
# Gradient-boosting pipeline: column transformer -> standard scaling -> XGBoost
# (1000 trees, learning rate 0.2, depth 12, seeded).
pipe_xgbr = Pipeline(steps=[
    ('ct', ct),
    ('ss', StandardScaler()),
    (
        'xgbr',
        XGBRegressor(
            n_estimators=1000,
            learning_rate=0.2,
            max_depth=12,
            random_state=1,
        ),
    ),
])

In [54]:
# Mean R² over 10 cross-validation folds for the XGBoost pipeline.
# NOTE(review): scored on the full X, y rather than X_train / y_train.
cross_val_score(
    pipe_xgbr, X, y,
    cv=10, scoring='r2', n_jobs=-1, verbose=2, error_score='raise',
).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   31.0s finished


0.9937723053479897

In [70]:
# Mean R² over 10 cross-validation folds for the random-forest pipeline.
# NOTE(review): scored on the full X, y rather than X_train / y_train.
cross_val_score(
    pipe_rfr, X, y,
    cv=10, scoring='r2', n_jobs=-1, verbose=2, error_score='raise',
).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   24.8s finished


0.9921580560812047

In [69]:
# Mean R² over 10 cross-validation folds for the linear-regression pipeline.
# NOTE(review): scored on the full X, y rather than X_train / y_train.
cross_val_score(
    pipe_lr, X, y,
    cv=10, scoring='r2', n_jobs=-1, verbose=2, error_score='raise',
).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.4s finished


0.778169561695232

In [65]:
# Fit the XGBoost pipeline on the training split only.
pipe_xgbr.fit(X=X_train, y=y_train)

In [71]:
# Fit the random-forest pipeline on the training split only.
pipe_rfr.fit(X=X_train, y=y_train)

In [72]:
# Fit the linear-regression pipeline on the training split only.
pipe_lr.fit(X=X_train, y=y_train)

In [66]:
# R² of the fitted XGBoost pipeline on the held-out split, then on the
# training split (a large gap between the two points to over-fitting).
for split_label, X_part, y_part in (
    ('Test score: ', X_test, y_test),
    ('Train score: ', X_train, y_train),
):
    print(split_label, r2_score(y_part, pipe_xgbr.predict(X_part)))

Test score:  0.9929830198325387
Train score:  0.9999999978003563


In [73]:
# R² of the fitted linear-regression pipeline on the held-out split, then on
# the training split.
for split_label, X_part, y_part in (
    ('Test score: ', X_test, y_test),
    ('Train score: ', X_train, y_train),
):
    print(split_label, r2_score(y_part, pipe_lr.predict(X_part)))

Test score:  0.7735600448011755
Train score:  0.7807233639734903


In [74]:
# R² of the fitted random-forest pipeline on the held-out split, then on the
# training split.
for split_label, X_part, y_part in (
    ('Test score: ', X_test, y_test),
    ('Train score: ', X_train, y_train),
):
    print(split_label, r2_score(y_part, pipe_rfr.predict(X_part)))

Test score:  0.9900265186923436
Train score:  0.9986657998213017
