In [1]:
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import clone

In [2]:
# Show every column when a frame is displayed (no column truncation).
pd.options.display.max_columns = None

# Load the match summary, using the CSV's first column as the row index.
df = pd.read_csv('match_summary.csv', index_col=0)

In [3]:
# Keep only the rows whose 'innings' value is 1.
is_first_innings = df['innings'] == 1
df = df.loc[is_first_innings]

In [4]:
# Renumber the rows from 0 after filtering; drop=True discards the old index
# instead of keeping it as a column.
df = df.reset_index(drop=True)

In [5]:
# Preview up to the first 40 rows of the filtered frame.
df.head(40)

Unnamed: 0,venue_x,innings,batting_team,bowling_team,total_runs_per_inning_match,Total_Overs_Played
0,"Narendra Modi Stadium, Ahmedabad",1,England,New Zealand,282,50.0
1,"Rajiv Gandhi International Stadium, Uppal, Hyd...",1,Pakistan,Netherlands,286,48.833333
2,"Himachal Pradesh Cricket Association Stadium, ...",1,Afghanistan,Bangladesh,156,37.333333
3,"Arun Jaitley Stadium, Delhi",1,South Africa,Sri Lanka,428,50.0
4,"MA Chidambaram Stadium, Chepauk, Chennai",1,Australia,India,199,49.5
5,"Rajiv Gandhi International Stadium, Uppal, Hyd...",1,New Zealand,Netherlands,322,50.0
6,"Himachal Pradesh Cricket Association Stadium, ...",1,England,Bangladesh,364,50.0
7,"Rajiv Gandhi International Stadium, Uppal, Hyd...",1,Sri Lanka,Pakistan,344,50.0
8,"Arun Jaitley Stadium, Delhi",1,Afghanistan,India,272,50.0
9,"MA Chidambaram Stadium, Chepauk, Chennai",1,Bangladesh,New Zealand,245,50.0


In [6]:
# 'innings' is constant (always 1) after the filter above, so remove it.
df = df.drop(columns=['innings'])

In [7]:
# (rows, columns) of the working frame.
df.shape

(32, 5)

In [8]:
# Column labels that remain after dropping 'innings'.
df.columns

Index(['venue_x', 'batting_team', 'bowling_team',
       'total_runs_per_inning_match', 'Total_Overs_Played'],
      dtype='object')

In [9]:
# Storage dtype of each column.
df.dtypes

venue_x                         object
batting_team                    object
bowling_team                    object
total_runs_per_inning_match      int64
Total_Overs_Played             float64
dtype: object

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,total_runs_per_inning_match,Total_Overs_Played
count,32.0,32.0
mean,285.0,48.078125
std,73.028938,4.014308
min,156.0,33.333333
25%,238.0,49.208333
50%,277.5,50.0
75%,347.25,50.0
max,428.0,50.0


In [11]:
# Index range, non-null counts, dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   venue_x                      32 non-null     object 
 1   batting_team                 32 non-null     object 
 2   bowling_team                 32 non-null     object 
 3   total_runs_per_inning_match  32 non-null     int64  
 4   Total_Overs_Played           32 non-null     float64
dtypes: float64(1), int64(1), object(3)
memory usage: 1.4+ KB


In [12]:
# Number of missing values in each column.
df.isnull().sum()

venue_x                        0
batting_team                   0
bowling_team                   0
total_runs_per_inning_match    0
Total_Overs_Played             0
dtype: int64

In [13]:
# Number of distinct values in each column.
df.nunique()

venue_x                        10
batting_team                   10
bowling_team                   10
total_runs_per_inning_match    27
Total_Overs_Played             13
dtype: int64

In [14]:
# Names of the columns stored with dtype 'object'; these are label-encoded later.
cat = [name for name, dtype in df.dtypes.items() if dtype == 'object']
cat

['venue_x', 'batting_team', 'bowling_team']

In [15]:
from sklearn import preprocessing
# A single encoder instance; it is re-fitted for every categorical column in
# the encoding loop, so after the loop it only holds the last column's classes.
le = preprocessing.LabelEncoder()

In [16]:
# Replace each categorical column by integer codes and record, per column,
# which original label received which code.
mapping = {}

for column in cat:
    encoded = le.fit_transform(df[column])
    df[column] = encoded
    codes = le.transform(le.classes_)
    mapping[column] = {label: code for label, code in zip(le.classes_, codes)}

In [17]:
# Print the label -> code table of every encoded column, one entry per line,
# followed by a blank separator line.
for column_name in cat:
    print(f'Mapping for column "{column_name}":')
    entries = ''.join(f'{key}: {value}\n' for key, value in mapping[column_name].items())
    print(entries, end='')
    print()


Mapping for column "venue_x":
Arun Jaitley Stadium, Delhi: 0
Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow: 1
Eden Gardens, Kolkata: 2
Himachal Pradesh Cricket Association Stadium, Dharamsala: 3
M Chinnaswamy Stadium, Bengaluru: 4
MA Chidambaram Stadium, Chepauk, Chennai: 5
Maharashtra Cricket Association Stadium, Pune: 6
Narendra Modi Stadium, Ahmedabad: 7
Rajiv Gandhi International Stadium, Uppal, Hyderabad: 8
Wankhede Stadium, Mumbai: 9

Mapping for column "batting_team":
Afghanistan: 0
Australia: 1
Bangladesh: 2
England: 3
India: 4
Netherlands: 5
New Zealand: 6
Pakistan: 7
South Africa: 8
Sri Lanka: 9

Mapping for column "bowling_team":
Afghanistan: 0
Australia: 1
Bangladesh: 2
England: 3
India: 4
Netherlands: 5
New Zealand: 6
Pakistan: 7
South Africa: 8
Sri Lanka: 9



In [18]:
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Models-------------------------
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor


In [19]:
# Predictors (encoded teams and venue plus overs played) and the target
# (first-innings total).
feature_columns = ['batting_team', 'bowling_team', 'venue_x', 'Total_Overs_Played']
target_column = 'total_runs_per_inning_match'

X = df[feature_columns]
y = df[target_column]

In [20]:
# Shuffled 80/20 hold-out split with a fixed seed so it is reproducible.
split_options = dict(test_size=0.20, shuffle=True, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [21]:
# Learn the standardisation statistics from the training split only, then
# apply the same transform to both splits.
size_scaler = preprocessing.StandardScaler()
size_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    size_scaler.transform(part) for part in (X_train, X_test)
)
X_train_scaled.shape, X_test_scaled.shape

((25, 4), (7, 4))

In [22]:
# Hyper-parameter grids, one per tunable estimator.
random_forest_grid = {
    'n_estimators': list(range(5, 50, 5)),
    'max_depth': list(range(1, 10, 2)),
}
xgboost_grid = {
    'n_estimators': list(range(10, 800, 100)),
    'learning_rate': [0.001, 0.01, 0.1],
}
polynomial_grid = {'polynomialfeatures__degree': [2, 3]}

# Candidate regressors keyed by display name; each entry pairs an estimator
# with the grid that the search will explore for it.
model_dict = {}
model_dict['LinearRegression'] = {"model": LinearRegression(), "params": {}}
model_dict['RandomForestRegressor'] = {
    "model": RandomForestRegressor(random_state=42),
    "params": random_forest_grid,
}
model_dict['XGBRegressor'] = {"model": XGBRegressor(), "params": xgboost_grid}
model_dict['PolynomialFeatures'] = {
    "model": make_pipeline(PolynomialFeatures(), LinearRegression()),
    "params": polynomial_grid,
}


In [23]:
def eval_models(n_jobs=-1):
    """Grid-search every entry of ``model_dict`` and compare train/test errors.

    For each configured estimator a ``GridSearchCV`` is fitted on
    ``X_train_scaled`` / ``y_train``. The refitted best estimator is then
    scored with RMSE and MAE on the training split and on the held-out
    ``X_test_scaled`` / ``y_test`` split.

    Parameters
    ----------
    n_jobs : int, default -1
        Forwarded to ``GridSearchCV``. -1 lets joblib use all available cores
        rather than a fixed worker count.

    Returns
    -------
    model_results : pandas.DataFrame
        One row per model name with the columns ``Train_RMSE``, ``Test_RMSE``,
        ``Train_MAE``, ``Test_MAE`` and ``best_params``.
    best_reg_model_ours : estimator
        The tuned estimator with the lowest test RMSE.

    Raises
    ------
    ValueError
        If no model could be selected (``model_dict`` is empty or no model
        produced a comparable test RMSE).

    Notes
    -----
    NOTE(review): the winner is chosen by its score on the test split, so the
    reported test error of the selected model is optimistic.
    """
    result_columns = ['Train_RMSE', 'Test_RMSE', 'Train_MAE', 'Test_MAE', 'best_params']
    model_names = []
    result_rows = []
    best_test_score = math.inf
    # Stays None when nothing beats the initial score (empty dict / NaN RMSE).
    best_reg_model_ours = None

    for model_name, reg_model in model_dict.items():
        search = GridSearchCV(reg_model['model'], reg_model['params'], n_jobs=n_jobs, verbose=0)
        search.fit(X_train_scaled, y_train)
        best_model = search.best_estimator_

        y_train_predicted = best_model.predict(X_train_scaled)
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_predicted))
        train_mae = mean_absolute_error(y_train, y_train_predicted)

        print(model_name, train_rmse, search.best_params_)

        y_test_predicted = best_model.predict(X_test_scaled)
        test_rmse = np.sqrt(mean_squared_error(y_test, y_test_predicted))
        test_mae = mean_absolute_error(y_test, y_test_predicted)

        if test_rmse < best_test_score:
            best_test_score = test_rmse
            best_reg_model_ours = best_model

        model_names.append(model_name)
        result_rows.append([train_rmse, test_rmse, train_mae, test_mae, search.best_params_])

    if best_reg_model_ours is None:
        raise ValueError("eval_models: no model could be selected; "
                         "model_dict is empty or no test RMSE was comparable")

    # Build the table once instead of growing an empty frame row by row.
    model_results = pd.DataFrame(result_rows, index=model_names, columns=result_columns)

    print("Best model: ", best_reg_model_ours)

    return model_results, best_reg_model_ours

In [24]:
# Tune and score every configured model, then display the comparison table.
model_results,best_reg_model_ours = eval_models()
model_results

LinearRegression 51.29764300330085 {}
RandomForestRegressor 21.535989590179113 {'max_depth': 7, 'n_estimators': 35}
XGBRegressor 58.681108654677516 {'learning_rate': 0.001, 'n_estimators': 310}
PolynomialFeatures 1.3888884180767737e-13 {'polynomialfeatures__degree': 3}
Best model:  RandomForestRegressor(max_depth=7, n_estimators=35, random_state=42)


Unnamed: 0,Train_RMSE,Test_RMSE,Train_MAE,Test_MAE,best_params
LinearRegression,51.297643,64.807838,41.669887,60.286137,{}
RandomForestRegressor,21.53599,63.936788,17.000707,59.965986,"{'max_depth': 7, 'n_estimators': 35}"
XGBRegressor,58.681109,71.254405,49.275499,62.52594,"{'learning_rate': 0.001, 'n_estimators': 310}"
PolynomialFeatures,0.0,411.256903,0.0,268.943187,{'polynomialfeatures__degree': 3}


In [25]:
# Show which estimator (and hyper-parameters) had the lowest test RMSE.
print(best_reg_model_ours)

RandomForestRegressor(max_depth=7, n_estimators=35, random_state=42)


In [26]:
# Predict the held-out split with the selected model; it was fitted on the
# scaled features, so it is given the scaled test features here.
y_predicted = best_reg_model_ours.predict(X_test_scaled)

In [27]:
# Predicted totals for the test rows.
print(y_predicted)

[276.68571429 307.65714286 243.42857143 302.6        309.71428571
 334.19047619 281.48571429]


In [28]:
# Train the estimator that will be persisted on a *copy* of the winning model.
# Fitting best_reg_model_ours itself would overwrite, in place, the model that
# was tuned and evaluated on X_train_scaled, so re-running the prediction cell
# afterwards would silently score a model trained on unscaled features.
# NOTE(review): this fit uses the unscaled X_train, unlike model selection —
# presumably so the saved model accepts raw encoded features; confirm with the
# consumer of the pickle.
classifier = clone(best_reg_model_ours)
classifier.fit(X_train, y_train)

In [29]:
import pickle
# Persist the fitted estimator. The context manager closes the file even if
# pickle.dump raises, so no handle is leaked and the data is flushed.
with open("runs_inning_1.pkl", "wb") as pickle_out:
    pickle.dump(classifier, pickle_out)