In [29]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [30]:
df = pd.read_csv('match_summary.csv',index_col=0)
pd.set_option('display.max_columns', None)

In [31]:
df = df[df['innings'] == 2]

In [32]:
df = df.reset_index(drop=True)

In [33]:
df.head(40)

Unnamed: 0,venue_x,innings,batting_team,bowling_team,total_runs_per_inning_match,Total_Overs_Played
0,"Narendra Modi Stadium, Ahmedabad",2,New Zealand,England,283,36.333333
1,"Rajiv Gandhi International Stadium, Uppal, Hyd...",2,Netherlands,Pakistan,205,41.0
2,"Himachal Pradesh Cricket Association Stadium, ...",2,Bangladesh,Afghanistan,158,34.666667
3,"Arun Jaitley Stadium, Delhi",2,Sri Lanka,South Africa,326,44.833333
4,"MA Chidambaram Stadium, Chepauk, Chennai",2,India,Australia,201,41.333333
5,"Rajiv Gandhi International Stadium, Uppal, Hyd...",2,Netherlands,New Zealand,223,46.5
6,"Himachal Pradesh Cricket Association Stadium, ...",2,Bangladesh,England,227,48.333333
7,"Rajiv Gandhi International Stadium, Uppal, Hyd...",2,Pakistan,Sri Lanka,345,48.333333
8,"Arun Jaitley Stadium, Delhi",2,India,Afghanistan,273,35.0
9,"MA Chidambaram Stadium, Chepauk, Chennai",2,New Zealand,Bangladesh,248,42.833333


In [34]:
df = df.drop(['innings'],axis='columns')

In [35]:
df.shape

(32, 5)

In [36]:
df.columns

Index(['venue_x', 'batting_team', 'bowling_team',
       'total_runs_per_inning_match', 'Total_Overs_Played'],
      dtype='object')

In [37]:
df.dtypes

venue_x                         object
batting_team                    object
bowling_team                    object
total_runs_per_inning_match      int64
Total_Overs_Played             float64
dtype: object

In [38]:
df.describe()

Unnamed: 0,total_runs_per_inning_match,Total_Overs_Played
count,32.0,32.0
mean,225.46875,39.979167
std,66.14158,7.837708
min,90.0,21.0
25%,175.25,34.958333
50%,219.0,41.416667
75%,271.5,46.541667
max,383.0,50.0


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   venue_x                      32 non-null     object 
 1   batting_team                 32 non-null     object 
 2   bowling_team                 32 non-null     object 
 3   total_runs_per_inning_match  32 non-null     int64  
 4   Total_Overs_Played           32 non-null     float64
dtypes: float64(1), int64(1), object(3)
memory usage: 1.4+ KB


In [40]:
df.isnull().sum()

venue_x                        0
batting_team                   0
bowling_team                   0
total_runs_per_inning_match    0
Total_Overs_Played             0
dtype: int64

In [41]:
df.nunique()

venue_x                        10
batting_team                   10
bowling_team                   10
total_runs_per_inning_match    30
Total_Overs_Played             28
dtype: int64

In [42]:
cat = [col for col in df if df[col].dtype == 'object']
cat

['venue_x', 'batting_team', 'bowling_team']

In [43]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [44]:
mapping = {}

for column in cat:
    df[column] = le.fit_transform(df[column])
    mapping[column] = dict(zip(le.classes_, le.transform(le.classes_)))

In [45]:
for column_name in cat:
    print(f'Mapping for column "{column_name}":')
    for key, value in mapping[column_name].items():
        print(f'{key}: {value}')
    print()


Mapping for column "venue_x":
Arun Jaitley Stadium, Delhi: 0
Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow: 1
Eden Gardens, Kolkata: 2
Himachal Pradesh Cricket Association Stadium, Dharamsala: 3
M Chinnaswamy Stadium, Bengaluru: 4
MA Chidambaram Stadium, Chepauk, Chennai: 5
Maharashtra Cricket Association Stadium, Pune: 6
Narendra Modi Stadium, Ahmedabad: 7
Rajiv Gandhi International Stadium, Uppal, Hyderabad: 8
Wankhede Stadium, Mumbai: 9

Mapping for column "batting_team":
Afghanistan: 0
Australia: 1
Bangladesh: 2
England: 3
India: 4
Netherlands: 5
New Zealand: 6
Pakistan: 7
South Africa: 8
Sri Lanka: 9

Mapping for column "bowling_team":
Afghanistan: 0
Australia: 1
Bangladesh: 2
England: 3
India: 4
Netherlands: 5
New Zealand: 6
Pakistan: 7
South Africa: 8
Sri Lanka: 9



In [46]:
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Models-------------------------
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor


In [47]:
X = df[['batting_team','bowling_team','venue_x','Total_Overs_Played']]
y = df['total_runs_per_inning_match']

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, shuffle=True)

In [49]:
print(X_test)

    batting_team  bowling_team  venue_x  Total_Overs_Played
29             0             9        6           45.333333
15             4             2        6           41.500000
24             9             3        4           25.666667
17             9             5        1           48.333333
8              4             0        0           35.000000
9              6             2        5           42.833333
30             7             2        2           32.500000


In [50]:
size_scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = size_scaler.transform(X_train)
X_test_scaled = size_scaler.transform(X_test)
X_train_scaled.shape, X_test_scaled.shape

((25, 4), (7, 4))

In [51]:
model_dict = {
    'LinearRegression': {"model": LinearRegression(), "params": {}},
    'RandomForestRegressor': {"model": RandomForestRegressor(random_state=42),
                     "params": {'n_estimators': list(range(5, 50, 5)), 'max_depth': list(range(1, 10, 2))}},
    # 'XGBRegressor': {"model": XGBRegressor(), "params": {'n_estimators': list(range(10, 800, 100)), 'learning_rate': [0.001, 0.01, 0.1]}},
    'PolynomialFeatures': {"model": make_pipeline(PolynomialFeatures(), LinearRegression()),
                      "params": {'polynomialfeatures__degree': [2,3]}}
}


In [52]:
def eval_models():
    model_results = pd.DataFrame()
    model_results['Train_RMSE'] = None
    model_results['Test_RMSE'] = None
    model_results['Train_MAE'] = None
    model_results['Test_MAE'] = None
    model_results['best_params'] = None
    best_test_score = math.inf

    for model_name, reg_model in model_dict.items():
        classifier = GridSearchCV(reg_model['model'], reg_model['params'], n_jobs=20, verbose=0)
        classifier.fit(X_train_scaled, y_train)
        best_model = classifier.best_estimator_

        y_train_predicted = best_model.predict(X_train_scaled)
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_predicted))
        train_mae = mean_absolute_error(y_train, y_train_predicted)

        print(model_name, train_rmse, classifier.best_params_)

        y_predicted = best_model.predict(X_test_scaled)
        test_rmse = np.sqrt(mean_squared_error(y_test, y_predicted))
        test_mae = mean_absolute_error(y_test, y_predicted)

        if test_rmse < best_test_score:
            best_test_score = test_rmse
            best_reg_model_ours = best_model

        model_results.loc[model_name, ['Train_RMSE', 'Test_RMSE', 'Train_MAE', 'Test_MAE', 'best_params']] = [train_rmse, test_rmse, train_mae, test_mae, classifier.best_params_]

    print("Best model: ", best_reg_model_ours)
    y_predicted = best_reg_model_ours.predict(X_test_scaled)

    return model_results,best_reg_model_ours

In [53]:
model_results,best_reg_model_ours = eval_models()
model_results

LinearRegression 42.52287772975356 {}
RandomForestRegressor 19.47416041012864 {'max_depth': 5, 'n_estimators': 40}
PolynomialFeatures 1.1211274638365023e-13 {'polynomialfeatures__degree': 3}
Best model:  LinearRegression()


Unnamed: 0,Train_RMSE,Test_RMSE,Train_MAE,Test_MAE,best_params
LinearRegression,42.522878,43.172494,36.836441,32.96287,{}
RandomForestRegressor,19.47416,53.016311,16.006781,38.032143,"{'max_depth': 5, 'n_estimators': 40}"
PolynomialFeatures,0.0,70.683933,0.0,61.632682,{'polynomialfeatures__degree': 3}


In [54]:
print(best_reg_model_ours)

LinearRegression()


In [55]:
y_predicted = best_reg_model_ours.predict(X_test_scaled)

In [56]:
print(y_predicted)

[217.77802516 232.2411935  182.18176531 307.78128678 178.08494556
 256.61450782 197.73330857]
