In [59]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [60]:
# Never truncate columns when a frame is displayed.
pd.set_option('display.max_columns', None)

# The first CSV column is read as the row index.
df = pd.read_csv('match_summary.csv', index_col=0)

In [61]:
# Swap the index read from the CSV for a fresh 0..n-1 RangeIndex;
# drop=True discards the old index instead of keeping it as a column.
df = df.reset_index(drop=True)

In [62]:
# Preview the first rows (up to 40) of the loaded match summary.
df.head(40)

Unnamed: 0,venue_x,innings,batting_team,bowling_team,total_runs_per_inning_match,Total_Overs_Played
0,"Narendra Modi Stadium, Ahmedabad",1,England,New Zealand,282,50.0
1,"Narendra Modi Stadium, Ahmedabad",2,New Zealand,England,283,36.333333
2,"Rajiv Gandhi International Stadium, Uppal, Hyd...",1,Pakistan,Netherlands,286,48.833333
3,"Rajiv Gandhi International Stadium, Uppal, Hyd...",2,Netherlands,Pakistan,205,41.0
4,"Himachal Pradesh Cricket Association Stadium, ...",1,Afghanistan,Bangladesh,156,37.333333
5,"Himachal Pradesh Cricket Association Stadium, ...",2,Bangladesh,Afghanistan,158,34.666667
6,"Arun Jaitley Stadium, Delhi",1,South Africa,Sri Lanka,428,50.0
7,"Arun Jaitley Stadium, Delhi",2,Sri Lanka,South Africa,326,44.833333
8,"MA Chidambaram Stadium, Chepauk, Chennai",1,Australia,India,199,49.5
9,"MA Chidambaram Stadium, Chepauk, Chennai",2,India,Australia,201,41.333333


In [63]:
# (rows, columns) of the frame.
df.shape

(64, 6)

In [64]:
# Column labels available in the frame.
df.columns

Index(['venue_x', 'innings', 'batting_team', 'bowling_team',
       'total_runs_per_inning_match', 'Total_Overs_Played'],
      dtype='object')

In [65]:
# Storage dtype of each column.
df.dtypes

venue_x                         object
innings                          int64
batting_team                    object
bowling_team                    object
total_runs_per_inning_match      int64
Total_Overs_Played             float64
dtype: object

In [66]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,innings,total_runs_per_inning_match,Total_Overs_Played
count,64.0,64.0,64.0
mean,1.5,255.234375,44.028646
std,0.503953,75.34571,7.403736
min,1.0,90.0,21.0
25%,1.0,204.75,40.958333
50%,1.5,252.0,47.0
75%,2.0,286.5,50.0
max,2.0,428.0,50.0


In [67]:
# Per-column dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 6 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   venue_x                      64 non-null     object 
 1   innings                      64 non-null     int64  
 2   batting_team                 64 non-null     object 
 3   bowling_team                 64 non-null     object 
 4   total_runs_per_inning_match  64 non-null     int64  
 5   Total_Overs_Played           64 non-null     float64
dtypes: float64(1), int64(2), object(3)
memory usage: 3.1+ KB


In [68]:
# Number of missing values in each column.
df.isnull().sum()

venue_x                        0
innings                        0
batting_team                   0
bowling_team                   0
total_runs_per_inning_match    0
Total_Overs_Played             0
dtype: int64

In [69]:
# Number of distinct values in each column.
df.nunique()

venue_x                        10
innings                         2
batting_team                   10
bowling_team                   10
total_runs_per_inning_match    55
Total_Overs_Played             38
dtype: int64

In [70]:
# Names of the object-dtype (string) columns, in frame order; these are the
# columns that get integer-encoded before modelling.
cat = [name for name, col_dtype in df.dtypes.items() if col_dtype == 'object']
cat

['venue_x', 'batting_team', 'bowling_team']

In [71]:
from sklearn import preprocessing
# LabelEncoder maps each distinct label to an integer code.
le = preprocessing.LabelEncoder()

In [72]:
# Integer-encode every categorical column in place.
#
# mapping[column]  -> {original label: integer code}, for display/lookup.
# encoders[column] -> the fitted LabelEncoder for that column, kept so the
#                     codes can later be inverted or applied to new inputs.
#
# A fresh encoder is created per column: refitting one shared instance would
# overwrite the classes learned for the previous column.
mapping = {}
encoders = {}

for column in cat:
    le = preprocessing.LabelEncoder()
    df[column] = le.fit_transform(df[column])
    encoders[column] = le
    # classes_ is sorted and the code of a label is its position in classes_.
    mapping[column] = {label: int(code) for code, label in enumerate(le.classes_)}

In [73]:
# Print the label -> code table of every encoded column, one blank line
# after each table.
for column_name in cat:
    report = [f'Mapping for column "{column_name}":']
    report.extend(f'{key}: {value}' for key, value in mapping[column_name].items())
    report.append('')
    print('\n'.join(report))


Mapping for column "venue_x":
Arun Jaitley Stadium, Delhi: 0
Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow: 1
Eden Gardens, Kolkata: 2
Himachal Pradesh Cricket Association Stadium, Dharamsala: 3
M Chinnaswamy Stadium, Bengaluru: 4
MA Chidambaram Stadium, Chepauk, Chennai: 5
Maharashtra Cricket Association Stadium, Pune: 6
Narendra Modi Stadium, Ahmedabad: 7
Rajiv Gandhi International Stadium, Uppal, Hyderabad: 8
Wankhede Stadium, Mumbai: 9

Mapping for column "batting_team":
Afghanistan: 0
Australia: 1
Bangladesh: 2
England: 3
India: 4
Netherlands: 5
New Zealand: 6
Pakistan: 7
South Africa: 8
Sri Lanka: 9

Mapping for column "bowling_team":
Afghanistan: 0
Australia: 1
Bangladesh: 2
England: 3
India: 4
Netherlands: 5
New Zealand: 6
Pakistan: 7
South Africa: 8
Sri Lanka: 9



In [74]:
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Models-------------------------
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor


In [75]:
# Predictor columns (encoded teams and venue, innings total, innings number)
# and the regression target (overs played in the innings).
feature_cols = [
    'batting_team',
    'bowling_team',
    'venue_x',
    'total_runs_per_inning_match',
    'innings',
]
target_col = 'Total_Overs_Played'

X = df[feature_cols]
y = df[target_col]

In [76]:
# Hold out 20% of the rows for testing; the fixed seed makes the shuffled
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    shuffle=True,
    random_state=42,
)

In [77]:
# Plain-text dump of the held-out feature rows.
print(X_test)

    batting_team  bowling_team  venue_x  total_runs_per_inning_match  innings
52             1             6        3                          388        1
58             9             0        6                          241        1
0              3             6        7                          282        1
44             8             2        9                          382        1
5              2             0        3                          158        2
36             8             3        9                          399        1
16             0             4        0                          272        1
12             3             2        3                          364        1
25             1             9        1                          215        2
61             7             2        2                          205        2
56             4             3        1                          229        1
9              4             1        5                         

In [78]:
# Learn the standardisation statistics from the training rows only, then
# apply the same transform to the held-out rows.
size_scaler = preprocessing.StandardScaler()
X_train_scaled = size_scaler.fit_transform(X_train)
X_test_scaled = size_scaler.transform(X_test)

# Sanity check: shapes of the scaled train / test matrices.
(X_train_scaled.shape, X_test_scaled.shape)

((51, 5), (13, 5))

In [79]:
# Hyper-parameter grids searched for the tunable candidates.
rf_grid = {
    'n_estimators': list(range(5, 50, 5)),
    'max_depth': list(range(1, 10, 2)),
}
poly_grid = {'polynomialfeatures__degree': [2, 3]}

# Candidate regressors: name -> estimator ("model") and its search grid ("params").
model_dict = {
    'LinearRegression': {
        "model": LinearRegression(),
        "params": {},
    },
    'RandomForestRegressor': {
        "model": RandomForestRegressor(random_state=42),
        "params": rf_grid,
    },
    # Currently disabled candidate:
    # 'XGBRegressor': {"model": XGBRegressor(), "params": {'n_estimators': list(range(10, 800, 100)), 'learning_rate': [0.001, 0.01, 0.1]}},
    'PolynomialFeatures': {
        "model": make_pipeline(PolynomialFeatures(), LinearRegression()),
        "params": poly_grid,
    },
}


In [80]:
def eval_models(n_jobs=-1):
    """Grid-search every candidate in ``model_dict`` and compare train/test errors.

    Reads the notebook-level names ``model_dict``, ``X_train_scaled``,
    ``y_train``, ``X_test_scaled`` and ``y_test``. For each candidate a
    ``GridSearchCV`` is fitted on the scaled training data, and the refitted
    best estimator is scored on both the training and the test split.

    Parameters
    ----------
    n_jobs : int, default -1
        Number of parallel workers handed to ``GridSearchCV``
        (-1 uses all available cores).

    Returns
    -------
    model_results : pandas.DataFrame
        One row per candidate (indexed by its name) with the columns
        ``Train_RMSE``, ``Test_RMSE``, ``Train_MAE``, ``Test_MAE`` and
        ``best_params``.
    best_reg_model_ours : estimator
        The fitted best estimator of the candidate with the lowest test RMSE.

    Raises
    ------
    ValueError
        If ``model_dict`` is empty or no candidate yields a comparable
        (non-NaN) test RMSE.

    Notes
    -----
    NOTE(review): the winner is chosen by test-set RMSE, so the test error
    reported for it is optimistically biased. Consider selecting on the
    cross-validated score (``GridSearchCV.best_score_``) instead.
    """
    result_columns = ['Train_RMSE', 'Test_RMSE', 'Train_MAE', 'Test_MAE', 'best_params']
    model_names = []
    rows = []

    best_test_score = math.inf
    best_reg_model_ours = None

    for model_name, reg_model in model_dict.items():
        search = GridSearchCV(reg_model['model'], reg_model['params'], n_jobs=n_jobs, verbose=0)
        search.fit(X_train_scaled, y_train)
        best_model = search.best_estimator_

        # Error on the data the model was fitted on.
        y_train_predicted = best_model.predict(X_train_scaled)
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_predicted))
        train_mae = mean_absolute_error(y_train, y_train_predicted)

        print(model_name, train_rmse, search.best_params_)

        # Error on the held-out split.
        y_test_predicted = best_model.predict(X_test_scaled)
        test_rmse = np.sqrt(mean_squared_error(y_test, y_test_predicted))
        test_mae = mean_absolute_error(y_test, y_test_predicted)

        # Strict '<' keeps the earliest candidate on ties.
        if test_rmse < best_test_score:
            best_test_score = test_rmse
            best_reg_model_ours = best_model

        model_names.append(model_name)
        rows.append([train_rmse, test_rmse, train_mae, test_mae, search.best_params_])

    if best_reg_model_ours is None:
        raise ValueError("eval_models: model_dict is empty or no model produced a valid test RMSE")

    # Build the table once instead of growing it row by row.
    model_results = pd.DataFrame(rows, index=model_names, columns=result_columns)

    print("Best model: ", best_reg_model_ours)

    return model_results, best_reg_model_ours

In [81]:
# Run the comparison; keep both the metrics table and the selected fitted model.
model_results,best_reg_model_ours = eval_models()
# Display the per-model metrics table.
model_results

LinearRegression 4.896128197935495 {}
RandomForestRegressor 2.9471541416381206 {'max_depth': 3, 'n_estimators': 35}
PolynomialFeatures 3.323795616983923 {'polynomialfeatures__degree': 2}
Best model:  LinearRegression()


Unnamed: 0,Train_RMSE,Test_RMSE,Train_MAE,Test_MAE,best_params
LinearRegression,4.896128,4.332698,3.761022,3.945144,{}
RandomForestRegressor,2.947154,4.530194,2.281807,3.149911,"{'max_depth': 3, 'n_estimators': 35}"
PolynomialFeatures,3.323796,4.365824,2.577271,3.288372,{'polynomialfeatures__degree': 2}


In [82]:
# Show which estimator was selected.
print(best_reg_model_ours)

LinearRegression()


In [83]:
# Predict the target for the scaled held-out features with the selected model.
y_predicted = best_reg_model_ours.predict(X_test_scaled)

In [84]:
# Plain-text dump of the predictions for the held-out rows.
print(y_predicted)

[56.33571365 43.84282807 48.68256255 53.7655297  36.58846208 54.9029428
 48.78272655 54.04994474 40.64586993 37.9700972  44.55657181 38.85899629
 47.24799897]


In [85]:
# Bundle the scaler and the selected model into a single pipeline, fitted on
# the raw (label-encoded, unscaled) training features. The exported object
# therefore accepts raw feature rows and standardises them itself, matching
# the preprocessing used during model selection.
#
# Fitting the selected estimator directly on the unscaled X_train would refit
# best_reg_model_ours in place and leave it inconsistent with X_test_scaled.
# Here the pipeline refits it on standardised training data instead.
classifier = make_pipeline(preprocessing.StandardScaler(), best_reg_model_ours)
classifier.fit(X_train, y_train)

In [86]:
import pickle

# Serialise the fitted model to disk. The context manager guarantees the file
# is closed even if pickling raises.
with open("overs.pkl", "wb") as pickle_out:
    pickle.dump(classifier, pickle_out)