In [1]:
#importing required libraries
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
#importing metrics libraries to calculate error, score etc.
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse, accuracy_score, r2_score


In [3]:
# Load the IPL ball-by-ball dataset into a DataFrame and preview the first rows.
# NOTE: the path below is absolute and machine-specific; adjust it to wherever ipl.csv lives.
DATASET_PATH = r'C:\Users\mk744\OneDrive - Poornima University\Documents\IPL_SCORE_PREDICTOR\ipl.csv'
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222


In [4]:
# Listing every column name of the loaded dataset.
df.columns

Index(['mid', 'date', 'venue', 'bat_team', 'bowl_team', 'batsman', 'bowler',
       'runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5', 'striker',
       'non-striker', 'total'],
      dtype='object')

In [5]:
# Checking the datatype of each column (note that 'date' is still a plain object/string here).
df.dtypes

mid                 int64
date               object
venue              object
bat_team           object
bowl_team          object
batsman            object
bowler             object
runs                int64
wickets             int64
overs             float64
runs_last_5         int64
wickets_last_5      int64
striker             int64
non-striker         int64
total               int64
dtype: object

In [6]:
# Count missing values per column (summing the null mask down the rows).
df.isnull().sum(axis=0)

mid               0
date              0
venue             0
bat_team          0
bowl_team         0
batsman           0
bowler            0
runs              0
wickets           0
overs             0
runs_last_5       0
wickets_last_5    0
striker           0
non-striker       0
total             0
dtype: int64

In [7]:
# 'mid' is the match id; list its distinct values.
# It is only an identifier, so it is treated as a useless column for prediction.
df.loc[:, 'mid'].unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 18

In [8]:
# Drop the columns judged to have the least impact on the prediction.
# Note: venue could be a deciding factor, but including it made the error very large,
# so it is dropped as well.
columns_to_remove = [
    'mid',
    'venue',
    'batsman',
    'bowler',
    'striker',
    'non-striker',
]
df = df.drop(columns=columns_to_remove)
df.shape

(76014, 9)

In [9]:
# List the distinct batting teams present in the data.
df.loc[:, 'bat_team'].unique()

array(['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
       'Mumbai Indians', 'Deccan Chargers', 'Kings XI Punjab',
       'Royal Challengers Bangalore', 'Delhi Daredevils',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant'], dtype=object)

In [10]:
# Teams that are currently playing; only rows involving these teams are kept later.
valid_teams = [
    'Kolkata Knight Riders',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Mumbai Indians',
    'Kings XI Punjab',
    'Royal Challengers Bangalore',
    'Delhi Daredevils',
    'Sunrisers Hyderabad',
]

In [11]:
# Keep only rows where both the batting side and the bowling side are valid teams.
batting_is_valid = df['bat_team'].isin(valid_teams)
bowling_is_valid = df['bowl_team'].isin(valid_teams)
df = df[batting_is_valid & bowling_is_valid]
df.shape

(53811, 9)

In [12]:
# Lower limit of overs is 5: discard rows recorded before 5.0 overs.
has_min_overs = df['overs'] >= 5.0
df = df[has_min_overs]
df.shape

(40108, 9)

In [13]:
# Convert the 'date' column from 'YYYY-MM-DD' strings to a datetime64 dtype.
# pd.to_datetime parses the whole column at once instead of calling strptime per row,
# and assign() returns a new frame, so the filtered `df` (a result of boolean indexing
# in the earlier cells) is never written to in place, which can trigger pandas'
# SettingWithCopyWarning. Re-running this cell is also safe: an already-converted
# column passes through to_datetime unchanged.
df = df.assign(date=pd.to_datetime(df['date'], format='%Y-%m-%d'))
df['date'].dtype

dtype('<M8[ns]')

In [14]:
# One-hot encode the categorical team columns; all other columns pass through unchanged.
categorical_columns = ['bat_team', 'bowl_team']
new_df = pd.get_dummies(df, columns=categorical_columns)
new_df.columns

Index(['date', 'runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5',
       'total', 'bat_team_Chennai Super Kings', 'bat_team_Delhi Daredevils',
       'bat_team_Kings XI Punjab', 'bat_team_Kolkata Knight Riders',
       'bat_team_Mumbai Indians', 'bat_team_Rajasthan Royals',
       'bat_team_Royal Challengers Bangalore', 'bat_team_Sunrisers Hyderabad',
       'bowl_team_Chennai Super Kings', 'bowl_team_Delhi Daredevils',
       'bowl_team_Kings XI Punjab', 'bowl_team_Kolkata Knight Riders',
       'bowl_team_Mumbai Indians', 'bowl_team_Rajasthan Royals',
       'bowl_team_Royal Challengers Bangalore',
       'bowl_team_Sunrisers Hyderabad'],
      dtype='object')

In [15]:
# Arrange the columns: date, batting-team dummies, bowling-team dummies,
# then the numeric match-state features, with the target ('total') last.
bat_team_columns = [
    'bat_team_Chennai Super Kings',
    'bat_team_Delhi Daredevils',
    'bat_team_Kings XI Punjab',
    'bat_team_Kolkata Knight Riders',
    'bat_team_Mumbai Indians',
    'bat_team_Rajasthan Royals',
    'bat_team_Royal Challengers Bangalore',
    'bat_team_Sunrisers Hyderabad',
]
bowl_team_columns = [
    'bowl_team_Chennai Super Kings',
    'bowl_team_Delhi Daredevils',
    'bowl_team_Kings XI Punjab',
    'bowl_team_Kolkata Knight Riders',
    'bowl_team_Mumbai Indians',
    'bowl_team_Rajasthan Royals',
    'bowl_team_Royal Challengers Bangalore',
    'bowl_team_Sunrisers Hyderabad',
]
match_state_columns = ['overs', 'runs', 'wickets', 'runs_last_5', 'wickets_last_5', 'total']
ordered_columns = ['date'] + bat_team_columns + bowl_team_columns + match_state_columns
new_df = new_df[ordered_columns]

In [16]:
# Time-based split: matches up to and including 2016 train the model,
# matches from 2017 onwards are held out for testing.
match_year = new_df['date'].dt.year
in_train = match_year <= 2016
in_test = match_year >= 2017

# Features are every column except the target 'total'.
feature_frame = new_df.drop(columns='total')
X_train = feature_frame[in_train]
X_test = feature_frame[in_test]

# Targets as plain numpy arrays.
y_train = new_df.loc[in_train, 'total'].values
y_test = new_df.loc[in_test, 'total'].values

In [17]:
# 'date' was only needed to split the data by year, so remove it from the feature matrices.
# `columns=` replaces the original `axis=True`, which only worked because True == 1.
# Reassigning (instead of inplace=True) avoids mutating frames that were produced by
# boolean indexing, which can raise pandas' SettingWithCopyWarning.
X_train = X_train.drop(columns='date')
X_test = X_test.drop(columns='date')

In [18]:
# Shapes of the train/test features and targets, in that order.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((37330, 21), (2778, 21), (37330,), (2778,))

Performing different regressions

In [19]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split and predict the held-out test split.
linear_regressor = LinearRegression().fit(X_train, y_train)
y_pred_lr = linear_regressor.predict(X_test)

# Compute the error metrics once, then report them.
lr_mae = mae(y_test, y_pred_lr)
lr_mse = mse(y_test, y_pred_lr)
lr_r2 = r2_score(y_test, y_pred_lr)

print("---- Linear Regression - Model Evaluation ----")
print("Mean Absolute Error (MAE): {}".format(lr_mae))
print("Mean Squared Error (MSE): {}".format(lr_mse))
print("Root Mean Squared Error (RMSE): {}".format(np.sqrt(lr_mse)))
print("R2 Score : {}".format(lr_r2))

---- Linear Regression - Model Evaluation ----
Mean Absolute Error (MAE): 12.118617546193311
Mean Squared Error (MSE): 251.00792310417512
Root Mean Squared Error (RMSE): 15.843229566732129
R2 Score : 0.752263356635052


In [20]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees. random_state pins the random sampling done while
# building the trees, so the metrics below are reproducible from run to run.
randomforest_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
randomforest_regressor.fit(X_train, y_train)
y_pred_rf = randomforest_regressor.predict(X_test)

# Report the error metrics on the held-out test split.
# (The heading previously said "Linear Regression", a copy-paste slip.)
print("---- Random Forest Regression - Model Evaluation ----")
print("Mean Absolute Error (MAE): {}".format(mae(y_test, y_pred_rf)))
print("Mean Squared Error (MSE): {}".format(mse(y_test, y_pred_rf)))
print("Root Mean Squared Error (RMSE): {}".format(np.sqrt(mse(y_test, y_pred_rf))))
print("R2 Score : {}".format(r2_score(y_test, y_pred_rf)))

---- Linear Regression - Model Evaluation ----
Mean Absolute Error (MAE): 13.717220362371012
Mean Squared Error (MSE): 327.9980426281197
Root Mean Squared Error (RMSE): 18.110716237303254
R2 Score : 0.6762766166658429


In [21]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree with default hyper-parameters. random_state makes the
# tree construction deterministic, so the metrics below are reproducible.
decisiontree_regressor = DecisionTreeRegressor(random_state=42)
decisiontree_regressor.fit(X_train, y_train)
y_pred_dt = decisiontree_regressor.predict(X_test)

# Report the error metrics on the held-out test split.
# (The heading previously said "Linear Regression", a copy-paste slip.)
print("---- Decision Tree Regression - Model Evaluation ----")
print("Mean Absolute Error (MAE): {}".format(mae(y_test, y_pred_dt)))
print("Mean Squared Error (MSE): {}".format(mse(y_test, y_pred_dt)))
print("Root Mean Squared Error (RMSE): {}".format(np.sqrt(mse(y_test, y_pred_dt))))
print("R2 Score : {}".format(r2_score(y_test, y_pred_dt)))

---- Linear Regression - Model Evaluation ----
Mean Absolute Error (MAE): 17.029877609791217
Mean Squared Error (MSE): 527.5986321094313
Root Mean Squared Error (RMSE): 22.969515278068698
R2 Score : 0.4792773369608646


In [22]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost over 100 boosting rounds, using the linear regressor as the base learner.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases; the positional
# form works with both. random_state makes the boosting resampling reproducible.
adaboost_regressor = AdaBoostRegressor(linear_regressor, n_estimators=100, random_state=42)
adaboost_regressor.fit(X_train, y_train)
y_pred_ab = adaboost_regressor.predict(X_test)

# Report the error metrics on the held-out test split.
# (The heading previously said "Linear Regression", a copy-paste slip.)
print("---- AdaBoost Regression - Model Evaluation ----")
print("Mean Absolute Error (MAE): {}".format(mae(y_test, y_pred_ab)))
print("Mean Squared Error (MSE): {}".format(mse(y_test, y_pred_ab)))
print("Root Mean Squared Error (RMSE): {}".format(np.sqrt(mse(y_test, y_pred_ab))))
print("R2 Score : {}".format(r2_score(y_test, y_pred_ab)))

---- Linear Regression - Model Evaluation ----
Mean Absolute Error (MAE): 12.117466899484336
Mean Squared Error (MSE): 247.26273366236404
Root Mean Squared Error (RMSE): 15.724590095209606
R2 Score : 0.7559597366122491


In [23]:
import pickle

In [24]:
# Persist the fitted linear regression model to disk.
# The `with` block guarantees the file handle is flushed and closed even if
# pickling raises; the original left the handle from open() unclosed.
filename = 'model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(linear_regressor, model_file)