In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder 

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [46]:
df = pd.read_csv('/content/drive/MyDrive/processed_df.csv',index_col=0)
df.head()

Unnamed: 0,venue,bat_team,bowl_team,runs,wickets,overs,total,kfold
0,Barabati Stadium,Sunrisers Hyderabad,Kolkata Knight Riders,30,0,4.1,126,0
1,Kingsmead,Sunrisers Hyderabad,Mumbai Indians,46,0,5.2,168,0
2,"Punjab Cricket Association Stadium, Mohali",Sunrisers Hyderabad,Punjab Kings,111,2,14.6,175,0
3,New Wanderers Stadium,Kolkata Knight Riders,Delhi Capitals,62,6,12.4,123,0
4,Kingsmead,Delhi Capitals,Sunrisers Hyderabad,81,3,9.6,173,0


In [41]:
def data_encoding( encoding_strategy , encoding_data , encoding_columns ):
    
    if encoding_strategy == "LabelEncoding":
        print("LabelEncoding chosen")
        Encoder = LabelEncoder()
        for column in encoding_columns :
            print("column",column )
            encoding_data[ column ] = Encoder.fit_transform(tuple(encoding_data[ column ]))
        
    elif encoding_strategy == "OneHotEncoding":
        print("OneHotEncoding chosen")
        encoding_data = pd.get_dummies(encoding_data)
        
    dtypes_list =['float64','float32','int64','int32']
    encoding_data.astype( dtypes_list[0] ).dtypes
    
    return encoding_data

In [11]:
cat_cols = ['venue','bat_team','bowl_team']
encoding_strategy = ['LabelEncoding','OneHotEncoding']

encoded_df = data_encoding(encoding_strategy[0], df, cat_cols)  #try labelencoding for linear regression and lasso/ridge regression

LabelEncoding chosen
column venue
column bat_team
column bowl_team


In [12]:
encoded_df.head()

Unnamed: 0,venue,bat_team,bowl_team,runs,wickets,overs,total,kfold
0,0,8,2,30,0,4.1,126,0
1,12,8,3,46,0,5.2,168,0
2,20,8,4,111,2,14.6,175,0
3,16,2,1,62,6,12.4,123,0
4,12,1,8,81,3,9.6,173,0


In [14]:
encoded_df.dtypes

venue          int64
bat_team       int64
bowl_team      int64
runs           int64
wickets        int64
overs        float64
total          int64
kfold          int64
dtype: object

In [21]:
df.kfold.value_counts()

2    14161
4    14150
1    14149
0    14135
3    14111
Name: kfold, dtype: int64

In [36]:
#we need to scale for linear regression and lasso/ridge regression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

**LINEAR REGRESSION**

In [34]:
def run(fold):
    df_train = encoded_df[encoded_df.kfold != fold].reset_index(drop=True)
    df_valid = encoded_df[encoded_df.kfold == fold].reset_index(drop=True)

    x_train = df_train.drop("total", axis=1).values
    y_train = df_train.total.values
 
    x_valid = df_valid.drop("total", axis=1).values
    y_valid = df_valid.total.values

    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_valid = scaler.transform(x_valid)
    #print(x_train)
 
    clf = LinearRegression()
    clf.fit(x_train, y_train)
    preds = clf.predict(x_valid)
 
    error = np.sqrt(mean_squared_error(y_valid, preds))
    print(f"Fold={fold}, error={error}")

In [35]:
run(0)
run(1)
run(2)
run(3)
run(4)

Fold=0, error=20.418292576255045
Fold=1, error=20.40171629461293
Fold=2, error=20.62092332057672
Fold=3, error=20.539321823042624
Fold=4, error=20.41311494133671


**LASSO REGRESSION**

In [37]:
def run(fold):
    df_train = encoded_df[encoded_df.kfold != fold].reset_index(drop=True)
    df_valid = encoded_df[encoded_df.kfold == fold].reset_index(drop=True)

    x_train = df_train.drop("total", axis=1).values
    y_train = df_train.total.values
 
    x_valid = df_valid.drop("total", axis=1).values
    y_valid = df_valid.total.values

    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_valid = scaler.transform(x_valid)
    #print(x_train)
 
    clf = Lasso()
    clf.fit(x_train, y_train)
    preds = clf.predict(x_valid)
 
    error = np.sqrt(mean_squared_error(y_valid, preds))
    print(f"Fold={fold}, error={error}")

In [38]:
run(0)
run(1)
run(2)
run(3)
run(4)

Fold=0, error=21.38926623548069
Fold=1, error=21.442642486277663
Fold=2, error=21.575569700521044
Fold=3, error=21.528302113359874
Fold=4, error=21.36794275220436


In [47]:
#for forest based , try onehot encoding
cat_cols = ['venue','bat_team','bowl_team']
encoding_strategy = ['LabelEncoding','OneHotEncoding']

encoded_df = data_encoding(encoding_strategy[1], df, cat_cols)

OneHotEncoding chosen


In [48]:
encoded_df.head()

Unnamed: 0,runs,wickets,overs,total,kfold,venue_Barabati Stadium,venue_Brabourne Stadium,venue_Buffalo Park,venue_De Beers Diamond Oval,venue_Dr DY Patil Sports Academy,venue_Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium,venue_Dubai International Cricket Stadium,venue_Eden Gardens,venue_Feroz Shah Kotla,venue_Himachal Pradesh Cricket Association Stadium,venue_Holkar Cricket Stadium,venue_JSCA International Stadium Complex,venue_Kingsmead,venue_M Chinnaswamy Stadium,"venue_MA Chidambaram Stadium, Chepauk",venue_Maharashtra Cricket Association Stadium,venue_New Wanderers Stadium,venue_Newlands,venue_OUTsurance Oval,"venue_Punjab Cricket Association IS Bindra Stadium, Mohali","venue_Punjab Cricket Association Stadium, Mohali","venue_Rajiv Gandhi International Stadium, Uppal","venue_Sardar Patel Stadium, Motera",venue_Sawai Mansingh Stadium,venue_Shaheed Veer Narayan Singh International Stadium,venue_Sharjah Cricket Stadium,venue_Sheikh Zayed Stadium,venue_St George's Park,venue_Subrata Roy Sahara Stadium,venue_SuperSport Park,"venue_Vidarbha Cricket Association Stadium, Jamtha",venue_Wankhede Stadium,bat_team_Chennai Super Kings,bat_team_Delhi Capitals,bat_team_Kolkata Knight Riders,bat_team_Mumbai Indians,bat_team_Punjab Kings,bat_team_Rajasthan Royals,bat_team_Rising Pune Supergiants,bat_team_Royal Challengers Bangalore,bat_team_Sunrisers Hyderabad,bowl_team_Chennai Super Kings,bowl_team_Delhi Capitals,bowl_team_Kolkata Knight Riders,bowl_team_Mumbai Indians,bowl_team_Punjab Kings,bowl_team_Rajasthan Royals,bowl_team_Rising Pune Supergiants,bowl_team_Royal Challengers Bangalore,bowl_team_Sunrisers Hyderabad
0,30,0,4.1,126,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0
1,46,0,5.2,168,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
2,111,2,14.6,175,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
3,62,6,12.4,123,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,81,3,9.6,173,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [50]:
#avoid dummy trap by dropping first columns of each encoded column
encoded_df = encoded_df.drop(['venue_Barabati Stadium','bat_team_Chennai Super Kings','bowl_team_Chennai Super Kings'],axis =1)
encoded_df.head()

Unnamed: 0,runs,wickets,overs,total,kfold,venue_Brabourne Stadium,venue_Buffalo Park,venue_De Beers Diamond Oval,venue_Dr DY Patil Sports Academy,venue_Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium,venue_Dubai International Cricket Stadium,venue_Eden Gardens,venue_Feroz Shah Kotla,venue_Himachal Pradesh Cricket Association Stadium,venue_Holkar Cricket Stadium,venue_JSCA International Stadium Complex,venue_Kingsmead,venue_M Chinnaswamy Stadium,"venue_MA Chidambaram Stadium, Chepauk",venue_Maharashtra Cricket Association Stadium,venue_New Wanderers Stadium,venue_Newlands,venue_OUTsurance Oval,"venue_Punjab Cricket Association IS Bindra Stadium, Mohali","venue_Punjab Cricket Association Stadium, Mohali","venue_Rajiv Gandhi International Stadium, Uppal","venue_Sardar Patel Stadium, Motera",venue_Sawai Mansingh Stadium,venue_Shaheed Veer Narayan Singh International Stadium,venue_Sharjah Cricket Stadium,venue_Sheikh Zayed Stadium,venue_St George's Park,venue_Subrata Roy Sahara Stadium,venue_SuperSport Park,"venue_Vidarbha Cricket Association Stadium, Jamtha",venue_Wankhede Stadium,bat_team_Delhi Capitals,bat_team_Kolkata Knight Riders,bat_team_Mumbai Indians,bat_team_Punjab Kings,bat_team_Rajasthan Royals,bat_team_Rising Pune Supergiants,bat_team_Royal Challengers Bangalore,bat_team_Sunrisers Hyderabad,bowl_team_Delhi Capitals,bowl_team_Kolkata Knight Riders,bowl_team_Mumbai Indians,bowl_team_Punjab Kings,bowl_team_Rajasthan Royals,bowl_team_Rising Pune Supergiants,bowl_team_Royal Challengers Bangalore,bowl_team_Sunrisers Hyderabad
0,30,0,4.1,126,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0
1,46,0,5.2,168,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
2,111,2,14.6,175,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
3,62,6,12.4,123,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,81,3,9.6,173,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [51]:
#no need scaling for tree based algorithms
from sklearn import tree
from sklearn import ensemble

**DECISION TREE CLASSIFIER**

In [52]:
def run(fold):
    df_train = encoded_df[encoded_df.kfold != fold].reset_index(drop=True)
    df_valid = encoded_df[encoded_df.kfold == fold].reset_index(drop=True)

    x_train = df_train.drop("total", axis=1).values
    y_train = df_train.total.values
 
    x_valid = df_valid.drop("total", axis=1).values
    y_valid = df_valid.total.values
 
    clf = tree.DecisionTreeRegressor()
    clf.fit(x_train, y_train)
    preds = clf.predict(x_valid)
 
    error = np.sqrt(mean_squared_error(y_valid, preds))
    print(f"Fold={fold}, error={error}")

In [53]:
run(0)
run(1)
run(2)
run(3)
run(4)

Fold=0, error=13.75747666294907
Fold=1, error=14.246283271013292
Fold=2, error=13.406299710991592
Fold=3, error=13.272566034561827
Fold=4, error=13.706636085908148


**RANDOM FOREST**

In [54]:
def run(fold):
    df_train = encoded_df[encoded_df.kfold != fold].reset_index(drop=True)
    df_valid = encoded_df[encoded_df.kfold == fold].reset_index(drop=True)

    x_train = df_train.drop("total", axis=1).values
    y_train = df_train.total.values
 
    x_valid = df_valid.drop("total", axis=1).values
    y_valid = df_valid.total.values
 
    clf = ensemble.RandomForestRegressor()
    clf.fit(x_train, y_train)
    preds = clf.predict(x_valid)
 
    error = np.sqrt(mean_squared_error(y_valid, preds))
    print(f"Fold={fold}, error={error}")

In [55]:
run(0)
run(1)
run(2)
run(3)
run(4)

Fold=0, error=9.86761729750991
Fold=1, error=9.987971919926087
Fold=2, error=10.110568034577348
Fold=3, error=9.97159862157032
Fold=4, error=10.100877984020388


In [None]:
#random forest seems to perform the best, lets take it and hypertune it