## Data Pre-Preprocessing and cleaning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_csv("Batsmen.csv")
data.head()

Unnamed: 0,Innings Player,Innings Runs Scored,Innings Runs Scored Num,Innings Minutes Batted,Innings Batted Flag,Innings Not Out Flag,Innings Balls Faced,Innings Boundary Fours,Innings Boundary Sixes,Innings Batting Strike Rate,...,Innings Overs Bowled,Innings Bowled Flag,Innings Maidens Bowled,Innings Runs Conceded,Innings Wickets Taken,4 Wickets,5 Wickets,10 Wickets,Innings Wickets Taken Buckets,Innings Economy Rate
0,JJ Roy,180,180,151,1.0,0.0,151,16,5,119.2,...,,,,,,,,,,
1,AD Hales,171,171,166,1.0,0.0,122,22,4,140.16,...,,,,,,,,,,
2,JJ Roy,162,162,172,1.0,0.0,118,13,3,137.28,...,,,,,,,,,,
3,AJ Strauss,158,158,188,1.0,0.0,145,18,1,108.96,...,,,,,,,,,,
4,AJ Strauss,154,154,201,1.0,0.0,140,16,5,110.0,...,,,,,,,,,,


In [2]:
data["Innings Runs Scored Num"].fillna("0",inplace = True)
data["Innings Runs Scored Num"].replace("-","0",inplace = True)
data["Innings Balls Faced"].replace("-","0",inplace = True)
data.fillna(0,inplace = True)
data["Innings Balls Faced"] = data["Innings Balls Faced"].astype(int)
data["Innings Runs Scored Num"] = data["Innings Runs Scored Num"].astype(int)

In [3]:
player_runs = data.groupby("Innings Player")["Innings Runs Scored Num"].sum()
player_innings = data.groupby("Innings Player")['Innings Batted Flag'].sum()
player_notout = data.groupby("Innings Player")['Innings Not Out Flag'].sum()
player_ball_faced  = data.groupby("Innings Player")['Innings Balls Faced'].sum()
player_50  =data.groupby("Innings Player")["50's"].sum()
player_100  =data.groupby("Innings Player")["100's"].sum()

In [4]:
data.head()

Unnamed: 0,Innings Player,Innings Runs Scored,Innings Runs Scored Num,Innings Minutes Batted,Innings Batted Flag,Innings Not Out Flag,Innings Balls Faced,Innings Boundary Fours,Innings Boundary Sixes,Innings Batting Strike Rate,...,Innings Overs Bowled,Innings Bowled Flag,Innings Maidens Bowled,Innings Runs Conceded,Innings Wickets Taken,4 Wickets,5 Wickets,10 Wickets,Innings Wickets Taken Buckets,Innings Economy Rate
0,JJ Roy,180,180,151,1.0,0.0,151,16,5,119.2,...,0,0.0,0,0,0,0.0,0.0,0.0,0,0
1,AD Hales,171,171,166,1.0,0.0,122,22,4,140.16,...,0,0.0,0,0,0,0.0,0.0,0.0,0,0
2,JJ Roy,162,162,172,1.0,0.0,118,13,3,137.28,...,0,0.0,0,0,0,0.0,0.0,0.0,0,0
3,AJ Strauss,158,158,188,1.0,0.0,145,18,1,108.96,...,0,0.0,0,0,0,0.0,0.0,0.0,0,0
4,AJ Strauss,154,154,201,1.0,0.0,140,16,5,110.0,...,0,0.0,0,0,0,0.0,0.0,0.0,0,0


In [5]:
data.columns
to_del = ['Innings Runs Scored','Innings Minutes Batted','Innings Boundary Fours','Innings Boundary Sixes','4 Wickets',
         "5 Wickets",'Innings Date','Innings Runs Scored Buckets','Innings Wickets Taken Buckets']
data.drop(columns=to_del,axis=1,inplace=True)
data.columns

Index(['Innings Player', 'Innings Runs Scored Num', 'Innings Batted Flag',
       'Innings Not Out Flag', 'Innings Balls Faced',
       'Innings Batting Strike Rate', 'Innings Number', 'Opposition', 'Ground',
       'Country', '50's', '100's', 'Innings Overs Bowled',
       'Innings Bowled Flag', 'Innings Maidens Bowled',
       'Innings Runs Conceded', 'Innings Wickets Taken', '10 Wickets',
       'Innings Economy Rate'],
      dtype='object')

In [6]:
d = {"runs": player_runs,
 "innings":player_innings,
     "not_out":player_notout,
     "balls_faced":player_ball_faced,
     "50's":player_50,
     "100's":player_100,
}
df = pd.DataFrame(d)
df.head()

Unnamed: 0_level_0,runs,innings,not_out,balls_faced,50's,100's
Innings Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A Balbirnie,5730,192.0,12.0,7590,27.0,15.0
A Bhandari,0,0.0,0.0,0,0.0,0.0
A Dananjaya,849,81.0,15.0,1233,3.0,0.0
A Flintoff,8961,312.0,45.0,10005,48.0,9.0
A Flower,6147,159.0,15.0,7506,48.0,6.0


In [7]:
data['runs_after'] = data.groupby("Innings Player")["Innings Runs Scored Num"].cumsum()
data['innings_after']= data.groupby("Innings Player")['Innings Batted Flag'].cumsum()
data['not_out_after'] = data.groupby("Innings Player")['Innings Not Out Flag'].cumsum()
data['balls_face_after'] = data.groupby("Innings Player")['Innings Balls Faced'].cumsum()
data['50s_after']  =data.groupby("Innings Player")["50's"].cumsum()
data['100s_after']  =data.groupby("Innings Player")["100's"].cumsum()
data['avg_after'] = data['runs_after'] / (data["innings_after"]- data["not_out_after"])
data["s/r_after"] = (data['runs_after']/data['balls_face_after'])*100

In [9]:
data['runs_before'] = data['runs_after']-data['Innings Runs Scored Num']
data['innings_before']= data['innings_after'] - 1
data['not_out_before'] = data["not_out_after"] -1
data['balls_face_before'] = data["balls_face_after"]-data["Innings Balls Faced"]
data['50s_before']  =data['50s_after'] - 1
data['100s_before']  =data['100s_after'] - 1
data['50s_before'].replace(-1,0,inplace = True)
data['100s_before'].replace(-1,0,inplace = True)
data['not_out_before'].replace(-1,0,inplace = True)
data['avg_before'] = data['runs_before'] / (data["innings_before"]- data["not_out_before"])
data["s/r_before"] = (data['runs_before']/data['balls_face_before'])*100
data['avg_before'].fillna(30,inplace=True)
data["s/r_before"].fillna(90,inplace=True)
data["innings_before"].replace(-1,0,inplace = True)

In [10]:
data.head(50)

Unnamed: 0,Innings Player,Innings Runs Scored Num,Innings Batted Flag,Innings Not Out Flag,Innings Balls Faced,Innings Batting Strike Rate,Innings Number,Opposition,Ground,Country,...,avg_after,s/r_after,runs_before,innings_before,not_out_before,balls_face_before,50s_before,100s_before,avg_before,s/r_before
0,JJ Roy,180,1.0,0.0,151,119.2,2,v Australia,Melbourne,England,...,180.0,119.205298,0,0.0,0.0,0,0.0,0.0,30.0,90.0
1,AD Hales,171,1.0,0.0,122,140.16,1,v Pakistan,Nottingham,England,...,171.0,140.163934,0,0.0,0.0,0,0.0,0.0,30.0,90.0
2,JJ Roy,162,1.0,0.0,118,137.28,2,v Sri Lanka,The Oval,England,...,171.0,127.137546,180,1.0,0.0,151,0.0,1.0,180.0,119.205298
3,AJ Strauss,158,1.0,0.0,145,108.96,2,v India,Bengaluru,England,...,158.0,108.965517,0,0.0,0.0,0,0.0,0.0,30.0,90.0
4,AJ Strauss,154,1.0,0.0,140,110.0,1,v Bangladesh,Birmingham,England,...,156.0,109.473684,158,1.0,0.0,145,0.0,1.0,158.0,108.965517
5,JJ Roy,153,1.0,0.0,121,126.44,1,v Bangladesh,Cardiff,England,...,165.0,126.923077,342,2.0,0.0,269,0.0,2.0,171.0,127.137546
6,AJ Strauss,152,1.0,0.0,128,118.75,1,v Bangladesh,Nottingham,England,...,154.666667,112.348668,312,2.0,0.0,285,0.0,2.0,156.0,109.473684
7,JC Buttler,150,1.0,0.0,77,194.8,1,v West Indies,St George's,England,...,150.0,194.805195,0,0.0,0.0,0,0.0,0.0,30.0,90.0
8,EJG Morgan,148,1.0,0.0,71,208.45,1,v Afghanistan,Manchester,England,...,148.0,208.450704,0,0.0,0.0,0,0.0,0.0,30.0,90.0
9,AD Hales,147,1.0,0.0,92,159.78,1,v Australia,Nottingham,England,...,159.0,148.598131,171,1.0,0.0,122,0.0,1.0,171.0,140.163934


In [11]:
l = []
for i in range(len(data['avg_after'])):
    if data['avg_after'][i] == np.inf:
        l.append(data['runs_after'][i])
    else:
        l.append(data['avg_after'][i])
data['avg_after'] = l
l = []
for i in range(len(data['avg_before'])):
    if data['avg_before'][i] == np.inf:
        l.append(data['runs_before'][i])
    else:
        l.append(data['avg_before'][i])
data['avg_before'] = l
data.head(100)

Unnamed: 0,Innings Player,Innings Runs Scored Num,Innings Batted Flag,Innings Not Out Flag,Innings Balls Faced,Innings Batting Strike Rate,Innings Number,Opposition,Ground,Country,...,avg_after,s/r_after,runs_before,innings_before,not_out_before,balls_face_before,50s_before,100s_before,avg_before,s/r_before
0,JJ Roy,180,1.0,0.0,151,119.20,2,v Australia,Melbourne,England,...,180.000000,119.205298,0,0.0,0.0,0,0.0,0.0,30.000000,90.000000
1,AD Hales,171,1.0,0.0,122,140.16,1,v Pakistan,Nottingham,England,...,171.000000,140.163934,0,0.0,0.0,0,0.0,0.0,30.000000,90.000000
2,JJ Roy,162,1.0,0.0,118,137.28,2,v Sri Lanka,The Oval,England,...,171.000000,127.137546,180,1.0,0.0,151,0.0,1.0,180.000000,119.205298
3,AJ Strauss,158,1.0,0.0,145,108.96,2,v India,Bengaluru,England,...,158.000000,108.965517,0,0.0,0.0,0,0.0,0.0,30.000000,90.000000
4,AJ Strauss,154,1.0,0.0,140,110.00,1,v Bangladesh,Birmingham,England,...,156.000000,109.473684,158,1.0,0.0,145,0.0,1.0,158.000000,108.965517
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,KP Pietersen,104,1.0,0.0,122,85.24,1,v Australia,North Sound,England,...,263.333333,96.459096,686,6.0,3.0,697,0.0,6.0,228.666667,98.421808
96,JE Root,104,1.0,0.0,78,133.33,1,v New Zealand,Birmingham,England,...,177.428571,102.138158,1138,10.0,3.0,1138,0.0,10.0,162.571429,100.000000
97,JM Bairstow,104,1.0,0.0,60,173.33,2,v New Zealand,Christchurch,England,...,138.857143,132.786885,868,7.0,0.0,672,0.0,7.0,124.000000,129.166667
98,EJG Morgan,103,1.0,1.0,85,121.17,2,v Australia,Southampton,England,...,191.000000,115.407855,1043,9.0,3.0,908,0.0,9.0,173.833333,114.867841


Consistency = 0.4262*average + 0.2566*no. of innings + 0.1510*SR + 0.0787*Centuries + 0.0556*Fifties – 0.0328*Zeros <br>
Form = 0.4262*average + 0.2566*no. of innings + 0.1510*SR + 0.0787*Centuries + 0.0556*Fifties – 0.0328*Zeros

In [12]:
#del(data['consistency'])
f = [(0.4262*data['avg_before'][i]+0.1510*data["s/r_before"][i]+0.2566*data["innings_before"][i]+ 0.0556*data['50s_before'][i]+0.0787*data["100s_before"][i]) for i in range(len(data['100s_before']))]
data['form'] = f
data.head()

Unnamed: 0,Innings Player,Innings Runs Scored Num,Innings Batted Flag,Innings Not Out Flag,Innings Balls Faced,Innings Batting Strike Rate,Innings Number,Opposition,Ground,Country,...,s/r_after,runs_before,innings_before,not_out_before,balls_face_before,50s_before,100s_before,avg_before,s/r_before,form
0,JJ Roy,180,1.0,0.0,151,119.2,2,v Australia,Melbourne,England,...,119.205298,0,0.0,0.0,0,0.0,0.0,30.0,90.0,26.376
1,AD Hales,171,1.0,0.0,122,140.16,1,v Pakistan,Nottingham,England,...,140.163934,0,0.0,0.0,0,0.0,0.0,30.0,90.0,26.376
2,JJ Roy,162,1.0,0.0,118,137.28,2,v Sri Lanka,The Oval,England,...,127.137546,180,1.0,0.0,151,0.0,1.0,180.0,119.205298,95.0513
3,AJ Strauss,158,1.0,0.0,145,108.96,2,v India,Bengaluru,England,...,108.965517,0,0.0,0.0,0,0.0,0.0,30.0,90.0,26.376
4,AJ Strauss,154,1.0,0.0,140,110.0,1,v Bangladesh,Birmingham,England,...,109.473684,158,1.0,0.0,145,0.0,1.0,158.0,108.965517,84.128693


In [21]:
#data.to_csv("clean_data.csv",index = False)

In [2]:
data = pd.read_csv("clean_data.csv")
data.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Innings Player,Innings Runs Scored Num,Innings Batted Flag,Innings Not Out Flag,Innings Balls Faced,Innings Batting Strike Rate,Innings Number,Opposition,Ground,Country,...,s/r_after,runs_before,innings_before,not_out_before,balls_face_before,50s_before,100s_before,avg_before,s/r_before,form
0,JJ Roy,180,1.0,0.0,151,119.2,2,v Australia,Melbourne,England,...,119.205298,0,0.0,0.0,0,0.0,0.0,30.0,90.0,26.376
1,AD Hales,171,1.0,0.0,122,140.16,1,v Pakistan,Nottingham,England,...,140.163934,0,0.0,0.0,0,0.0,0.0,30.0,90.0,26.376
2,JJ Roy,162,1.0,0.0,118,137.28,2,v Sri Lanka,The Oval,England,...,127.137546,180,1.0,0.0,151,0.0,1.0,180.0,119.205298,95.0513
3,AJ Strauss,158,1.0,0.0,145,108.96,2,v India,Bengaluru,England,...,108.965517,0,0.0,0.0,0,0.0,0.0,30.0,90.0,26.376
4,AJ Strauss,154,1.0,0.0,140,110.0,1,v Bangladesh,Birmingham,England,...,109.473684,158,1.0,0.0,145,0.0,1.0,158.0,108.965517,84.128693


1 – 49: 1 50 – 99: 2 100 – 124: 3 125 – 149: 4 >=150: 5 

In [13]:
form = []
for i in data["form"]:
    if i < 50 : form.append(1)
    elif 50 <= i <100 : form.append(2)
    elif 100 <= i <125 : form.append(3)
    elif 125 <= i <150 : form.append(4)
    else: form.append(5)
        
batavg = []
for i in data["avg_before"]:
    if i < 10 : batavg.append(1)
    elif 10 <= i <20 : batavg.append(2)
    elif 20 <= i <30 : batavg.append(3)
    elif 30<= i <40 : batavg.append(4)
    else: batavg.append(5)
        
sr = []
for i in data["s/r_before"]:
    if i < 50 : sr.append(1)
    elif 50 <= i <60 : sr.append(2)
    elif 60 <= i <80 : sr.append(3)
    elif 80 <= i <100 : sr.append(4)
    else: sr.append(5)
data["form_cat"] = form
data["avg_cat"] = batavg
data["sr_cat"] = sr

In [4]:
data

Unnamed: 0,Innings Player,Innings Runs Scored Num,Innings Batted Flag,Innings Not Out Flag,Innings Balls Faced,Innings Batting Strike Rate,Innings Number,Opposition,Ground,Country,...,not_out_before,balls_face_before,50s_before,100s_before,avg_before,s/r_before,form,form_cat,avg_cat,sr_cat
0,JJ Roy,180,1.0,0.0,151,119.20,2,v Australia,Melbourne,England,...,0.0,0,0.0,0.0,30.000000,90.000000,26.376000,1,4,4
1,AD Hales,171,1.0,0.0,122,140.16,1,v Pakistan,Nottingham,England,...,0.0,0,0.0,0.0,30.000000,90.000000,26.376000,1,4,4
2,JJ Roy,162,1.0,0.0,118,137.28,2,v Sri Lanka,The Oval,England,...,0.0,151,0.0,1.0,180.000000,119.205298,95.051300,2,5,5
3,AJ Strauss,158,1.0,0.0,145,108.96,2,v India,Bengaluru,England,...,0.0,0,0.0,0.0,30.000000,90.000000,26.376000,1,4,4
4,AJ Strauss,154,1.0,0.0,140,110.00,1,v Bangladesh,Birmingham,England,...,0.0,145,0.0,1.0,158.000000,108.965517,84.128693,2,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327323,Najibullah Zadran,0,0.0,0.0,0,0,2,v West Indies,Lucknow,Afghanistan,...,26.0,5295,32.0,2.0,29.924528,89.858357,75.730046,2,3,4
327324,Rahmat Shah,0,0.0,0.0,0,0,2,v West Indies,Lucknow,Afghanistan,...,5.0,10098,47.0,11.0,35.208955,70.083185,81.927118,2,4,3
327325,Hazratullah Zazai,0,0.0,0.0,0,0,2,v West Indies,Lucknow,Afghanistan,...,0.0,1227,5.0,0.0,23.042553,88.264059,35.486809,1,3,4
327326,Ibrahim Zadran,0,0.0,0.0,0,0,2,v West Indies,Lucknow,Afghanistan,...,0.0,27,0.0,0.0,3.000000,22.222222,5.147356,1,1,1


In [14]:
run_cat = []
for i in data["Innings Runs Scored Num"]:
    if i < 25 : run_cat.append(1)
    elif 25 <= i <50 : run_cat.append(2)
    elif 50 <= i <75 : run_cat.append(3)
    elif 75 <= i <100 : run_cat.append(4)
    else: run_cat.append(5)
data["run_class"] = run_cat

In [15]:
data.head()

Unnamed: 0,Innings Player,Innings Runs Scored Num,Innings Batted Flag,Innings Not Out Flag,Innings Balls Faced,Innings Batting Strike Rate,Innings Number,Opposition,Ground,Country,...,balls_face_before,50s_before,100s_before,avg_before,s/r_before,form,form_cat,avg_cat,sr_cat,run_class
0,JJ Roy,180,1.0,0.0,151,119.2,2,v Australia,Melbourne,England,...,0,0.0,0.0,30.0,90.0,26.376,1,4,4,5
1,AD Hales,171,1.0,0.0,122,140.16,1,v Pakistan,Nottingham,England,...,0,0.0,0.0,30.0,90.0,26.376,1,4,4,5
2,JJ Roy,162,1.0,0.0,118,137.28,2,v Sri Lanka,The Oval,England,...,151,0.0,1.0,180.0,119.205298,95.0513,2,5,5,5
3,AJ Strauss,158,1.0,0.0,145,108.96,2,v India,Bengaluru,England,...,0,0.0,0.0,30.0,90.0,26.376,1,4,4,5
4,AJ Strauss,154,1.0,0.0,140,110.0,1,v Bangladesh,Birmingham,England,...,145,0.0,1.0,158.0,108.965517,84.128693,2,5,5,5


## Training Model

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
data.columns
X = ['sr_cat','avg_cat','form_cat','100s_before','50s_before','not_out_before','innings_before','runs_before']
y = data['run_class']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(data[X], y, test_size=0.2, random_state=32)

In [19]:
from sklearn.naive_bayes import CategoricalNB

In [20]:
X_train.fillna(0,inplace = True)
X_test.fillna(0,inplace = True)
X_train.loc[X_train["innings_before"]<0]
X_test.loc[X_test["innings_before"]<0]
X_train["innings_before"].replace(-1,0,inplace = True)
X_test["innings_before"].replace(-1,0,inplace = True)


## Model for Naive Bayes and RandomForest

In [21]:
gnb = CategoricalNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

In [22]:
from sklearn.metrics import accuracy_score

### Naive Bayes

In [23]:
accuracy_score(y_test, y_pred)

0.8379616900375768

In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
rfc = RandomForestClassifier()
pred = rfc.fit(X_train, y_train).predict(X_test)

### Random Forest 

In [26]:
accuracy_score(y_test, pred)

0.9679986557907921