## Cricket Batting Performance Prediction (One Day Internationals)

In [22]:
from IPython.display import Image
Image(url='https://bhopali2much.files.wordpress.com/2017/05/5f00f-virat-kohli-jab-six-vs-england-1st-odi-2016-video.gif')

In [1]:
# Libraries

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

### Importing Data

In [3]:
dataset=pd.read_csv("ODI.csv")
dataset.head(5)

Unnamed: 0,Innings Player,Innings Runs Scored,Innings Runs Scored Num,Innings Minutes Batted,Innings Batted Flag,Innings Not Out Flag,Innings Balls Faced,Innings Boundary Fours,Innings Boundary Sixes,Innings Batting Strike Rate,...,Innings Overs Bowled,Innings Bowled Flag,Innings Maidens Bowled,Innings Runs Conceded,Innings Wickets Taken,4 Wickets,5 Wickets,10 Wickets,Innings Wickets Taken Buckets,Innings Economy Rate
0,JJ Roy,180,180,151,1.0,0.0,151,16,5,119.2,...,,,,,,,,,,
1,AD Hales,171,171,166,1.0,0.0,122,22,4,140.16,...,,,,,,,,,,
2,JJ Roy,162,162,172,1.0,0.0,118,13,3,137.28,...,,,,,,,,,,
3,AJ Strauss,158,158,188,1.0,0.0,145,18,1,108.96,...,,,,,,,,,,
4,AJ Strauss,154,154,201,1.0,0.0,140,16,5,110.0,...,,,,,,,,,,


# Data Cleaning

### Excluding the bowling columns
Here the bowling columns are eliminated as we are predicting performance of batsman only

In [4]:
dataset=dataset.iloc[:,0:18]
dataset.head(5)

Unnamed: 0,Innings Player,Innings Runs Scored,Innings Runs Scored Num,Innings Minutes Batted,Innings Batted Flag,Innings Not Out Flag,Innings Balls Faced,Innings Boundary Fours,Innings Boundary Sixes,Innings Batting Strike Rate,Innings Number,Opposition,Ground,Innings Date,Country,50s,100s,Innings Runs Scored Buckets
0,JJ Roy,180,180,151,1.0,0.0,151,16,5,119.2,2,Australia,Melbourne,14-01-2018,England,0.0,1.0,150-199
1,AD Hales,171,171,166,1.0,0.0,122,22,4,140.16,1,Pakistan,Nottingham,30-08-2016,England,0.0,1.0,150-199
2,JJ Roy,162,162,172,1.0,0.0,118,13,3,137.28,2,Sri Lanka,The Oval,29-06-2016,England,0.0,1.0,150-199
3,AJ Strauss,158,158,188,1.0,0.0,145,18,1,108.96,2,India,Bengaluru,27-02-2011,England,0.0,1.0,150-199
4,AJ Strauss,154,154,201,1.0,0.0,140,16,5,110.0,1,Bangladesh,Birmingham,12-07-2010,England,0.0,1.0,150-199


## Dropping duplicates and null values

In [5]:
dataset=dataset.drop_duplicates()
dataset=dataset.dropna()
dataset.head(5)

Unnamed: 0,Innings Player,Innings Runs Scored,Innings Runs Scored Num,Innings Minutes Batted,Innings Batted Flag,Innings Not Out Flag,Innings Balls Faced,Innings Boundary Fours,Innings Boundary Sixes,Innings Batting Strike Rate,Innings Number,Opposition,Ground,Innings Date,Country,50s,100s,Innings Runs Scored Buckets
0,JJ Roy,180,180,151,1.0,0.0,151,16,5,119.2,2,Australia,Melbourne,14-01-2018,England,0.0,1.0,150-199
1,AD Hales,171,171,166,1.0,0.0,122,22,4,140.16,1,Pakistan,Nottingham,30-08-2016,England,0.0,1.0,150-199
2,JJ Roy,162,162,172,1.0,0.0,118,13,3,137.28,2,Sri Lanka,The Oval,29-06-2016,England,0.0,1.0,150-199
3,AJ Strauss,158,158,188,1.0,0.0,145,18,1,108.96,2,India,Bengaluru,27-02-2011,England,0.0,1.0,150-199
4,AJ Strauss,154,154,201,1.0,0.0,140,16,5,110.0,1,Bangladesh,Birmingham,12-07-2010,England,0.0,1.0,150-199


### There are some special characters in the dataset, replacing those to zero

In [6]:
dataset=dataset.replace('-', 0)

# Data Pre-processing

### The datatype is object for all the columns, converting it to integer

In [7]:
dataset.dtypes

Innings Player                  object
Innings Runs Scored             object
Innings Runs Scored Num         object
Innings Minutes Batted          object
Innings Batted Flag            float64
Innings Not Out Flag           float64
Innings Balls Faced             object
Innings Boundary Fours          object
Innings Boundary Sixes          object
Innings Batting Strike Rate     object
Innings Number                  object
Opposition                      object
Ground                          object
Innings Date                    object
Country                         object
50s                            float64
100s                           float64
Innings Runs Scored Buckets     object
dtype: object

In [8]:
# pd.to_numeric( )

dataset['Innings Runs Scored Num'] = pd.to_numeric(dataset['Innings Runs Scored Num']) 

dataset['Innings Minutes Batted']= pd.to_numeric(dataset['Innings Minutes Batted']) 
dataset['Innings Balls Faced']= pd.to_numeric(dataset['Innings Balls Faced']) 

dataset['Innings Boundary Fours']= pd.to_numeric(dataset['Innings Boundary Fours']) 

dataset['Innings Boundary Sixes']= pd.to_numeric(dataset['Innings Boundary Sixes']) 


dataset['Innings Batting Strike Rate']= pd.to_numeric(dataset['Innings Batting Strike Rate'])


dataset['Innings Number']= pd.to_numeric(dataset['Innings Number'])


dataset['50s']= pd.to_numeric(dataset['50s'])

#converting Innings date from object to date
dataset['Innings Date'] = pd.to_datetime(dataset['Innings Date'])

#### Extracting year from date column. The year column becomes important to see how player performs year wise

In [9]:
dataset['year'] = pd.DatetimeIndex(dataset['Innings Date']).year
dataset.head()

Unnamed: 0,Innings Player,Innings Runs Scored,Innings Runs Scored Num,Innings Minutes Batted,Innings Batted Flag,Innings Not Out Flag,Innings Balls Faced,Innings Boundary Fours,Innings Boundary Sixes,Innings Batting Strike Rate,Innings Number,Opposition,Ground,Innings Date,Country,50s,100s,Innings Runs Scored Buckets,year
0,JJ Roy,180,180,151,1.0,0.0,151,16,5,119.2,2,Australia,Melbourne,2018-01-14,England,0.0,1.0,150-199,2018
1,AD Hales,171,171,166,1.0,0.0,122,22,4,140.16,1,Pakistan,Nottingham,2016-08-30,England,0.0,1.0,150-199,2016
2,JJ Roy,162,162,172,1.0,0.0,118,13,3,137.28,2,Sri Lanka,The Oval,2016-06-29,England,0.0,1.0,150-199,2016
3,AJ Strauss,158,158,188,1.0,0.0,145,18,1,108.96,2,India,Bengaluru,2011-02-27,England,0.0,1.0,150-199,2011
4,AJ Strauss,154,154,201,1.0,0.0,140,16,5,110.0,1,Bangladesh,Birmingham,2010-12-07,England,0.0,1.0,150-199,2010


### Extracting country name on the basis of grounds Example: Melbourne(M.C.G) is in Australia so the  country_played will be Australia

In [10]:
# CountryWise


cities=pd.read_csv("worldcities.csv")
cities=cities.iloc[:,0:2]
cities=cities.dropna()
cities.head(5)


Unnamed: 0,Ground,country
0,Melbourne,Australia
2,Kennington,England
4,Sydney,Australia
6,Manchester,England
8,St. John's Wood,England


In [11]:
# merging cities dataframe with dataset
mergedDf = dataset.merge(cities, on='Ground')
dataset=mergedDf.drop_duplicates()
dataset = dataset.rename(columns={'country': 'Country_Played'})
dataset.head(5)

Unnamed: 0,Innings Player,Innings Runs Scored,Innings Runs Scored Num,Innings Minutes Batted,Innings Batted Flag,Innings Not Out Flag,Innings Balls Faced,Innings Boundary Fours,Innings Boundary Sixes,Innings Batting Strike Rate,Innings Number,Opposition,Ground,Innings Date,Country,50s,100s,Innings Runs Scored Buckets,year,Country_Played
0,JJ Roy,180,180,151,1.0,0.0,151,16,5,119.2,2,Australia,Melbourne,2018-01-14,England,0.0,1.0,150-199,2018,Australia
1,PD Collingwood,120*,120,222,1.0,1.0,133,7,1,90.22,2,Australia,Melbourne,2007-09-02,England,0.0,1.0,100-149,2007,Australia
2,JWA Taylor,98*,98,126,1.0,1.0,90,11,2,108.88,2,Australia,Melbourne,2015-02-14,England,1.0,0.0,50-99,2015,Australia
3,JE Root,91*,91,186,1.0,1.0,110,5,0,82.72,2,Australia,Melbourne,2018-01-14,England,1.0,0.0,50-99,2018,Australia
4,KP Pietersen,82,82,143,1.0,0.0,91,4,3,90.1,1,Australia,Melbourne,2007-12-01,England,1.0,0.0,50-99,2007,Australia


In [16]:
#Continent played in
dataset['continent'] = np.where( (dataset['Country_Played']=='India') |\
                                (dataset['Country_Played']=='Sri Lanka')|\
                                (dataset['Country_Played']=='United Arab Emirates')|\
                                (dataset['Country_Played']=='Bangladesh')| \
                                (dataset['Country_Played']=='Pakistan'),\
                                'Asia', 'Outside Asia')
dataset.head(2)

Unnamed: 0,Innings Player,Innings Runs Scored,Innings Runs Scored Num,Innings Minutes Batted,Innings Batted Flag,Innings Not Out Flag,Innings Balls Faced,Innings Boundary Fours,Innings Boundary Sixes,Innings Batting Strike Rate,...,Opposition,Ground,Innings Date,Country,50s,100s,Innings Runs Scored Buckets,year,Country_Played,continent
0,JJ Roy,180,180,151,1.0,0.0,151,16,5,119.2,...,Australia,Melbourne,2018-01-14,England,0.0,1.0,150-199,2018,Australia,Outside Asia
1,PD Collingwood,120*,120,222,1.0,1.0,133,7,1,90.22,...,Australia,Melbourne,2007-09-02,England,0.0,1.0,100-149,2007,Australia,Outside Asia


## Writing to excel file

In [19]:
from pandas import ExcelWriter

writer = ExcelWriter('CLeaned_data.xlsx')
dataset.to_excel(writer,'Sheet1')
writer.save()