In [2]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load MS Dhoni's ODI batting record (one row per match) from the local CSV.
csv_path = './data_pd/MS_Dhoni_ODI_record.csv'
df = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows to check the columns and the raw value formats.
df.head()

Unnamed: 0,score,runs_scored,balls_faced,strike_rate,fours,sixes,opposition,ground,date,odi_number
0,0,0,1,0.0,0,0,v Bangladesh,Chattogram,23/12/2004,ODI # 2199
1,12,12,11,109.09,2,0,v Bangladesh,Dhaka,26/12/2004,ODI # 2201
2,7*,7,2,350.0,0,1,v Bangladesh,Dhaka,27/12/2004,ODI # 2202
3,3,3,7,42.85,0,0,v Pakistan,Kochi,02/04/2005,ODI # 2235
4,148,148,123,120.32,15,4,v Pakistan,Visakhapatnam,05/04/2005,ODI # 2236


In [5]:
# Data cleaning: opposition names are prefixed with 'v ' (e.g. 'v Pakistan').
# Strip only that leading prefix. A literal replace of every 'v' would leave a
# leading space behind (' Pakistan') and would also delete any 'v' that occurs
# inside a team name.
# The pattern is anchored with '^', so re-running this cell on already-cleaned
# names changes nothing.
df['opposition'] = df['opposition'].str.replace(r'^v\s+', '', regex=True)
df['opposition']

0        Bangladesh
1        Bangladesh
2        Bangladesh
3          Pakistan
4          Pakistan
           ...     
345     West Indies
346         England
347      Bangladesh
348       Sri Lanka
349     New Zealand
Name: opposition, Length: 350, dtype: object

In [6]:
# Feature engineering: derive a 'year' column from the match date.
# The raw dates are day-first strings (e.g. 23/12/2004), so parse them into
# datetime values first and then read the year off the parsed column.
match_dates = pd.to_datetime(df['date'], dayfirst=True)
df['date'] = match_dates
df['year'] = match_dates.dt.year.astype(int)
df

Unnamed: 0,score,runs_scored,balls_faced,strike_rate,fours,sixes,opposition,ground,date,odi_number,year
0,0,0,1,0,0,0,Bangladesh,Chattogram,2004-12-23,ODI # 2199,2004
1,12,12,11,109.09,2,0,Bangladesh,Dhaka,2004-12-26,ODI # 2201,2004
2,7*,7,2,350,0,1,Bangladesh,Dhaka,2004-12-27,ODI # 2202,2004
3,3,3,7,42.85,0,0,Pakistan,Kochi,2005-04-02,ODI # 2235,2005
4,148,148,123,120.32,15,4,Pakistan,Visakhapatnam,2005-04-05,ODI # 2236,2005
...,...,...,...,...,...,...,...,...,...,...,...
345,56*,56,61,91.8,3,2,West Indies,Manchester,2019-06-27,ODI # 4175,2019
346,42*,42,31,135.48,4,1,England,Birmingham,2019-06-30,ODI # 4179,2019
347,35,35,33,106.06,4,0,Bangladesh,Birmingham,2019-07-02,ODI # 4182,2019
348,DNB,-,-,-,-,-,Sri Lanka,Leeds,2019-07-06,ODI # 4187,2019


In [7]:
# Add a 'not_out' flag column: 1 when the score string ends with '*'
# (e.g. '7*'), 0 otherwise. Scores are converted to str first so the .str
# accessor works on every row.
score_text = df['score'].map(str)
df['score'] = score_text
df['not_out'] = score_text.str.endswith('*').astype(int)
df

Unnamed: 0,score,runs_scored,balls_faced,strike_rate,fours,sixes,opposition,ground,date,odi_number,year,not_out
0,0,0,1,0,0,0,Bangladesh,Chattogram,2004-12-23,ODI # 2199,2004,0
1,12,12,11,109.09,2,0,Bangladesh,Dhaka,2004-12-26,ODI # 2201,2004,0
2,7*,7,2,350,0,1,Bangladesh,Dhaka,2004-12-27,ODI # 2202,2004,1
3,3,3,7,42.85,0,0,Pakistan,Kochi,2005-04-02,ODI # 2235,2005,0
4,148,148,123,120.32,15,4,Pakistan,Visakhapatnam,2005-04-05,ODI # 2236,2005,0
...,...,...,...,...,...,...,...,...,...,...,...,...
345,56*,56,61,91.8,3,2,West Indies,Manchester,2019-06-27,ODI # 4175,2019,1
346,42*,42,31,135.48,4,1,England,Birmingham,2019-06-30,ODI # 4179,2019,1
347,35,35,33,106.06,4,0,Bangladesh,Birmingham,2019-07-02,ODI # 4182,2019,0
348,DNB,-,-,-,-,-,Sri Lanka,Leeds,2019-07-06,ODI # 4187,2019,0


In [8]:
# Drop the odi_number feature because it adds no value to the analysis.
# drop() returns a new frame, so the result is bound back to df. With
# errors='ignore' the cell stays idempotent: re-running it after the column is
# already gone does not raise a KeyError.
df = df.drop(columns='odi_number', errors='ignore')
df

Unnamed: 0,score,runs_scored,balls_faced,strike_rate,fours,sixes,opposition,ground,date,year,not_out
0,0,0,1,0,0,0,Bangladesh,Chattogram,2004-12-23,2004,0
1,12,12,11,109.09,2,0,Bangladesh,Dhaka,2004-12-26,2004,0
2,7*,7,2,350,0,1,Bangladesh,Dhaka,2004-12-27,2004,1
3,3,3,7,42.85,0,0,Pakistan,Kochi,2005-04-02,2005,0
4,148,148,123,120.32,15,4,Pakistan,Visakhapatnam,2005-04-05,2005,0
...,...,...,...,...,...,...,...,...,...,...,...
345,56*,56,61,91.8,3,2,West Indies,Manchester,2019-06-27,2019,1
346,42*,42,31,135.48,4,1,England,Birmingham,2019-06-30,2019,1
347,35,35,33,106.06,4,0,Bangladesh,Birmingham,2019-07-02,2019,0
348,DNB,-,-,-,-,-,Sri Lanka,Leeds,2019-07-06,2019,0


In [9]:
# Build a new DataFrame without the rows whose score is 'DNB' or 'TDNB'
# (innings where Dhoni did not bat), keeping every column from 'runs_scored'
# onwards.
did_bat = ~df['score'].isin(['DNB', 'TDNB'])
df_new = df.loc[did_bat, 'runs_scored':]
df_new.head()

Unnamed: 0,runs_scored,balls_faced,strike_rate,fours,sixes,opposition,ground,date,year,not_out
0,0,1,0.0,0,0,Bangladesh,Chattogram,2004-12-23,2004,0
1,12,11,109.09,2,0,Bangladesh,Dhaka,2004-12-26,2004,0
2,7,2,350.0,0,1,Bangladesh,Dhaka,2004-12-27,2004,1
3,3,7,42.85,0,0,Pakistan,Kochi,2005-04-02,2005,0
4,148,123,120.32,15,4,Pakistan,Visakhapatnam,2005-04-05,2005,0


In [10]:
# Remove any remaining rows whose runs_scored is the '-' placeholder.
# regex=False matches the dash literally; na=False means rows with a missing
# value are treated as "no dash" and are therefore kept. The trailing .copy()
# makes df_new an independent frame, so later column assignments do not write
# into a filtered slice (SettingWithCopyWarning).
df_new = df_new[~df_new['runs_scored'].str.contains('-', regex=False, na=False)].copy()

In [11]:
# Fix the data types of the numerical columns in one pass. astype() with a
# column -> dtype mapping returns a new frame, so nothing is assigned column by
# column into a filtered slice of the original data.
df_new = df_new.astype({
    'runs_scored': int,
    'balls_faced': int,
    'strike_rate': float,
    'fours': int,
    'sixes': int,
})

In [12]:
# Display the cleaned innings table to confirm the dtype conversions.
df_new

Unnamed: 0,runs_scored,balls_faced,strike_rate,fours,sixes,opposition,ground,date,year,not_out
0,0,1,0.00,0,0,Bangladesh,Chattogram,2004-12-23,2004,0
1,12,11,109.09,2,0,Bangladesh,Dhaka,2004-12-26,2004,0
2,7,2,350.00,0,1,Bangladesh,Dhaka,2004-12-27,2004,1
3,3,7,42.85,0,0,Pakistan,Kochi,2005-04-02,2005,0
4,148,123,120.32,15,4,Pakistan,Visakhapatnam,2005-04-05,2005,0
...,...,...,...,...,...,...,...,...,...,...
344,28,52,53.84,3,0,Afghanistan,Southampton,2019-06-22,2019,0
345,56,61,91.80,3,2,West Indies,Manchester,2019-06-27,2019,1
346,42,31,135.48,4,1,England,Birmingham,2019-06-30,2019,1
347,35,33,106.06,4,0,Bangladesh,Birmingham,2019-07-02,2019,0


In [None]:
# Career summary statistics.
# `df` has one row per match; `df_new` keeps only the innings that were batted.
date_format = '%B %d, %Y'

first_match_date = df['date'].min().strftime(date_format)   # first match
print('First match: ',first_match_date)

last_match_date = df['date'].max().strftime(date_format)    # last match
print('Last match: ',last_match_date)

number_of_matches = df.shape[0]     # no. of matches played in career
print('Number of matches played : ',number_of_matches )

number_of_inns = df_new.shape[0]    # no. of innings actually batted
print('Number of innings played : ',number_of_inns )

not_outs = df_new['not_out'].sum()  # number of not outs in his career
print('Not outs: ',not_outs)

runs_scored = df_new['runs_scored'].sum()   # runs scored in career
print('Runs scored in career : ', runs_scored)

balls_faced = df_new['balls_faced'].sum()   # balls faced in career
print('Number of balls faced : ', balls_faced)

career_sr = (runs_scored / balls_faced)*100     # career strike rate: runs per 100 balls
print('career strike rate : {:0.2f}'.format(career_sr))

# Career average: runs per dismissal, i.e. not-out innings are excluded from the divisor.
career_avg = (runs_scored / (number_of_inns - not_outs))
print('Career average: {:0.2f}'.format(career_avg))

hundreds = (df_new['runs_scored'] >= 100).sum()     # innings of 100 or more
print('Number of hundreds : ' , hundreds)

# Fifties: innings of at least 50 but below 100, so hundreds are not double counted.
fifties = ((df_new['runs_scored'] >= 50) & (df_new['runs_scored'] < 100)).sum()
print('NUmber of fifties : ', fifties)

fours = df_new['fours'].sum()   # number of fours in career
print('Number of 4s : ',fours)

sixes = df_new['sixes'].sum()   # number of sixes in career
print('Number of 6s : ',sixes)




First match:  December 23, 2004
First match:  July 09, 2019
Number of matches played :  350
Number of innings played :  297
Not outs:  84
Runs scored in career :  10773
Number of balls faced :  12303
career strike rate : 87.56
Career average: 50.58
Number of hundreds :  10
NUmber of fifties :  73
Number of 4s :  826
Number of 6s :  229
