In [40]:
import glob
import json
import os
from datetime import datetime

import folium  # Matches visualization
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml  # reading data

###  AB de Villiers batting Analysis
#### Data obtained from :  https://cricsheet.org/downloads/royal_challengers_bangalore.zip
#### Ball to Ball data of every game played by RCB
#### 5k series onwards - IPL 2011, pick data from there

##### STEP 1: Data cleaning

In [41]:
All_ABD_Balls = []

our_team = 'Royal Challengers Bangalore'
# os.path.join keeps the data path portable instead of hard-coding Windows separators
root_dir = os.path.join(os.getcwd(), 'royal_challengers_bangalore')


def find_team_deliveries(match_dict, team):
    """Return the deliveries of the regular innings batted by ``team``.

    The batting side is read from each innings' own ``team`` field rather than
    being inferred from the toss, so a match without a second innings yields
    an empty list instead of raising IndexError.

    Parameters
    ----------
    match_dict : dict
        One parsed match file (keys ``info`` and ``innings``).
    team : str
        Name of the batting team to look for.

    Returns
    -------
    list
        The innings' ``deliveries`` list, or ``[]`` when ``team`` did not bat.
    """
    for innings in match_dict.get('innings', []):
        for innings_name, innings_data in innings.items():
            # Only the two regular innings are considered, as before;
            # any other innings entry is ignored.
            if innings_name not in ('1st innings', '2nd innings'):
                continue
            if innings_data.get('team') == team:
                return innings_data.get('deliveries', [])
    return []


def format_match_date(raw_date):
    """Return the match date as a ``DD-Mon-YYYY`` string.

    ``raw_date`` may be a date/datetime object or an ISO ``YYYY-MM-DD`` string.
    Both are rendered in the same format so the column is consistent; a string
    that cannot be parsed is returned unchanged.
    """
    if isinstance(raw_date, str):
        try:
            raw_date = datetime.strptime(raw_date, '%Y-%m-%d')
        except ValueError:
            return raw_date
    return raw_date.strftime('%d-%b-%Y')


def extract_batsman_balls(match_dict, team=our_team, batsman='AB de Villiers'):
    """Collect one row per ball faced by ``batsman`` in a single match.

    Parameters
    ----------
    match_dict : dict
        One parsed match file.
    team : str
        Team whose batting innings is scanned.
    batsman : str
        Striker to isolate.

    Returns
    -------
    list of list
        Rows of ``[over, bowler, non_striker, runs, extras, extra_type,
        opposition, venue, date]``.
    """
    info = match_dict['info']

    # Fall back to the venue name when the file carries no city
    match_city = info['city'] if 'city' in info else info['venue']
    opposition = next((name for name in info['teams'] if name != team), None)
    match_date = format_match_date(info['dates'][0])

    rows = []
    # Every delivery is a single-entry dict: {over.ball: details}
    for ball in find_team_deliveries(match_dict, team):
        for over, delivery in ball.items():
            if delivery['batsman'] != batsman:
                continue
            runs = delivery['runs']
            extras = runs['extras']
            # The first key of the 'extras' mapping names the kind of extra
            extra_type = next(iter(delivery['extras'])) if extras > 0 else 'legal'
            rows.append([over,
                         delivery['bowler'],
                         delivery['non_striker'],
                         runs['batsman'],
                         extras,
                         extra_type,
                         opposition,
                         match_city,
                         match_date])
    return rows


for file in glob.iglob(os.path.join(root_dir, '*.yaml')):
    with open(file, 'r') as stream:
        try:
            match_dict = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            # Skip unreadable files; carrying on would re-use the previous
            # match's data (or fail on the very first file).
            print('Skipping {}: {}'.format(file, exc))
            continue

    All_ABD_Balls.extend(extract_batsman_balls(match_dict, our_team))


In [43]:
# Inspect the raw structure of whichever match file was loaded most recently.
match_dict

{'meta': {'data_version': 0.9,
  'created': datetime.date(2016, 5, 30),
  'revision': 1},
 'info': {'city': 'Bangalore',
  'competition': 'IPL',
  'dates': [datetime.date(2016, 5, 29)],
  'gender': 'male',
  'match_type': 'T20',
  'outcome': {'by': {'runs': 8}, 'winner': 'Sunrisers Hyderabad'},
  'overs': 20,
  'player_of_match': ['BCJ Cutting'],
  'teams': ['Royal Challengers Bangalore', 'Sunrisers Hyderabad'],
  'toss': {'decision': 'bat', 'winner': 'Sunrisers Hyderabad'},
  'umpires': ['HDPK Dharmasena', 'BNJ Oxenford'],
  'venue': 'M Chinnaswamy Stadium'},
 'innings': [{'1st innings': {'team': 'Sunrisers Hyderabad',
    'deliveries': [{0.1: {'batsman': 'DA Warner',
       'bowler': 'S Aravind',
       'non_striker': 'S Dhawan',
       'runs': {'batsman': 1, 'extras': 0, 'total': 1}}},
     {0.2: {'batsman': 'S Dhawan',
       'bowler': 'S Aravind',
       'non_striker': 'DA Warner',
       'runs': {'batsman': 1, 'extras': 0, 'total': 1}}},
     {0.3: {'batsman': 'DA Warner',
      

In [38]:
# Assemble the ball-by-ball DataFrame from the rows gathered in All_ABD_Balls.
# The labels follow the order in which each row's fields were appended.
columns = [
    'Over',
    'Bowler',
    'Non_Striker',
    'Runs',
    'Extras',
    'Extra Type',
    'Opposition',
    'Venue',
    'Year',  # holds the match date string, not just the year
]
df = pd.DataFrame(All_ABD_Balls, columns=columns)
df.head()

Unnamed: 0,Over,Bowler,Non_Striker,Runs,Extras,Extra Type,Opposition,Venue,Year
0,1.5,Sandeep Sharma,Vishnu Vinod,4,0,legal,Kings XI Punjab,Indore,10-Apr-2017
1,1.6,Sandeep Sharma,Vishnu Vinod,0,0,legal,Kings XI Punjab,Indore,10-Apr-2017
2,2.4,MM Sharma,Vishnu Vinod,0,0,legal,Kings XI Punjab,Indore,10-Apr-2017
3,2.5,MM Sharma,Vishnu Vinod,0,0,legal,Kings XI Punjab,Indore,10-Apr-2017
4,2.6,MM Sharma,Vishnu Vinod,6,0,legal,Kings XI Punjab,Indore,10-Apr-2017


In [None]:
# Bring every opposition label onto a single naming scheme by swapping each
# entry of old_names for the entry at the same position in new_names.
old_names = ['Deccan Chargers', 'Kings XI Punjab', 'Rising Pune Supergiant', 'Delhi Daredevils']
new_names = ['Sunrisers Hyderabad', 'Punjab Kings', 'Rising Pune Supergiants', 'Delhi Capitals']
renamed_teams = dict(zip(old_names, new_names))
df['Opposition'] = df['Opposition'].replace(renamed_teams)


In [None]:
# Summarise the column dtypes and non-null counts of the ball-by-ball frame.
df.info()


##### Some Exploratory Analysis
##### Q1. How many runs has AB scored in each year of the IPL over the full innings

In [None]:
# Total runs scored in each IPL season.
# The 'Year' column stores full match-date strings, so the four-digit year is
# extracted first; sorting or counting the raw strings would order them by day
# of month and tally balls faced rather than runs.
season = df['Year'].astype(str).str.extract(r'(\d{4})', expand = False)
runs_per_season = df.groupby(season)['Runs'].sum()

fig, ax = plt.subplots(figsize = (12,5))
sns.barplot(x = runs_per_season.index, y = runs_per_season.values, ax = ax)
ax.tick_params(axis = 'x', labelrotation = 90, labelsize = 12)
ax.tick_params(axis = 'y', labelsize = 12)
ax.set_xlabel('Season', fontsize = 12)
ax.set_ylabel('Runs', fontsize = 12)
ax.set_title('Runs scored by AB de Villiers per IPL season', fontsize = 12);

##### Q2. Runs scored by AB from 2011 - 2021 against each team


In [None]:
# Total runs against each opposition.
# numeric_only keeps the aggregation to the numeric columns instead of also
# "summing" the text columns (bowler, venue, ...).
by_opp = df.groupby('Opposition').sum(numeric_only = True)
runs_by_opp = by_opp['Runs'].sort_values(ascending = False)

fig, ax = plt.subplots(figsize = (10,5))
# Plot the pre-aggregated totals on the axes created above rather than
# relying on the implicit current axes and a per-ball estimator.
sns.barplot(x = runs_by_opp.values, y = runs_by_opp.index, ax = ax)
ax.tick_params(axis = 'y', labelsize = 12)
ax.tick_params(axis = 'x', labelsize = 12)
ax.set_xlabel('Runs', fontsize = 12);
ax.set_ylabel('Opposition', fontsize = 12);
#JOB Merge teams like SRH DC, RPS, DC and DD

##### Q3. Boundary count per team: sixes and fours



In [None]:
filter_df = df[['Opposition', 'Runs']]
opposition_order = filter_df['Opposition'].unique()

# Balls per (opposition, runs scored) pair, one row per opposition.
# Missing combinations are filled with 0, so a team that never conceded a
# four or a six gets a zero instead of borrowing the count of another run value.
run_count = (
    pd.crosstab(filter_df['Opposition'], filter_df['Runs'])
    .reindex(index = opposition_order, fill_value = 0)
)
boundary_count = run_count.reindex(columns = [4, 6], fill_value = 0)
four_count = boundary_count[4].tolist()
six_count = boundary_count[6].tolist()

# Shared x-range so the two panels are directly comparable; never below 60
x_max = max([60] + four_count + six_count)

fig, ax = plt.subplots(figsize = (15,5), nrows = 1, ncols = 2)
panels = zip(ax, (four_count, six_count), ('Fours', 'Sixes'))
for panel, counts, label in panels:
    sns.barplot(y = opposition_order, x = counts, ax = panel)
    panel.tick_params(axis = 'x', labelsize = 12)
    panel.tick_params(axis = 'y', labelsize = 12)
    panel.set_title('Number of {} by AB de Villiers'.format(label), fontsize = 12)
    panel.set_xlim([0, x_max])

# The opposition labels are already shown on the left panel
ax[1].set_yticks([]);

####                                                       More Exploratory questions to ask
##### Q4.  Number of dots, ones, twos and threes per team
##### Q5. percentage of runs per type of bowler - need manual labelling of bowler or data scraping
##### Q6. Death overs 16 - 20; percentage of runs and consistency
##### Q7. Percentage of dismissals per bowler
##### Q8. What is AB's strength and weakness per venue - bowler type, if you get line and length then include
##### Q9. How would you quantify bowler's performance

####                                                                   Visualization Questions
##### 1. On the map of India/UAE, where does AB have the most runs
##### 2. Is there significant difference between AB's strike rate when an incompetent non striker is at the other end


In [None]:
#exploratory solutions

## Question
### What influences AB's run (Boundaries or not boundaries)
1. Dot ball - excellent line and length, bowling to the field inside the thirty yard circle, excellent fielding in gaps, batting with tailenders
2. Ones, twos - hit into the gaps but not timed well (including edges), batting with tailenders and end of over (single)
3. fours, sixes - bowler slot ball, short boundary, timed gaps, death overs needing runs

#### Can we predict the type of run (dots, non boundaries, boundaries) from 
##### 1. Bowler type
##### 2. bowler dot percent (0 - 1) - higher dot percent indicative of good line and length or bowling to inner field or good fielding or batting with tailenders
##### 3. 

### FEATURE ANALYSIS
### https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/
### "Statistical-based feature selection methods involve evaluating the relationship between each input variable and the target variable using statistics and selecting those input variables that have the strongest relationship with the target variable"

### "The statistical measures used in filter-based feature selection are generally calculated one input variable at a time with the target variable"

### For a regression modelling problem - a correlation like pearson for linear and spearman rank for non linear is used

### For Classification problem  ANOVA correlation coefficient for linear or kendall's rank coefficient for non linear is used

### 

## JOBS TODAY:
### Scrape non striker data


In [None]:
# Load the contents of bowler_stats.json into bowler_stat.
# json is imported in the notebook's top import cell; the explicit encoding
# keeps the read independent of the platform's default locale.
with open('bowler_stats.json', encoding = 'utf-8') as f:
    bowler_stat = json.load(f)
    

In [10]:
# List every distinct name that appears in the Non_Striker column.
df['Non_Striker'].unique()

array(['Vishnu Vinod', 'KM Jadhav', 'Mandeep Singh', 'STR Binny',
       'V Kohli', 'CH Gayle', 'TM Head', 'Q de Kock', 'CJ Anderson',
       'PA Patel', 'MM Ali', 'SN Khan', 'C de Grandhomme', 'SO Hetmyer',
       'S Dube', 'MP Stoinis', 'AD Nath', 'D Padikkal', 'JR Philippe',
       'Washington Sundar', 'AJ Finch', 'Gurkeerat Singh', 'NA Saini',
       'GJ Maxwell', 'Shahbaz Ahmed', 'DT Christian', 'KA Jamieson',
       'HV Patel', 'RM Patidar', 'DR Sams', 'MA Agarwal', 'SS Tiwary',
       'AUK Pathan', 'TM Dilshan', 'CA Pujara', 'JJ van der Wath',
       'M Kaif', 'A Mithun', 'KB Arun Karthik', 'LA Pomersbach',
       'DL Vettori', 'R Vinay Kumar', 'AB McDonald', 'J Syed Mohammad',
       'R Rampaul', 'MC Henriques', 'Yuvraj Singh', 'JA Morkel',
       'RR Rossouw', 'MA Starc', 'S Rana', 'VH Zol', 'DJG Sammy',
       'SA Abbott', 'KD Karthik', 'D Wiese', 'SR Watson', 'KL Rahul',
       'Sachin Baby', 'Iqbal Abdulla'], dtype=object)

In [39]:
# Write the ball-by-ball frame to a CSV file in the working directory.
# NOTE(review): with the default to_csv settings the row index is written as
# an unnamed first column — confirm whether readers of this file expect it.
df.to_csv('ABD_ballToball.csv')