In [2]:
# -------------------------------------------------------------------------------------------------------
# File Name: deliveries_data_cleaning.ipynb
# Author:   Jai Verma
# Description: Jupyter Notebook for Cleaning and validating the deliveries dataset
# -------------------------------------------------------------------------------------------------------

# Deliveries Data Cleaning

Objective: 
Clean and validate the raw deliveries data (`raw_deliveries.csv`) and export the processed result to CSV.

## Table Of Contents
1. [Importing Libraries](#importing-libraries) 
2. [Data Loading](#loading-data)
3. [Data Pre-Processing](#pre-processing-the-data)
    * [Validating Innings](#searching-for-innings-value-more-than-2)
    * [Batting Teams](#checking-the-names-of-the-batting-teams)
    * [Bowling Teams](#checking-the-names-of-the-bowling-teams)
    * [Making Team names consistent](#changing-some-team-names)
    * [Validating other columns](#validating-other-columns-similarly-to-innings)
4. [Final Dataframe](#checking-final-dataframe-using-some-filters)
5. [Exporting Dataframe](#exporting-to-csv)

## Importing Libraries

In [15]:
import os

import pandas as pd

## Loading Data

In [16]:
# Location of the raw deliveries CSV in the project's GitHub repository.
# Credentials must not be committed in a notebook, so no access token is
# embedded here. If the file requires one, export it as the GITHUB_RAW_TOKEN
# environment variable before starting the kernel and it is appended as the
# `token` query parameter; otherwise the plain URL is used.
raw_deliveries_location = "https://raw.githubusercontent.com/Jai-Verma-04/IPL-Web-App/refs/heads/main/data/raw/raw_deliveries.csv"
github_raw_token = os.environ.get("GITHUB_RAW_TOKEN")
deliveries_url = (
    f"{raw_deliveries_location}?token={github_raw_token}"
    if github_raw_token
    else raw_deliveries_location
)

In [17]:
# Read the raw deliveries CSV from the configured location into a DataFrame.
deliveries = pd.read_csv(filepath_or_buffer=deliveries_url)

In [18]:
# Keep an independent copy of the loaded frame before any cleaning step
# modifies `deliveries`.
deliveries_backup = deliveries.copy(deep=True)

## Pre-Processing the data

In [19]:
# Inspect the dtype pandas inferred for each column.
deliveries.dtypes

match_id             int64
inning               int64
batting_team        object
bowling_team        object
over                 int64
ball                 int64
batter              object
bowler              object
non_striker         object
batsman_runs         int64
extra_runs           int64
total_runs           int64
extras_type         object
is_wicket            int64
player_dismissed    object
dismissal_kind      object
fielder             object
dtype: object

In [20]:
# Count the missing values in every column.
deliveries.isna().sum()

match_id                 0
inning                   0
batting_team             0
bowling_team             0
over                     0
ball                     0
batter                   0
bowler                   0
non_striker              0
batsman_runs             0
extra_runs               0
total_runs               0
extras_type         246795
is_wicket                0
player_dismissed    247970
dismissal_kind      247970
fielder             251566
dtype: int64

In [21]:
# List the column names available in the dataset.
deliveries.columns

Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batter', 'bowler', 'non_striker', 'batsman_runs', 'extra_runs',
       'total_runs', 'extras_type', 'is_wicket', 'player_dismissed',
       'dismissal_kind', 'fielder'],
      dtype='object')

In [22]:
# Innings numbered above 2 are super-over innings, so these values are expected.
deliveries["inning"].value_counts()

inning
1    135018
2    125741
3        77
4        72
5         8
6         4
Name: count, dtype: int64

### Searching for Innings value more than 2

In [23]:
# Sample of deliveries from innings beyond the second where
# Kolkata Knight Riders were batting.
deliveries.loc[
    (deliveries["inning"] > 2)
    & (deliveries["batting_team"] == "Kolkata Knight Riders")
].head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
15417,392190,3,Kolkata Knight Riders,Rajasthan Royals,0,1,CH Gayle,Kamran Khan,BB McCullum,1,0,1,,0,,,
15418,392190,3,Kolkata Knight Riders,Rajasthan Royals,0,2,BB McCullum,Kamran Khan,CH Gayle,1,0,1,,0,,,
15419,392190,3,Kolkata Knight Riders,Rajasthan Royals,0,3,CH Gayle,Kamran Khan,BB McCullum,0,1,1,wides,0,,,
15420,392190,3,Kolkata Knight Riders,Rajasthan Royals,0,4,CH Gayle,Kamran Khan,BB McCullum,4,0,4,,0,,,
15421,392190,3,Kolkata Knight Riders,Rajasthan Royals,0,5,CH Gayle,Kamran Khan,BB McCullum,4,0,4,,0,,,


In [24]:
# Show every delivery recorded with an inning number greater than 5.
deliveries.loc[deliveries["inning"] > 5]

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
185380,1216517,6,Kings XI Punjab,Mumbai Indians,0,1,CH Gayle,TA Boult,MA Agarwal,6,0,6,,0,,,
185381,1216517,6,Kings XI Punjab,Mumbai Indians,0,2,CH Gayle,TA Boult,MA Agarwal,1,0,1,,0,,,
185382,1216517,6,Kings XI Punjab,Mumbai Indians,0,3,MA Agarwal,TA Boult,CH Gayle,4,0,4,,0,,,
185383,1216517,6,Kings XI Punjab,Mumbai Indians,0,4,MA Agarwal,TA Boult,CH Gayle,4,0,4,,0,,,


### Checking the names of the batting teams

In [25]:
# Count deliveries per batting team to reveal inconsistent team spellings.
deliveries["batting_team"].value_counts()

batting_team
Mumbai Indians                 31437
Kolkata Knight Riders          29514
Chennai Super Kings            28651
Royal Challengers Bangalore    28205
Rajasthan Royals               26242
Kings XI Punjab                22646
Sunrisers Hyderabad            21843
Delhi Daredevils               18786
Delhi Capitals                 10946
Deccan Chargers                 9034
Punjab Kings                    6833
Gujarat Titans                  5494
Pune Warriors                   5443
Lucknow Super Giants            5400
Gujarat Lions                   3566
Rising Pune Supergiant          1900
Royal Challengers Bengaluru     1818
Kochi Tuskers Kerala            1582
Rising Pune Supergiants         1580
Name: count, dtype: int64

### Checking the names of the bowling teams

In [26]:
# Count deliveries per bowling team to reveal inconsistent team spellings.
deliveries["bowling_team"].value_counts()

bowling_team
Mumbai Indians                 31505
Kolkata Knight Riders          29663
Chennai Super Kings            28576
Royal Challengers Bangalore    28358
Rajasthan Royals               26432
Kings XI Punjab                22483
Sunrisers Hyderabad            21717
Delhi Daredevils               18725
Delhi Capitals                 11216
Deccan Chargers                 9039
Punjab Kings                    6719
Pune Warriors                   5457
Gujarat Titans                  5301
Lucknow Super Giants            5226
Gujarat Lions                   3545
Rising Pune Supergiant          1928
Royal Challengers Bengaluru     1801
Rising Pune Supergiants         1615
Kochi Tuskers Kerala            1614
Name: count, dtype: int64

### Changing some team names

Making Bangalore, Delhi and Pune Franchise's  name consistent

In [27]:
# Map each inconsistent franchise spelling to the single name kept for that
# team in the cleaned data (Pune, Bangalore and Delhi franchises).
team_name_fixes = {
    "Rising Pune Supergiant": "Rising Pune Supergiants",
    "Royal Challengers Bengaluru": "Royal Challengers Bangalore",
    "Delhi Daredevils": "Delhi Capitals",
}
team_columns = ["batting_team", "bowling_team"]

# A single replace applies every rename to both team columns. No replacement
# target is also a key of the mapping, so re-running this cell changes nothing.
deliveries[team_columns] = deliveries[team_columns].replace(team_name_fixes)

# NOTE(review): "Kings XI Punjab" and "Punjab Kings" both appear in the team
# value counts and are left untouched here; confirm whether they should be
# unified as well.

### Validating other columns similarly to innings

In [28]:
# Count deliveries per over number to check the range of values present.
deliveries["over"].value_counts()

over
0     13906
1     13773
2     13597
3     13575
4     13560
5     13494
6     13452
7     13430
8     13396
9     13354
10    13289
11    13261
12    13222
13    13124
14    13024
15    12879
16    12685
17    12318
18    11583
19     9998
Name: count, dtype: int64

In [29]:
# Distinct extra-run values recorded on a single delivery.
deliveries["extra_runs"].unique()

array([1, 0, 5, 4, 2, 3, 7])

In [30]:
# Distinct batsman-run values recorded on a single delivery.
deliveries["batsman_runs"].unique()

array([0, 4, 6, 1, 2, 5, 3])

In [31]:
# Count deliveries for each value of the wicket flag.
deliveries["is_wicket"].value_counts()

is_wicket
0    247970
1     12950
Name: count, dtype: int64

In [32]:
# Distinct dismissal kinds present, including NaN where no dismissal is recorded.
deliveries["dismissal_kind"].unique()

array([nan, 'caught', 'bowled', 'run out', 'lbw', 'retired hurt',
       'stumped', 'caught and bowled', 'hit wicket',
       'obstructing the field', 'retired out'], dtype=object)

### Checking final dataframe using some filters

In [33]:
# Spot-check the frame: deliveries where Kolkata Knight Riders batted against
# Chennai Super Kings.
deliveries.loc[
    (deliveries["batting_team"] == "Kolkata Knight Riders")
    & (deliveries["bowling_team"] == "Chennai Super Kings")
]

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
2606,335993,1,Kolkata Knight Riders,Chennai Super Kings,0,1,SC Ganguly,JDP Oram,BB McCullum,2,0,2,,0,,,
2607,335993,1,Kolkata Knight Riders,Chennai Super Kings,0,2,SC Ganguly,JDP Oram,BB McCullum,1,0,1,,0,,,
2608,335993,1,Kolkata Knight Riders,Chennai Super Kings,0,3,BB McCullum,JDP Oram,SC Ganguly,0,0,0,,0,,,
2609,335993,1,Kolkata Knight Riders,Chennai Super Kings,0,4,BB McCullum,JDP Oram,SC Ganguly,1,0,1,,0,,,
2610,335993,1,Kolkata Knight Riders,Chennai Super Kings,0,5,SC Ganguly,JDP Oram,BB McCullum,0,0,0,,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249092,1426260,1,Kolkata Knight Riders,Chennai Super Kings,19,2,MA Starc,Mustafizur Rahman,AS Roy,0,0,0,,0,,,
249093,1426260,1,Kolkata Knight Riders,Chennai Super Kings,19,3,MA Starc,Mustafizur Rahman,AS Roy,0,0,0,,0,,,
249094,1426260,1,Kolkata Knight Riders,Chennai Super Kings,19,4,MA Starc,Mustafizur Rahman,AS Roy,0,0,0,,1,MA Starc,caught,R Ravindra
249095,1426260,1,Kolkata Knight Riders,Chennai Super Kings,19,5,VG Arora,Mustafizur Rahman,AS Roy,1,0,1,,0,,,


### Exporting to CSV

In [37]:
# Write the cleaned deliveries to the processed-data folder.
# A plain string with forward slashes is used: it works on every operating
# system and contains no backslash escape sequences (a backslash path such as
# "\d" or "\p" is an invalid escape and triggers a SyntaxWarning).
# NOTE(review): with the default index=True the row index is also written as
# an unnamed first column; pass index=False if downstream readers do not
# expect it.
deliveries.to_csv("../data/processed/deliveries_processed.csv")

  deliveries.to_csv(f"..\data\processed\deliveries_processed.csv")
