In [1]:
# basic python imports
from pathlib import Path
import pickle

# data science libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# scikit learn related imports
from sklearn import preprocessing

In [2]:
# notebook-wide settings, kept on one class so later cells can refer to them by name
class config:
    # cut-off date string; it is compared against the start_date column further down
    start_date = '2017-01-01'

    # columns that are kept when the six-over subset is taken
    columns = [
        'match_id', 'venue', 'innings', 'ball',
        'batting_team', 'bowling_team',
        'striker', 'non_striker', 'bowler',
        'runs_off_bat', 'extras',
    ]

In [3]:
# filesystem locations used throughout the notebook (all relative to the notebook's folder)
path = Path('../data/raw/')              # folder holding the raw input
data_path = path / 'all_matches.csv'     # raw csv that gets loaded
dest_path = Path('../data/processed/')   # folder the processed csv files are written to
pkl_path = Path('../bin/objs')           # folder the pickled label encoders are written to

In [4]:
# loading data
# low_memory=False makes pandas infer every column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up with mixed dtypes.
# NOTE(review): the warning fragment printed under this cell looks like pandas'
# mixed-dtype DtypeWarning from chunked inference - confirm it disappears with this flag.
df = pd.read_csv(data_path, low_memory=False)
df.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,...,1,,,,1.0,,,,,
1,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.2,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,0,,,,,,,,,
2,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.3,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,1,1.0,,,,,,,,
3,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.4,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,0,,,,,,,,,
4,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.5,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,0,,,,,,,,,


In [5]:
# correcting redundant stadium names
# Each key is a spelling that should be rewritten; its value is the spelling
# kept for that venue from here on.
venue_aliases = {
    'M Chinnaswamy Stadium': 'M.Chinnaswamy Stadium',
    'Punjab Cricket Association IS Bindra Stadium': 'Punjab Cricket Association IS Bindra Stadium, Mohali',
    'MA Chidambaram Stadium': 'MA Chidambaram Stadium, Chepauk, Chennai',
    'Wankhede Stadium': 'Wankhede Stadium, Mumbai',
    'Rajiv Gandhi International Stadium': 'Rajiv Gandhi International Stadium, Uppal',
}

# Series.replace with a dict swaps exact matches only and leaves every other
# value as it is, so this gives the same column as a row-by-row if/elif chain
# without looping over the rows in Python.
df['venue'] = df['venue'].replace(venue_aliases)

In [6]:
# parse start_date as datetime, then keep only the rows dated on or after config.start_date
df['start_date'] = pd.to_datetime(df['start_date'])
cutoff = np.datetime64(config.start_date)
on_or_after_cutoff = df['start_date'] >= cutoff
# the index is rebuilt so the kept rows are numbered from 0 again
df = df[on_or_after_cutoff].reset_index(drop=True)
df.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,1082591,2017,2017-04-05,"Rajiv Gandhi International Stadium, Uppal",1,0.1,Sunrisers Hyderabad,Royal Challengers Bangalore,DA Warner,S Dhawan,...,0,,,,,,,,,
1,1082591,2017,2017-04-05,"Rajiv Gandhi International Stadium, Uppal",1,0.2,Sunrisers Hyderabad,Royal Challengers Bangalore,DA Warner,S Dhawan,...,0,,,,,,,,,
2,1082591,2017,2017-04-05,"Rajiv Gandhi International Stadium, Uppal",1,0.3,Sunrisers Hyderabad,Royal Challengers Bangalore,DA Warner,S Dhawan,...,0,,,,,,,,,
3,1082591,2017,2017-04-05,"Rajiv Gandhi International Stadium, Uppal",1,0.4,Sunrisers Hyderabad,Royal Challengers Bangalore,DA Warner,S Dhawan,...,0,,,,,,,,,
4,1082591,2017,2017-04-05,"Rajiv Gandhi International Stadium, Uppal",1,0.5,Sunrisers Hyderabad,Royal Challengers Bangalore,DA Warner,S Dhawan,...,2,2.0,,,,,,,,


In [7]:
# label encoding: one encoder each for venues, teams and players
venue_le = preprocessing.LabelEncoder()
team_le = preprocessing.LabelEncoder()
player_le = preprocessing.LabelEncoder()

# fitting label encoders
venue_le.fit(df['venue'].unique())

# team_le is used further down to encode both batting_team and bowling_team,
# so it is fitted on the names from both columns; a team that only ever shows
# up as the bowling side would otherwise be an unseen label at transform time.
team_le.fit(pd.concat([df['batting_team'], df['bowling_team']], ignore_index=True).unique())

# a player can appear in any of the three roles, so all three columns are pooled
player_le.fit(pd.concat([df['striker'], df['non_striker'], df['bowler']], ignore_index=True).unique())

LabelEncoder()

In [8]:
# storing label encoder pickle objects
# open(..., 'wb') does not create missing folders, so make sure the target
# directory exists before writing into it
pkl_path.mkdir(parents=True, exist_ok=True)

# NOTE(review): player_le is fitted earlier in the notebook but is not written
# out here - confirm whether it should be persisted as well
for file_name, encoder in (('venue_le.pkl', venue_le), ('team_le.pkl', team_le)):
    with open(pkl_path/file_name, 'wb') as f:
        pickle.dump(encoder, f)

In [9]:
# keep only the rows of the first six overs (ball below 6.0) and the relevant columns from config.columns
in_first_six_overs = df['ball'] < 6.0
df_sixovers = df.loc[in_first_six_overs, config.columns].reset_index(drop=True)
df_sixovers

Unnamed: 0,match_id,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras
0,1082591,"Rajiv Gandhi International Stadium, Uppal",1,0.1,Sunrisers Hyderabad,Royal Challengers Bangalore,DA Warner,S Dhawan,TS Mills,0,0
1,1082591,"Rajiv Gandhi International Stadium, Uppal",1,0.2,Sunrisers Hyderabad,Royal Challengers Bangalore,DA Warner,S Dhawan,TS Mills,0,0
2,1082591,"Rajiv Gandhi International Stadium, Uppal",1,0.3,Sunrisers Hyderabad,Royal Challengers Bangalore,DA Warner,S Dhawan,TS Mills,4,0
3,1082591,"Rajiv Gandhi International Stadium, Uppal",1,0.4,Sunrisers Hyderabad,Royal Challengers Bangalore,DA Warner,S Dhawan,TS Mills,0,0
4,1082591,"Rajiv Gandhi International Stadium, Uppal",1,0.5,Sunrisers Hyderabad,Royal Challengers Bangalore,DA Warner,S Dhawan,TS Mills,0,2
...,...,...,...,...,...,...,...,...,...,...,...
18033,1254060,"MA Chidambaram Stadium, Chepauk, Chennai",2,5.2,Sunrisers Hyderabad,Kolkata Knight Riders,JM Bairstow,MK Pandey,PJ Cummins,0,0
18034,1254060,"MA Chidambaram Stadium, Chepauk, Chennai",2,5.3,Sunrisers Hyderabad,Kolkata Knight Riders,JM Bairstow,MK Pandey,PJ Cummins,1,0
18035,1254060,"MA Chidambaram Stadium, Chepauk, Chennai",2,5.4,Sunrisers Hyderabad,Kolkata Knight Riders,MK Pandey,JM Bairstow,PJ Cummins,0,0
18036,1254060,"MA Chidambaram Stadium, Chepauk, Chennai",2,5.5,Sunrisers Hyderabad,Kolkata Knight Riders,MK Pandey,JM Bairstow,PJ Cummins,0,0


In [10]:
# one empty list per output column; these are filled in and later turned into the training dataframe
new_data = {
    column: []
    for column in ('venue', 'batting_team', 'bowling_team', 'innings', 'score')
}

In [11]:
# consider each match and collect one row per batting side: venue, teams, innings and six-over score

# Start from empty lists so that re-running this cell rebuilds the rows instead
# of appending a second copy of every row to new_data.
for collected in new_data.values():
    collected.clear()

# The rows are grouped by (batting_team, bowling_team) inside a match, so any
# additional innings played by the same pair would be summed into the regular
# innings' score. Only innings 1 and 2 are kept to prevent that.
# NOTE(review): assumes innings numbered above 2 are super overs / tie-breakers
# that should not count towards the six-over score - confirm against the data source.
regular_innings = df_sixovers[df_sixovers['innings'] <= 2]

for match_id, match_df in regular_innings.groupby('match_id'):
    for (batting_team, bowling_team), team_df in match_df.groupby(['batting_team', 'bowling_team']):
        new_data['venue'].append(team_df['venue'].iloc[0])
        new_data['batting_team'].append(batting_team)
        new_data['bowling_team'].append(bowling_team)
        new_data['innings'].append(team_df['innings'].iloc[0])
        # score = runs hit off the bat plus extras over the selected rows
        new_data['score'].append(team_df['runs_off_bat'].sum() + team_df['extras'].sum())

In [12]:
# turn the collected column lists into the clean dataframe
clean_df = pd.DataFrame.from_dict(new_data)
clean_df.head()

Unnamed: 0,venue,batting_team,bowling_team,innings,score
0,"Rajiv Gandhi International Stadium, Uppal",Royal Challengers Bangalore,Sunrisers Hyderabad,2,54
1,"Rajiv Gandhi International Stadium, Uppal",Sunrisers Hyderabad,Royal Challengers Bangalore,1,59
2,Maharashtra Cricket Association Stadium,Mumbai Indians,Rising Pune Supergiant,1,61
3,Maharashtra Cricket Association Stadium,Rising Pune Supergiant,Mumbai Indians,2,59
4,Saurashtra Cricket Association Stadium,Gujarat Lions,Kolkata Knight Riders,1,52


In [13]:
# saving clean data
# to_csv does not create missing folders, so make sure the output directory exists first
dest_path.mkdir(parents=True, exist_ok=True)
clean_df.to_csv(dest_path/'clean_data.csv', index=False)

In [14]:
# declaring training dataframe
# assign() returns a new frame, so clean_df keeps its readable venue/team names.
# A plain `train_df = clean_df` would only alias the same object: the encoding
# would overwrite clean_df as well, and re-running this cell would then fail
# because the already-encoded integers are not labels the encoders know.
train_df = clean_df.assign(
    venue=venue_le.transform(clean_df['venue']),
    batting_team=team_le.transform(clean_df['batting_team']),
    bowling_team=team_le.transform(clean_df['bowling_team']),
)
train_df.head()

Unnamed: 0,venue,batting_team,bowling_team,innings,score
0,11,9,10,2,54
1,11,10,9,1,59
2,9,6,8,1,61
3,9,8,6,2,59
4,12,3,5,1,52


In [15]:
# write the encoded training dataframe to disk, without the index column
train_csv = dest_path / 'train.csv'
train_df.to_csv(train_csv, index=False)