In [1]:
# basic python imports
from pathlib import Path
import pickle

# data science libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# scikit learn related imports
from sklearn import preprocessing

In [2]:
# configuration for the code
class config:
    start_date = '2018-01-01'
    columns = [
        'match_id',
        'venue',
        'innings',
        'ball',
        'batting_team',
        'bowling_team',
        'striker',
        'non_striker',
        'bowler',
        'runs_off_bat',
        'extras'
    ]

In [3]:
# path objects for easier reference in code
path = Path('../data/raw/')
data_path = Path('../data/raw/all_matches.csv')
dest_path = Path('../data/processed/')
pkl_path = Path('../bin/objs')

In [4]:
# loading data
df = pd.read_csv(data_path)
df.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,...,1,,,,1.0,,,,,
1,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.2,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,0,,,,,,,,,
2,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.3,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,1,1.0,,,,,,,,
3,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.4,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,0,,,,,,,,,
4,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.5,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,0,,,,,,,,,


In [5]:
# converting match_id to datetime and then considering rows after config.start_date
df['start_date'] = pd.to_datetime(df['start_date'])
df = df[df['start_date']>=np.datetime64(config.start_date)].reset_index(drop=True)
df.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,1175356,2019,2019-03-23,MA Chidambaram Stadium,1,0.1,Royal Challengers Bangalore,Chennai Super Kings,V Kohli,PA Patel,...,0,,,,,,,,,
1,1175356,2019,2019-03-23,MA Chidambaram Stadium,1,0.2,Royal Challengers Bangalore,Chennai Super Kings,PA Patel,V Kohli,...,0,,,,,,,,,
2,1175356,2019,2019-03-23,MA Chidambaram Stadium,1,0.3,Royal Challengers Bangalore,Chennai Super Kings,PA Patel,V Kohli,...,0,,,,,,,,,
3,1175356,2019,2019-03-23,MA Chidambaram Stadium,1,0.4,Royal Challengers Bangalore,Chennai Super Kings,PA Patel,V Kohli,...,0,,,,,,,,,
4,1175356,2019,2019-03-23,MA Chidambaram Stadium,1,0.5,Royal Challengers Bangalore,Chennai Super Kings,PA Patel,V Kohli,...,0,,,,,,,,,


In [6]:
# label encoding
venue_le = preprocessing.LabelEncoder()
team_le = preprocessing.LabelEncoder()
player_le = preprocessing.LabelEncoder()

# fitting label encoders
venue_le.fit(df['venue'].unique())
team_le.fit(df['batting_team'].unique())
player_le.fit(df['striker'].unique().tolist() + df['non_striker'].unique().tolist() + df['bowler'].unique().tolist())

LabelEncoder()

In [7]:
with open(pkl_path/'venue_le.pkl', 'wb') as f:
    pickle.dump(venue_le, f)
    
with open(pkl_path/'team_le.pkl', 'wb') as f:
    pickle.dump(team_le, f)

In [8]:
# taking only six overs and relavent columns now
df_sixovers = df[df['ball']<=5.6].reset_index(drop=True).loc[:, config.columns]
df_sixovers

Unnamed: 0,match_id,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras
0,1175356,MA Chidambaram Stadium,1,0.1,Royal Challengers Bangalore,Chennai Super Kings,V Kohli,PA Patel,DL Chahar,1,0
1,1175356,MA Chidambaram Stadium,1,0.2,Royal Challengers Bangalore,Chennai Super Kings,PA Patel,V Kohli,DL Chahar,0,0
2,1175356,MA Chidambaram Stadium,1,0.3,Royal Challengers Bangalore,Chennai Super Kings,PA Patel,V Kohli,DL Chahar,0,0
3,1175356,MA Chidambaram Stadium,1,0.4,Royal Challengers Bangalore,Chennai Super Kings,PA Patel,V Kohli,DL Chahar,0,0
4,1175356,MA Chidambaram Stadium,1,0.5,Royal Challengers Bangalore,Chennai Super Kings,PA Patel,V Kohli,DL Chahar,4,0
...,...,...,...,...,...,...,...,...,...,...,...
13564,1254060,"MA Chidambaram Stadium, Chepauk, Chennai",2,5.2,Sunrisers Hyderabad,Kolkata Knight Riders,JM Bairstow,MK Pandey,PJ Cummins,0,0
13565,1254060,"MA Chidambaram Stadium, Chepauk, Chennai",2,5.3,Sunrisers Hyderabad,Kolkata Knight Riders,JM Bairstow,MK Pandey,PJ Cummins,1,0
13566,1254060,"MA Chidambaram Stadium, Chepauk, Chennai",2,5.4,Sunrisers Hyderabad,Kolkata Knight Riders,MK Pandey,JM Bairstow,PJ Cummins,0,0
13567,1254060,"MA Chidambaram Stadium, Chepauk, Chennai",2,5.5,Sunrisers Hyderabad,Kolkata Knight Riders,MK Pandey,JM Bairstow,PJ Cummins,0,0


In [9]:
# declaring dict to store new data that will be used for training
new_data = dict()
new_data['venue'] = []
new_data['batting_team'] = []
new_data['bowling_team'] = []
new_data['innings'] = []
new_data['score'] = []

In [10]:
# consider each match and make new df
for match_id, match_df in df_sixovers.groupby('match_id'):
    for teams, team_df in match_df.groupby(['batting_team', 'bowling_team']):
        new_data['venue'].append(team_df['venue'].values[0])
        new_data['batting_team'].append(teams[0])
        new_data['bowling_team'].append(teams[1])
        new_data['innings'].append(team_df['innings'].values[0])
        new_data['score'].append(sum(team_df['runs_off_bat'].values) + sum(team_df['extras'].values))

In [11]:
# declaring clean data csv
clean_df = pd.DataFrame(new_data)
clean_df.head()

Unnamed: 0,venue,batting_team,bowling_team,innings,score
0,Wankhede Stadium,Chennai Super Kings,Mumbai Indians,2,42
1,Wankhede Stadium,Mumbai Indians,Chennai Super Kings,1,39
2,Punjab Cricket Association IS Bindra Stadium,Delhi Daredevils,Kings XI Punjab,1,45
3,Punjab Cricket Association IS Bindra Stadium,Kings XI Punjab,Delhi Daredevils,2,73
4,Eden Gardens,Kolkata Knight Riders,Royal Challengers Bangalore,2,67


In [12]:
# saving clean data
clean_df.to_csv(dest_path/'clean_data.csv', index=False)

In [13]:
# declaring training dataframe
train_df = clean_df
train_df['venue'] = venue_le.transform(train_df['venue'])
train_df['batting_team'] = team_le.transform(train_df['batting_team'])
train_df['bowling_team'] = team_le.transform(train_df['bowling_team'])
train_df.head()

Unnamed: 0,venue,batting_team,bowling_team,innings,score
0,14,0,5,2,42
1,14,5,0,1,39
2,9,2,3,1,45
3,9,3,2,2,73
4,3,4,7,2,67


In [14]:
# saving training dataframe
train_df.to_csv(dest_path/'train.csv', index=False)