In [72]:
import numpy as np
import pandas as pd
import warnings
from warnings import filterwarnings
# Install a blanket 'ignore' filter: warnings raised after this point are not shown.
filterwarnings('ignore')

In [73]:
# Load the match records; the path is relative to the current working directory.
data = pd.read_csv('matches.csv')

In [74]:
# Show every column when a DataFrame is rendered (None removes the column cap).
# Uses the public pd.set_option entry point rather than the incidental
# pd.pandas self-reference.
pd.set_option("display.max_columns", None)

## Data Cleaning

In [75]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,id,season,city,date,batting_team,Pplay T1,pplay twick1,bowling_team,Pplay T2,pplay twick2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,Avg 1st Innings Score,Avg 2nd Innings Score,umpire1,umpire2,umpire3,Pitch Type
0,1,2017,Hyderabad,05-04-2017,Sunrisers Hyderabad,59,1,Royal Challengers Bangalore,54,1,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",185,175,AY Dandekar,NJ Llong,,Batting
1,2,2017,Pune,06-04-2017,Mumbai Indians,61,1,Rising Pune Supergiant,59,1,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,160,148,A Nand Kishore,S Ravi,,Batting & Spinner Friendly
2,3,2017,Rajkot,07-04-2017,Gujarat Lions,52,1,Kolkata Knight Riders,73,0,Kolkata Knight Riders,field,normal,0,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,183,170,Nitin Menon,CK Nandan,,Batting
3,4,2017,Indore,08-04-2017,Rising Pune Supergiant,35,1,Kings XI Punjab,56,2,Kings XI Punjab,field,normal,0,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium,160,158,AK Chaudhary,C Shamshuddin,,Batting
4,5,2017,Bangalore,08-04-2017,Royal Challengers Bangalore,41,2,Delhi Daredevils,43,2,Royal Challengers Bangalore,bat,normal,0,Royal Challengers Bangalore,15,0,KM Jadhav,M Chinnaswamy Stadium,170,150,,,,Batting


In [76]:
# List every column label in the loaded frame.
data.columns

Index(['id', 'season', 'city', 'date', 'batting_team', 'Pplay T1',
       'pplay twick1', 'bowling_team', 'Pplay T2', 'pplay twick2',
       'toss_winner', 'toss_decision', 'result', 'dl_applied', 'winner',
       'win_by_runs', 'win_by_wickets', 'player_of_match', 'venue',
       'Avg 1st Innings Score', 'Avg 2nd Innings Score', 'umpire1', 'umpire2',
       'umpire3', 'Pitch Type'],
      dtype='object')

In [77]:
# Columns removed from the frame in the following drop step.
drop_col = [
    'id',
    'city',
    'season',
    'date',
    'result',
    'dl_applied',
    'winner',
    'win_by_runs',
    'win_by_wickets',
    'player_of_match',
    'Avg 1st Innings Score',
    'Avg 2nd Innings Score',
    'umpire1',
    'umpire2',
    'umpire3',
]

In [78]:
# Remove the columns listed in drop_col, mutating `data` in place.
data.drop(columns=drop_col, inplace=True)


In [79]:
# Distinct names in batting_team; both 'Delhi Daredevils' and 'Delhi Capitals' appear.
data['batting_team'].unique()

array(['Sunrisers Hyderabad', 'Mumbai Indians', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Royal Challengers Bangalore',
       'Kolkata Knight Riders', 'Delhi Daredevils', 'Kings XI Punjab',
       'Chennai Super Kings', 'Rajasthan Royals', 'Deccan Chargers',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Delhi Capitals'],
      dtype=object)

In [80]:
# Start from an identity lookup of every name seen in batting_team, then point
# the two renamed franchises at their current names.
team_one = {team: team for team in data['batting_team'].unique()}
team_one.update({"Delhi Daredevils": "Delhi Capitals", 'Kings XI Punjab': 'Punjab Kings'})

# Same construction for the names seen in bowling_team.
team_two = {team: team for team in data['bowling_team'].unique()}
team_two.update({"Delhi Daredevils": "Delhi Capitals", 'Kings XI Punjab': 'Punjab Kings'})
team_two

{'Royal Challengers Bangalore': 'Royal Challengers Bangalore',
 'Rising Pune Supergiant': 'Rising Pune Supergiant',
 'Kolkata Knight Riders': 'Kolkata Knight Riders',
 'Kings XI Punjab': 'Punjab Kings',
 'Delhi Daredevils': 'Delhi Capitals',
 'Sunrisers Hyderabad': 'Sunrisers Hyderabad',
 'Mumbai Indians': 'Mumbai Indians',
 'Gujarat Lions': 'Gujarat Lions',
 'Rajasthan Royals': 'Rajasthan Royals',
 'Chennai Super Kings': 'Chennai Super Kings',
 'Deccan Chargers': 'Deccan Chargers',
 'Pune Warriors': 'Pune Warriors',
 'Kochi Tuskers Kerala': 'Kochi Tuskers Kerala',
 'Delhi Capitals': 'Delhi Capitals'}

In [81]:
# Rewrite old franchise names to their current ones in all three team columns.
#
# Series.map(dict) turns any value that is not a key of the dict into NaN, and
# the later dropna step would then silently delete those matches. Looking each
# name up with a fallback to itself keeps unknown names unchanged instead.
data['batting_team'] = data['batting_team'].map(lambda team: team_one.get(team, team))
data['bowling_team'] = data['bowling_team'].map(lambda team: team_two.get(team, team))

# team_one is built from batting_team only, so the fallback matters most here:
# a toss winner absent from that column keeps its original name.
data['toss_winner'] = data['toss_winner'].map(lambda team: team_one.get(team, team))

In [82]:
# Count missing values per column.
data.isnull().sum()

batting_team     0
Pplay T1         0
pplay twick1     0
bowling_team     0
Pplay T2         0
pplay twick2     0
toss_winner      0
toss_decision    0
venue            0
Pitch Type       0
dtype: int64

In [83]:
# Discard every row that has at least one missing value, mutating `data` in place.
data.dropna(axis=0, how='any', inplace=True)

In [84]:
# Re-count missing values per column after the dropna step.
data.isnull().sum()

batting_team     0
Pplay T1         0
pplay twick1     0
bowling_team     0
Pplay T2         0
pplay twick2     0
toss_winner      0
toss_decision    0
venue            0
Pitch Type       0
dtype: int64

## Preparing data for ML model

In [85]:
# Integer codes for the toss decision: 'field' -> 0, 'bat' -> 1.
toss_decision = {'field': 0, 'bat': 1}
toss_decision_codes = data['toss_decision'].map(toss_decision)

# Series.map yields NaN for any value that is not a key of the dict. That
# happens for an unexpected label, and also if this cell is run a second time
# on the already-encoded column. Fail loudly rather than overwrite the column
# with NaN.
if toss_decision_codes.isna().any():
    unmapped = data.loc[toss_decision_codes.isna(), 'toss_decision'].unique().tolist()
    raise ValueError(f"toss_decision contains values with no integer code: {unmapped}")

data['toss_decision'] = toss_decision_codes

In [86]:
# Integer codes for the pitch category.
pitch_type = {'Batting & Spinner Friendly': 0, 'Both': 1, 'Batting': 2, 'Bowling': 3}
pitch_type_codes = data['Pitch Type'].map(pitch_type)

# Series.map yields NaN for any value that is not a key of the dict. That
# happens for an unlisted category, and also if this cell is run a second time
# on the already-encoded column. Fail loudly rather than overwrite the column
# with NaN.
if pitch_type_codes.isna().any():
    unmapped = data.loc[pitch_type_codes.isna(), 'Pitch Type'].unique().tolist()
    raise ValueError(f"'Pitch Type' contains values with no integer code: {unmapped}")

data['Pitch Type'] = pitch_type_codes

In [87]:
# Give the two power-play run columns explicit '<name>_runs' labels.
powerplay_run_labels = {
    'Pplay T1': 'Pplay1_runs',
    'Pplay T2': 'Pplay2_runs',
}
data.rename(columns=powerplay_run_labels, inplace=True)

In [88]:
# Distinct venue strings; some carry a ", <locality>" suffix after the stadium name.
data['venue'].unique()

array(['Rajiv Gandhi International Stadium, Uppal',
       'Maharashtra Cricket Association Stadium',
       'Saurashtra Cricket Association Stadium', 'Holkar Cricket Stadium',
       'M Chinnaswamy Stadium', 'Wankhede Stadium', 'Eden Gardens',
       'Feroz Shah Kotla',
       'Punjab Cricket Association IS Bindra Stadium, Mohali',
       'Green Park', 'Punjab Cricket Association Stadium, Mohali',
       'Sawai Mansingh Stadium', 'MA Chidambaram Stadium, Chepauk',
       'Dr DY Patil Sports Academy', 'Newlands', "St George's Park",
       'Kingsmead', 'SuperSport Park', 'Buffalo Park',
       'New Wanderers Stadium', 'De Beers Diamond Oval',
       'OUTsurance Oval', 'Brabourne Stadium',
       'Sardar Patel Stadium, Motera', 'Barabati Stadium',
       'Vidarbha Cricket Association Stadium, Jamtha',
       'Himachal Pradesh Cricket Association Stadium', 'Nehru Stadium',
       'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
       'Subrata Roy Sahara Stadium',
       'Shaheed V

In [89]:
# Map each venue string to the text before its first comma. A name without a
# comma maps to itself, because split(",")[0] returns the whole string.
venue = {ground: ground.split(",")[0] for ground in data['venue'].unique()}
data['venue'] = data['venue'].map(venue)

In [90]:
# Preview the first rows after cleaning and encoding.
data.head()

Unnamed: 0,batting_team,Pplay1_runs,pplay twick1,bowling_team,Pplay2_runs,pplay twick2,toss_winner,toss_decision,venue,Pitch Type
0,Sunrisers Hyderabad,59,1,Royal Challengers Bangalore,54,1,Royal Challengers Bangalore,0,Rajiv Gandhi International Stadium,2
1,Mumbai Indians,61,1,Rising Pune Supergiant,59,1,Rising Pune Supergiant,0,Maharashtra Cricket Association Stadium,0
2,Gujarat Lions,52,1,Kolkata Knight Riders,73,0,Kolkata Knight Riders,0,Saurashtra Cricket Association Stadium,2
3,Rising Pune Supergiant,35,1,Punjab Kings,56,2,Punjab Kings,0,Holkar Cricket Stadium,2
4,Royal Challengers Bangalore,41,2,Delhi Capitals,43,2,Royal Challengers Bangalore,1,M Chinnaswamy Stadium,2


In [91]:
# Write the processed frame to preprocessed.csv; index=False leaves the row index out of the file.
data.to_csv('preprocessed.csv', index = False)

# -----------------------------------------------