In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('data/ipl.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76014 entries, 0 to 76013
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mid             76014 non-null  int64  
 1   date            76014 non-null  object 
 2   venue           76014 non-null  object 
 3   bat_team        76014 non-null  object 
 4   bowl_team       76014 non-null  object 
 5   batsman         76014 non-null  object 
 6   bowler          76014 non-null  object 
 7   runs            76014 non-null  int64  
 8   wickets         76014 non-null  int64  
 9   overs           76014 non-null  float64
 10  runs_last_5     76014 non-null  int64  
 11  wickets_last_5  76014 non-null  int64  
 12  striker         76014 non-null  int64  
 13  non-striker     76014 non-null  int64  
 14  total           76014 non-null  int64  
dtypes: float64(1), int64(8), object(6)
memory usage: 8.7+ MB


In [4]:
df.head()

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222


In [5]:
columns_to_remove = ['mid','date','bat_team','bowl_team','batsman', 'bowler', 'striker', 'non-striker']
df.drop(labels=columns_to_remove, axis=1, inplace=True)

In [6]:
df.head()

Unnamed: 0,venue,runs,wickets,overs,runs_last_5,wickets_last_5,total
0,M Chinnaswamy Stadium,1,0,0.1,1,0,222
1,M Chinnaswamy Stadium,1,0,0.2,1,0,222
2,M Chinnaswamy Stadium,2,0,0.2,2,0,222
3,M Chinnaswamy Stadium,2,0,0.3,2,0,222
4,M Chinnaswamy Stadium,2,0,0.4,2,0,222


In [7]:
df.isnull().sum()

venue             0
runs              0
wickets           0
overs             0
runs_last_5       0
wickets_last_5    0
total             0
dtype: int64

In [8]:
df.shape

(76014, 7)

In [9]:
df['venue'].unique()

array(['M Chinnaswamy Stadium',
       'Punjab Cricket Association Stadium, Mohali', 'Feroz Shah Kotla',
       'Wankhede Stadium', 'Eden Gardens', 'Sawai Mansingh Stadium',
       'Rajiv Gandhi International Stadium, Uppal',
       'MA Chidambaram Stadium, Chepauk', 'Dr DY Patil Sports Academy',
       'Newlands', "St George's Park", 'Kingsmead', 'SuperSport Park',
       'Buffalo Park', 'New Wanderers Stadium', 'De Beers Diamond Oval',
       'OUTsurance Oval', 'Brabourne Stadium',
       'Sardar Patel Stadium, Motera', 'Barabati Stadium',
       'Vidarbha Cricket Association Stadium, Jamtha',
       'Himachal Pradesh Cricket Association Stadium', 'Nehru Stadium',
       'Holkar Cricket Stadium',
       'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
       'Subrata Roy Sahara Stadium',
       'Shaheed Veer Narayan Singh International Stadium',
       'JSCA International Stadium Complex', 'Sheikh Zayed Stadium',
       'Sharjah Cricket Stadium', 'Dubai International Cricket St

In [10]:
#Keeping only indian stadiums  

foreign_stadiums = ['Newlands', "St George's Park", 'Kingsmead', 'SuperSport Park','Buffalo Park', 'New Wanderers Stadium', 'De Beers Diamond Oval',
       'OUTsurance Oval','Sheikh Zayed Stadium','Sharjah Cricket Stadium', 'Dubai International Cricket Stadium']
indian_stadiums = [i for i in df['venue'].unique() if i not in foreign_stadiums]

In [11]:
indian_stadiums

['M Chinnaswamy Stadium',
 'Punjab Cricket Association Stadium, Mohali',
 'Feroz Shah Kotla',
 'Wankhede Stadium',
 'Eden Gardens',
 'Sawai Mansingh Stadium',
 'Rajiv Gandhi International Stadium, Uppal',
 'MA Chidambaram Stadium, Chepauk',
 'Dr DY Patil Sports Academy',
 'Brabourne Stadium',
 'Sardar Patel Stadium, Motera',
 'Barabati Stadium',
 'Vidarbha Cricket Association Stadium, Jamtha',
 'Himachal Pradesh Cricket Association Stadium',
 'Nehru Stadium',
 'Holkar Cricket Stadium',
 'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
 'Subrata Roy Sahara Stadium',
 'Shaheed Veer Narayan Singh International Stadium',
 'JSCA International Stadium Complex',
 'Maharashtra Cricket Association Stadium',
 'Punjab Cricket Association IS Bindra Stadium, Mohali',
 'Saurashtra Cricket Association Stadium',
 'Green Park']

In [12]:
df = df[(df['venue'].isin(indian_stadiums))]

In [13]:
df

Unnamed: 0,venue,runs,wickets,overs,runs_last_5,wickets_last_5,total
0,M Chinnaswamy Stadium,1,0,0.1,1,0,222
1,M Chinnaswamy Stadium,1,0,0.2,1,0,222
2,M Chinnaswamy Stadium,2,0,0.2,2,0,222
3,M Chinnaswamy Stadium,2,0,0.3,2,0,222
4,M Chinnaswamy Stadium,2,0,0.4,2,0,222
...,...,...,...,...,...,...,...
76009,"Rajiv Gandhi International Stadium, Uppal",121,7,19.2,40,0,129
76010,"Rajiv Gandhi International Stadium, Uppal",127,7,19.3,46,0,129
76011,"Rajiv Gandhi International Stadium, Uppal",128,7,19.4,47,0,129
76012,"Rajiv Gandhi International Stadium, Uppal",129,7,19.5,48,0,129


In [14]:
label_encoder = LabelEncoder()

In [15]:
df['venue'] = label_encoder.fit_transform(df['venue'])

In [16]:
df['venue'].unique()

array([10, 15,  5, 23,  4, 19, 16, 11,  2,  1, 17,  0, 22,  7, 13,  8,  3,
       21, 20,  9, 12, 14, 18,  6])

In [17]:
df.head()

Unnamed: 0,venue,runs,wickets,overs,runs_last_5,wickets_last_5,total
0,10,1,0,0.1,1,0,222
1,10,1,0,0.2,1,0,222
2,10,2,0,0.2,2,0,222
3,10,2,0,0.3,2,0,222
4,10,2,0,0.4,2,0,222


In [18]:
df.corr()

Unnamed: 0,venue,runs,wickets,overs,runs_last_5,wickets_last_5,total
venue,1.0,-0.016704,-0.001359,0.002588,-0.020816,-0.000383,-0.031839
runs,-0.016704,1.0,0.59247,0.938566,0.752519,0.304245,0.253616
wickets,-0.001359,0.59247,1.0,0.755937,0.246597,0.700419,-0.329097
overs,0.002588,0.938566,0.755937,1.0,0.632521,0.392718,0.023318
runs_last_5,-0.020816,0.752519,0.246597,0.632521,1.0,0.063805,0.366696
wickets_last_5,-0.000383,0.304245,0.700419,0.392718,0.063805,1.0,-0.272767
total,-0.031839,0.253616,-0.329097,0.023318,0.366696,-0.272767,1.0


In [19]:
# Removing the first 5 overs data in every match
df = df[df['overs']>=19.5]

In [20]:
df.shape

(1035, 7)

In [31]:
X = df.drop('total',axis=1)
y = df['total']

In [32]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

In [33]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,y_train)

LinearRegression()

In [40]:
regressor.predict([[4,100,2,7,40,2]])



array([205.67676342])