In [1]:
import pandas as pd
import numpy as np

In [5]:
dataset = pd.read_csv('/content/ipl_data.csv')
dataset.head()

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222


In [6]:
irrelevant = ['mid', 'date', 'venue','batsman', 'bowler', 'striker', 'non-striker']

dataset = dataset.drop(irrelevant,axis=1)

dataset.shape

(76014, 8)

In [7]:
const_teams = ['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
              'Mumbai Indians', 'Kings XI Punjab', 'Royal Challengers Bangalore',
              'Delhi Daredevils', 'Sunrisers Hyderabad']


dataset = dataset[(dataset['bat_team'].isin(const_teams)) & (dataset['bowl_team'].isin(const_teams))]

dataset.shape

(53811, 8)

In [8]:
dataset = dataset[dataset['overs']>=5.0]
dataset.shape

(40108, 8)

In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in ['bat_team','bowl_team']:
  dataset[col] = le.fit_transform(dataset[col])
dataset.head()

Unnamed: 0,bat_team,bowl_team,runs,wickets,overs,runs_last_5,wickets_last_5,total
32,3,6,61,0,5.1,59,0,222
33,3,6,61,1,5.2,59,1,222
34,3,6,61,1,5.3,59,1,222
35,3,6,61,1,5.4,59,1,222
36,3,6,61,1,5.5,58,1,222


In [11]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer([('encoder',OneHotEncoder(),[0,1])],remainder = 'passthrough')

dataset = np.array(ct.fit_transform(dataset))

In [13]:
cols = ['batting_team_Chennai Super Kings', 'batting_team_Delhi Daredevils', 'batting_team_Kings XI Punjab',
              'batting_team_Kolkata Knight Riders', 'batting_team_Mumbai Indians', 'batting_team_Rajasthan Royals',
              'batting_team_Royal Challengers Bangalore', 'batting_team_Sunrisers Hyderabad',
              'bowling_team_Chennai Super Kings', 'bowling_team_Delhi Daredevils', 'bowling_team_Kings XI Punjab',
              'bowling_team_Kolkata Knight Riders', 'bowling_team_Mumbai Indians', 'bowling_team_Rajasthan Royals',
              'bowling_team_Royal Challengers Bangalore', 'bowling_team_Sunrisers Hyderabad', 'runs', 'wickets', 'overs',
       'runs_last_5', 'wickets_last_5', 'total']

dataset = pd.DataFrame(dataset,columns = cols)

dataset.shape

(40108, 22)

In [14]:
features = dataset.drop(['total'],axis = 1)
labels = dataset['total']

In [15]:
from sklearn.model_selection import train_test_split

train_features,test_features,train_labels,test_labels = train_test_split(features,labels,test_size = 0.20,shuffle = True)

train_features.shape
test_features.shape

(8022, 21)

**Linear Regression**

In [16]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(train_features,train_labels)

In [19]:
lr_score_train = str(lr.score(train_features,train_labels)*100)
lr_score_test = str(lr.score(test_features,test_labels)*100)

print('Train Score = '+lr_score_train[:4]+'%')
print('Test Score = '+lr_score_test[:4]+'%')

Train Score = 65.8%
Test Score = 66.3%


**Decision Tree**

In [20]:
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor()
tree.fit(train_features,train_labels)

In [21]:
tree_score_train = str(tree.score(train_features,train_labels)*100)
tree_score_test = str(tree.score(test_features,test_labels)*100)

print('Train Score = '+tree_score_train[:4]+'%')
print('Test Score = '+tree_score_test[:4]+'%')

Train Score = 99.9%
Test Score = 86.5%


**Random Forest Regression**

In [22]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(train_features,train_labels)

In [23]:
rfr_score_train = str(rfr.score(train_features,train_labels)*100)
rfr_score_test = str(rfr.score(test_features,test_labels)*100)

print('Train Score = '+rfr_score_train[:4]+'%')
print('Test Score = '+rfr_score_test[:4]+'%')

Train Score = 99.0%
Test Score = 93.5%


In [28]:
def predict_score(batting_team, bowling_team, runs, wickets, overs, runs_last_5, wickets_last_5, model=rfr):
  teams = [
        'Chennai Super Kings', 'Delhi Daredevils', 'Kings XI Punjab', 'Kolkata Knight Riders',
        'Mumbai Indians', 'Rajasthan Royals', 'Royal Challengers Bangalore', 'Sunrisers Hyderabad'
    ]
  def encode(team_name):
    return [1 if team_name==t in teams else 0 for t in teams]

  prediction_array = encode(batting_team) + encode(bowling_team)

  prediction_array += [runs, wickets, overs, runs_last_5, wickets_last_5]

  prediction_array = np.array([prediction_array])

  pred = model.predict(prediction_array)

  return int(round(pred[0]))

In [32]:
batting_team='Delhi Daredevils'
bowling_team='Chennai Super Kings'
score = predict_score(batting_team, bowling_team, overs=10.2, runs=68, wickets=3, runs_last_5=29, wickets_last_5=1)
print(f'Predicted Score : {score} || Actual Score : 147')

Predicted Score : 150 || Actual Score : 147


