In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,LabelEncoder


: 

In [None]:
# Load the match-level and the ball-by-ball CSV files; both paths are relative
# to the notebook's working directory.
matches = pd.read_csv('IPL_Matches_2008_2022.csv')
delivery = pd.read_csv('IPL_Ball_by_Ball_2008_2022.csv')

# Dataset Creation

In [None]:
# Preview the first two rows of the match-level table.
matches.head(2)

In [None]:
# Preview the first two rows of the ball-by-ball table.
delivery.head(2)

## Label encoding of ID (disabled)

In [None]:
# # prompt: apply label encoder on ID column

# # Assuming 'matches' DataFrame is already loaded as shown in the previous code

# le = LabelEncoder()
# matches['ID'] = le.fit_transform(matches['ID'])
# delivery['ID'] = le.transform(delivery['ID'])

In [None]:
# matches[matches['ID'] == 1]

In [None]:
# delivery[delivery['ID'] == 1].head(124)

## Feature extraction

In [None]:
# List the available match-level columns before selecting a subset.
matches.columns

In [None]:
# Keep only the match-level columns that the rest of the notebook uses.
match_columns = ['ID','City','WinningTeam','Team1','Team2']
matches = matches.loc[:, match_columns]

In [None]:
# List the available ball-by-ball columns before selecting a subset.
delivery.columns

In [None]:
# Keep only the ball-by-ball columns that the rest of the notebook uses.
delivery_columns = ['ID','innings','overs','ballnumber','total_run','player_out','isWicketDelivery','BattingTeam']
delivery = delivery.loc[:, delivery_columns]

## Add each match's first-innings runs (the basis of the chase target) to the match data -> match_df

In [None]:
# total_runs per inning
# Select 'total_run' BEFORE aggregating: summing the whole frame also
# aggregates the text columns (player_out, BattingTeam), which is wasted work
# and behaves differently across pandas versions.
total_score_df = delivery.groupby(['ID','innings'])['total_run'].sum().reset_index()
total_score_df.head(2)

In [None]:
# Restrict the per-innings totals to the first innings of every match.
is_first_innings = total_score_df['innings'].eq(1)
total_score_df = total_score_df.loc[is_first_innings]
total_score_df.head(2)

In [None]:
# Attach the chase target to every match.
# The side batting second must score one run MORE than the first-innings total
# to win, so the value stored in 'total_run' is (first-innings total + 1).
# Downstream it becomes 'total_run_x' and is used as the target when computing
# runs_left, which therefore reaches 0 (or below) exactly when the chase is won.
first_innings_target = total_score_df[['ID','total_run']].assign(
    total_run=lambda frame: frame['total_run'] + 1
)
match_df = matches.merge(first_innings_target, on='ID')
match_df.head(2)

##match_df + delivery => delivery_df

In [None]:
# Join the match-level columns onto every delivery of that match. Both frames
# carry a 'total_run' column, so the merge suffixes them: 'total_run_x' comes
# from match_df and 'total_run_y' from delivery.
delivery_df = pd.merge(match_df, delivery, on='ID')

## Team name corrections and filtering to current teams

In [None]:
# Inspect the distinct team names to spot franchises that appear under more than one name.
match_df['Team1'].unique()

In [None]:
# Old franchise name -> the current name it is folded into.
team_renames = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Kings XI Punjab': 'Punjab Kings',
    'Gujarat Lions': 'Gujarat Titans',
    'Deccan Chargers': 'Sunrisers Hyderabad',
}

# Teams that are currently playing; matches involving any other side are removed.
teams = ['Rajasthan Royals', 'Royal Challengers Bangalore',
       'Sunrisers Hyderabad', 'Delhi Capitals', 'Chennai Super Kings',
       'Gujarat Titans', 'Lucknow Super Giants', 'Kolkata Knight Riders',
       'Punjab Kings', 'Mumbai Indians']

# match_df: rename in every team-valued column (WinningTeam included, so that it
# can still be compared with the renamed team columns), then keep only fixtures
# between two current teams.
match_team_cols = ['Team1', 'Team2', 'WinningTeam']
match_df[match_team_cols] = match_df[match_team_cols].replace(team_renames)
match_df = match_df[match_df['Team1'].isin(teams) & match_df['Team2'].isin(teams)].copy()

# delivery_df was merged from match_df BEFORE these corrections, so it still
# holds the old names. Apply the same renames here; otherwise innings played
# under an old name are dropped instead of renamed, and the bowling side
# (derived later from Team1/Team2) could still be a defunct franchise.
delivery_team_cols = match_team_cols + ['BattingTeam']
delivery_df[delivery_team_cols] = delivery_df[delivery_team_cols].replace(team_renames)
delivery_df = delivery_df[
    delivery_df['Team1'].isin(teams)
    & delivery_df['Team2'].isin(teams)
    & delivery_df['BattingTeam'].isin(teams)
].copy()

##NaN in delivery_df

In [None]:
# Count missing values per column to decide how each one is handled.
delivery_df.isna().sum()

In [None]:
# Rows without a City or a WinningTeam are discarded; a missing player_out is
# replaced with the string '0', meaning no wicket fell on that ball.
delivery_df = (
    delivery_df
    .dropna(subset=['City', 'WinningTeam'])
    .assign(player_out=lambda frame: frame['player_out'].fillna('0'))
)

## Keep only the second innings in delivery_df (the prediction is made for the team batting second)

In [None]:
# taking data for only second innings
# .copy() makes the filtered result an independent frame, so the new columns
# assigned to delivery_df in the following cells do not trigger pandas'
# SettingWithCopyWarning.
delivery_df = delivery_df[delivery_df['innings'] == 2].copy()

##creating some columns

###BallingTeam

In [None]:
# The bowling side is whichever of Team1 / Team2 is not batting:
# Team2 when the batting team is Team1, otherwise Team1.
batting_is_team1 = delivery_df['BattingTeam'].eq(delivery_df['Team1'])
delivery_df['BallingTeam'] = delivery_df['Team2'].where(batting_is_team1, delivery_df['Team1'])


###current_score : current runs on each ball

In [None]:
# Running total of the runs scored so far within each match (per-ball
# 'total_run_y' accumulated in row order, grouped by match ID).
runs_per_ball = delivery_df['total_run_y']
delivery_df['current_score'] = runs_per_ball.groupby(delivery_df['ID']).cumsum()

### runs_left : runs still needed to win = target - current_score

In [None]:
# Runs the batting team still needs: the match-level 'total_run_x' minus the
# running score.
delivery_df['runs_left'] = delivery_df['total_run_x'].sub(delivery_df['current_score'])

In [None]:
# Locate the rows where runs_left has gone negative.
has_negative_runs_left = delivery_df['runs_left'].lt(0)
delivery_df.loc[has_negative_runs_left].head(2)

# These are final balls of a chase: e.g. the team needs 3 runs and the batter
# hits a 6, so more runs are scored than were required and runs_left drops
# below zero. The rows are kept because they can help signal a win to the model.

In [None]:
# # prompt: remove this columns where runs_left have -ve values

# delivery_df = delivery_df[delivery_df['runs_left'] >= 0]


### ballnumber correction
###balls_left : how many balls left from 20*6

In [None]:
# Inspect which ball numbers occur; values above 6 are removed in the next cell.
delivery_df['ballnumber'].unique()

In [None]:
# Drop rows where ballnumber > 6
# .copy() makes the filtered result an independent frame, so the columns
# assigned to delivery_df afterwards do not trigger SettingWithCopyWarning.
# NOTE(review): the cumulative wicket columns are computed AFTER this filter,
# so a wicket that falls on one of the dropped deliveries is never counted.
# Consider computing the cumulative wickets before dropping rows.
delivery_df = delivery_df[delivery_df['ballnumber'] <= 6].copy()


In [None]:
# balls_left = 120 (20 overs * 6 balls) minus the balls bowled so far,
# where balls bowled = overs * 6 + ballnumber.
balls_bowled_so_far = delivery_df['overs'].mul(6).add(delivery_df['ballnumber'])
delivery_df['balls_left'] = balls_bowled_so_far.rsub(120)

### wicket_left : wickets in hand (out of 10)

In [None]:
# calculating wickets_left by 2 ways

In [None]:
# first way
# Check which values the wicket flag takes before summing it cumulatively.
delivery_df['isWicketDelivery'].unique()

In [None]:
# Wickets in hand = 10 minus the running count of wicket deliveries per match.
# NOTE(review): rows with ballnumber > 6 were removed earlier, so wickets that
# fell on those deliveries are not part of this running count.
wickets_fallen = delivery_df.groupby('ID')['isWicketDelivery'].cumsum()
delivery_df['wicket_left'] = wickets_fallen.rsub(10)

In [None]:
# second way
# Turn player_out into a 0/1 dismissal flag: the filler '0' (no wicket) becomes
# 0 and any player name becomes 1. Treating the integer 0 as "no wicket" too
# makes the cell idempotent: re-running it on the already converted column
# leaves the flags unchanged instead of turning every row into 1.
delivery_df['player_out'] = (~delivery_df['player_out'].isin(['0', 0])).astype('int')
wickets = delivery_df.groupby('ID')['player_out'].cumsum()
delivery_df['wickets'] = 10 - wickets

In [None]:
# Row/column count, for reference against the comparison counts in the next cell.
delivery_df.shape

In [None]:
# Check row by row whether both wicket calculations agree, then tally the
# True/False outcomes.
comparison_result = delivery_df['wickets'].eq(delivery_df['wicket_left'])
agreement_counts = comparison_result.value_counts()
print(agreement_counts)


In [None]:
# so both methods give the same wickets-left values

### Run rates
#### crr : current run rate<br>
#### rrr : required run rate

In [None]:
# rrr divides by balls_left, so a row with balls_left == 0 (the final ball of
# the innings) would produce an infinite/undefined required run rate: drop it.
# crr divides by the balls bowled (120 - balls_left), which is zero only when
# balls_left == 120. With balls_left = 120 - (overs*6 + ballnumber) that cannot
# happen as long as overs >= 0 and ballnumber >= 1, so the 120 check is kept
# purely as a defensive guard.
# .copy() makes the result an independent frame, so the crr / rrr / result
# columns assigned next do not trigger SettingWithCopyWarning.
delivery_df = delivery_df[~delivery_df['balls_left'].isin([0, 120])].copy()

In [None]:
# Current run rate: runs per over so far = current_score * 6 / balls bowled,
# where balls bowled = 120 - balls_left.
balls_faced = 120 - delivery_df['balls_left']
delivery_df['crr'] = delivery_df['current_score'].mul(6).div(balls_faced)

In [None]:
# Required run rate: runs per over still needed = runs_left * 6 / balls_left.
runs_needed_x6 = delivery_df['runs_left'].mul(6)
delivery_df['rrr'] = runs_needed_x6.div(delivery_df['balls_left'])

### result : 1 (win) if BattingTeam == WinningTeam in the second innings, else 0
The label is taken from the chasing side's point of view because the prediction is made for the team batting second.

In [None]:
# Label each delivery: 1 when the batting (chasing) team is the match winner,
# otherwise 0. Only second-innings rows remain at this point.
batting_team_won = delivery_df['WinningTeam'].eq(delivery_df['BattingTeam'])
delivery_df['result'] = batting_team_won.astype(int)
delivery_df.head()

# final_df : select the model columns from delivery_df and shuffle the rows

In [None]:
# Model table: the feature columns followed by the label ('result') as the
# last column.
model_columns = ['BattingTeam','BallingTeam','City','runs_left','balls_left','wicket_left','total_run_x','crr','rrr','result']
final_df = delivery_df.loc[:, model_columns]
final_df.head(2)

In [None]:
# The rows are ordered ball by ball, which can introduce ordering bias, so
# shuffle them all. A fixed random_state makes the shuffle (and therefore the
# train/test split and the reported accuracy) reproducible across runs.
final_df = final_df.sample(frac=1, random_state=1)
final_df.head(2)

# Model creation using a Pipeline and logistic regression

In [None]:
# Features are every column except the last; the label is the last column.
# NOTE(review): rows are individual deliveries, so deliveries of one match can
# land in both train and test; a per-match split may give a more honest score.
feature_names = final_df.columns[:-1]
label_name = final_df.columns[-1]
X = final_df[feature_names]
y = final_df[label_name]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [None]:
# One-hot encode the three categorical columns (dropping the first level of
# each) and pass every remaining column through unchanged.
categorical_columns = ['BattingTeam','BallingTeam','City']
one_hot = OneHotEncoder(sparse_output=False, drop='first')
trf1 = ColumnTransformer(
    transformers=[('ohe', one_hot, categorical_columns)],
    remainder='passthrough',
)

In [None]:
# Two-stage pipeline: column encoding followed by a logistic-regression classifier.
classifier = LogisticRegression(solver='liblinear')
pipeline_steps = [('step1', trf1), ('step2', classifier)]
pipe = Pipeline(steps=pipeline_steps)

In [None]:
# Fit on the training split, predict the held-out split and report accuracy.
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Predicted class probabilities for the test row at position 10.
pipe.predict_proba(X_test)[10]

In [None]:
# Predicted class label for the same test row (position 10).
pipe.predict(X_test)[10]

# Visualization

In [None]:
# Preview the shuffled model table.
final_df.head()

In [None]:
def match_summary(row):
    """Print the batting team, bowling team and target found in ``row``.

    ``row`` is any mapping-like object (e.g. a DataFrame row) with the keys
    'BattingTeam', 'BallingTeam' and 'total_run_x'.
    """
    batting_team = row['BattingTeam']
    bowling_team = row['BallingTeam']
    target_text = str(row['total_run_x'])
    pieces = ["Batting Team : ", batting_team, " \nBowling Team : ", bowling_team, " \nTarget : ", target_text]
    print("".join(pieces))


In [None]:
# Print the summary for the test row at position 4 as a quick check.
match_summary(X_test.iloc[4])

In [None]:
# Distinct match IDs that remain in delivery_df after the filtering above.
l = delivery_df['ID'].unique()

In [None]:
# choose id from here to visualize
# (a sorted copy of the match IDs collected in `l`)
np.sort(l)

In [None]:
# NOTE(review): this re-defines match_summary with a body identical to the
# definition a few cells earlier; the duplicate cell can be removed.
def match_summary(row):
    """Print the batting team, bowling team and target found in ``row``."""
    print("Batting Team : " + row['BattingTeam'] + " \nBowling Team : " + row['BallingTeam'] + " \nTarget : " + str(row['total_run_x']))

In [None]:
def match_progression(x_df,match_id,pipe):
    """Build an over-by-over summary of one chase with the model's probabilities.

    Parameters
    ----------
    x_df : DataFrame
        Delivery-level data containing 'ID', 'ballnumber', 'result' and the
        nine feature columns listed in ``feature_cols`` below.
    match_id
        Value matched against ``x_df['ID']``.
    pipe
        Fitted estimator exposing ``predict`` and ``predict_proba``.

    Returns
    -------
    (DataFrame, target)
        A frame with the columns 'end_of_over', 'runs_in_over',
        'wickets_in_over', 'win' and 'lose' (one row per retained over), and
        the 'total_run_x' value of the first retained row.

    Raises
    ------
    ValueError
        If no usable end-of-over rows exist for ``match_id``.

    Side effects: prints the match summary, the actual result and the model's
    prediction for the first retained over.
    """
    feature_cols = ['BattingTeam','BallingTeam','City','runs_left','balls_left',
                    'wicket_left','total_run_x','crr','rrr']

    # One row per over: the delivery numbered 6 within the requested match.
    t_df = x_df[(x_df['ID'] == match_id) & (x_df['ballnumber'] == 6)]

    # Model inputs only, without incomplete rows and without the final ball
    # (balls_left == 0). .copy() lets us add columns later without
    # SettingWithCopyWarning.
    features = t_df[feature_cols].dropna()
    features = features[features['balls_left'] != 0].copy()
    if features.empty:
        raise ValueError("no end-of-over rows found for match_id " + repr(match_id))

    # Probability column 0 is reported as 'lose', column 1 as 'win'.
    probabilities = pipe.predict_proba(features)
    target = features['total_run_x'].values[0]

    temp_df = features.copy()
    temp_df['lose'] = np.round(probabilities[:, 0]*100,1)
    temp_df['win'] = np.round(probabilities[:, 1]*100,1)
    temp_df['end_of_over'] = range(1,temp_df.shape[0]+1)

    # Runs scored in an over = drop in runs_left since the previous row
    # (starting from the target); wickets lost = drop in wicket_left
    # (starting from 10).
    temp_df['runs_in_over'] = -np.diff(temp_df['runs_left'].to_numpy(), prepend=target)
    temp_df['wickets_in_over'] = -np.diff(temp_df['wicket_left'].to_numpy(), prepend=10)

    match_summary(temp_df.iloc[-1])
    print("Actual Result : ",t_df['result'].values[0])
    # Predict from the feature columns only, so the derived display columns
    # are never handed to the pipeline.
    print("Prediction : ",pipe.predict(features)[0])
    temp_df = temp_df[['end_of_over','runs_in_over','wickets_in_over','win','lose']]
    return temp_df,target


In [None]:
# Build the over-by-over table for one match. 336028 is the hard-coded ID of
# the match to visualise and must be one of the IDs present in delivery_df.
temp_df,target = match_progression(delivery_df,336028,pipe)
temp_df

In [None]:
# Over-by-over view of the chase: runs per over as bars, with the wickets lost
# per over and the win / lose probabilities drawn as lines.
# matplotlib is imported with the other imports at the top of the notebook.
fig, ax = plt.subplots(figsize=(18, 8))
over_numbers = temp_df['end_of_over'].to_numpy()

# Every artist carries its own label, so the legend cannot fall out of step
# with the plotting order and the bars are labelled too.
ax.plot(over_numbers, temp_df['wickets_in_over'].to_numpy(), color='black', linewidth=3, label='Wickets out')
ax.plot(over_numbers, temp_df['win'].to_numpy(), color='#00a65a', linewidth=4, label='Win Probability')
ax.plot(over_numbers, temp_df['lose'].to_numpy(), color='red', linewidth=4, label='Lose Probability')
ax.bar(over_numbers, temp_df['runs_in_over'].to_numpy(), label='Runs in over')

ax.set_title('Target : ' + str(target))
ax.set_xlabel('End of over')
ax.set_ylabel('Probability (%) / runs / wickets')
ax.set_xticks(over_numbers)
ax.legend()
plt.show()

In [None]:
# import matplotlib.pyplot as plt
# plt.figure(figsize=(18,8))
# plt.plot(temp_df['end_of_over'],temp_df['wickets_in_over'],color='black',linewidth=3)
# plt.plot(temp_df['end_of_over'],temp_df['win'],color='#00a65a',linewidth=4)
# plt.plot(temp_df['end_of_over'],temp_df['lose'],color='red',linewidth=4)
# plt.bar(temp_df['end_of_over'],temp_df['runs_in_over'])
# plt.title('Target : ' + str(target))
# plt.legend(["Wickets out","Win Probability","Lose Probability"])
# plt.show()

In [None]:
# Persist the fitted pipeline. The context manager guarantees that the file is
# flushed and closed even if pickling raises; the bare open() call left the
# handle open.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [None]:
# Distinct City values present in the model table.
final_df['City'].unique()

In [None]:
# Distinct BattingTeam values present in the model table.
final_df['BattingTeam'].unique()

In [None]:
# Column names and order of the model table (features first, 'result' last).
final_df.columns