In [2]:
import pandas as pd
import numpy as np
import os
import pickle

from tqdm import tqdm
from yaml import safe_load

In [7]:
# getting a list of filenames along with its path
filenames = []

for i in os.listdir(r'C:\Users\shubh\Desktop\ipl'):
    filenames.append(os.path.join(r'C:\Users\shubh\Desktop\ipl', i))

In [9]:
# extracting each file and putting it in dataframe
final_df_list = []
counter = 1

for i in tqdm(filenames):
    with open(i, 'r') as f:
        df = pd.json_normalize(safe_load(f))
        df['match_id'] = counter
        final_df_list.append(df)
        counter += 1
        
final_df = pd.concat(final_df_list, ignore_index = True)

100%|████████████████████████████████████████████████████████████████████████████████| 816/816 [02:40<00:00,  5.07it/s]


In [20]:
final_df.columns

Index(['innings', 'meta.data_version', 'meta.created', 'meta.revision',
       'info.city', 'info.competition', 'info.dates', 'info.gender',
       'info.match_type', 'info.outcome.by.runs', 'info.outcome.winner',
       'info.overs', 'info.player_of_match', 'info.teams',
       'info.toss.decision', 'info.toss.winner', 'info.umpires', 'info.venue',
       'match_id', 'info.outcome.by.wickets', 'info.outcome.eliminator',
       'info.outcome.result', 'info.outcome.method', 'info.neutral_venue'],
      dtype='object')

In [13]:
pickle.dump(final_df, open('raw_ipl.pkl', 'wb'))

In [38]:
raw_df = pickle.load(open('raw_ipl.pkl', 'rb'))

In [43]:
raw_df = raw_df[['innings', 'info.city', 'info.teams', 'info.venue', 'match_id']]

In [44]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   innings     816 non-null    object
 1   info.city   765 non-null    object
 2   info.teams  816 non-null    object
 3   info.venue  816 non-null    object
 4   match_id    816 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 32.0+ KB


In [46]:
# extracting data from 'innings' column and creating new dataframe
delivery_df_list = []
count = 1

for index, row in raw_df.iterrows():
    if count in [75,108,150,180,268,360,443,458,584,748,982,1052,1111,1226,1345]:
        count+=1
        continue
    count+=1
    ball_of_match = []
    batsman = []
    bowler = []
    runs = []
    player_of_dismissed = []
    teams = []
    batting_team = []
    match_id = []
    city = []
    venue = []
    for ball in row['innings'][0]['1st innings']['deliveries']:
        for key in ball.keys():
            match_id.append(count)
            batting_team.append(row['innings'][0]['1st innings']['team'])
            teams.append(row['info.teams'])
            ball_of_match.append(key)
            batsman.append(ball[key]['batsman'])
            bowler.append(ball[key]['bowler'])
            runs.append(ball[key]['runs']['total'])
            city.append(row['info.city'])
            venue.append(row['info.venue'])
            try:
                player_of_dismissed.append(ball[key]['wicket']['player_out'])
            except:
                player_of_dismissed.append('0')    
            
    loop_df = pd.DataFrame({
            'match_id':match_id,
            'teams':teams,
            'batting_team':batting_team,
            'ball':ball_of_match,
            'runs':runs,
            'player_dismissed':player_of_dismissed,
            'city':city,
            'venue':venue
    })

    # Append loop_df to delivery_df_list
    delivery_df_list.append(loop_df)

# Concatenate all DataFrames in delivery_df_list into a single DataFrame
df = pd.concat(delivery_df_list, ignore_index=True)


In [50]:
pickle.dump(df, open('IPL.pkl', 'wb'))

In [180]:
df = pickle.load(open('IPL.pkl', 'rb'))

In [181]:
def bowling_team_extractor(row):
    for i in row['teams']:
        if i != row['batting_team']:
            return i

In [182]:
df['bowling_team'] = df.apply(bowling_team_extractor, axis = 1)

In [183]:
eligible_teams = ['Mumbai Indians', 'Kings XI Punjab', 'Royal Challengers Bangalore', 'Chennai Super Kings', 'Kolkata Knight Riders', 'Rajasthan Royals', 'Sunrisers Hyderabad', 'Delhi Capitals']

df = df[df['batting_team'].isin(eligible_teams) & df['bowling_team'].isin(eligible_teams)]

In [184]:
df.loc[df['city'].isnull(), 'city'] = df.loc[df['city'].isnull(), 'venue'].apply(lambda x: x.split(' ')[0])

In [185]:
df.drop(['venue', 'teams'], axis = 1, inplace = True)

In [186]:
df = df.groupby('match_id')['runs'].sum().reset_index().merge(df, on = 'match_id')

In [187]:
df.rename(columns = {'runs_x' : 'total_runs', 'runs_y' : 'runs'}, inplace = True)

In [188]:
df['current_score'] = df.groupby('match_id')['runs'].cumsum()

In [189]:
df.loc[df['player_dismissed'] != '0', 'player_dismissed'] = 1
df.loc[df['player_dismissed'] == '0', 'player_dismissed'] = 0

In [190]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60426 entries, 0 to 60425
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   match_id          60426 non-null  int64  
 1   total_runs        60426 non-null  int64  
 2   batting_team      60426 non-null  object 
 3   ball              60426 non-null  float64
 4   runs              60426 non-null  int64  
 5   player_dismissed  60426 non-null  object 
 6   city              60426 non-null  object 
 7   bowling_team      60426 non-null  object 
 8   current_score     60426 non-null  int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 4.1+ MB


In [191]:
df['player_dismissed'] = df['player_dismissed'].astype(int)

In [192]:
df['wickets_left'] = 10 - (df.groupby('match_id')['player_dismissed'].cumsum())

In [193]:
df[df['match_id'] == 2].head()

Unnamed: 0,match_id,total_runs,batting_team,ball,runs,player_dismissed,city,bowling_team,current_score,wickets_left
0,2,207,Sunrisers Hyderabad,0.1,0,0,Hyderabad,Royal Challengers Bangalore,0,10
1,2,207,Sunrisers Hyderabad,0.2,0,0,Hyderabad,Royal Challengers Bangalore,0,10
2,2,207,Sunrisers Hyderabad,0.3,4,0,Hyderabad,Royal Challengers Bangalore,4,10
3,2,207,Sunrisers Hyderabad,0.4,0,0,Hyderabad,Royal Challengers Bangalore,4,10
4,2,207,Sunrisers Hyderabad,0.5,2,0,Hyderabad,Royal Challengers Bangalore,6,10


In [194]:
df['over_no'] = df['ball'].apply(lambda x: int(str(x).split('.')[0]))
df['ball_no'] = df['ball'].apply(lambda x: int(str(x).split('.')[1]))

In [195]:
df['balls_left'] = 120 - ((df['over_no'] * 6) + (df['ball_no']))

In [197]:
df.loc[:, 'crr'] = round((df['current_score'] / ((df['over_no'] * 6) + df['ball_no'])) * 6, 2)

In [201]:
df.loc[df['balls_left'] < 0, 'balls_left'] = 0

In [218]:
df['last_five'] = df.groupby('match_id')['runs'].rolling(window = 30).sum().values

In [222]:
df.columns

Index(['match_id', 'total_runs', 'batting_team', 'ball', 'runs',
       'player_dismissed', 'city', 'bowling_team', 'current_score',
       'wickets_left', 'over_no', 'ball_no', 'balls_left', 'crr', 'last_five'],
      dtype='object')

In [223]:
df.drop(['ball', 'runs', 'player_dismissed', 'over_no', 'ball_no', 'match_id'], axis = 1, inplace = True)

In [225]:
df.dropna(inplace = True)

In [252]:
df = df.sample(df.shape[0])

In [254]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score,mean_absolute_error

from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

In [255]:
X = df.drop('total_runs', axis = 1)
y = df['total_runs']

In [264]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [256]:
ct = ColumnTransformer([
    ('ohe', OneHotEncoder(sparse_output = False, drop = 'first'), ['batting_team', 'city', 'bowling_team'])
], remainder = 'passthrough')

In [257]:
pipe_lr = Pipeline([
    ('ct', ct),
    ('ss', StandardScaler()),
    ('lr', LinearRegression())
])

In [260]:
pipe_rfr = Pipeline([
    ('ct', ct),
    ('ss', StandardScaler()),
    ('rfr', RandomForestRegressor())
])

In [261]:
pipe_xgbr = Pipeline([
    ('ct', ct),
    ('ss', StandardScaler()),
    ('xgbr', XGBRegressor(n_estimators=1000,learning_rate=0.2,max_depth=12,random_state=1))
])

In [267]:
np.mean(cross_val_score(pipe_xgbr, X, y, cv=10, scoring='r2', n_jobs=-1, verbose=2, error_score='raise'))


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.2min finished


0.9771244905480707

In [268]:
pipe_xgbr.fit(X_train, y_train)

In [269]:
print('Test score: ', r2_score(y_test, pipe_xgbr.predict(X_test)))
print('Train score: ', r2_score(y_train, pipe_xgbr.predict(X_train)))

Test score:  0.9724291034189558
Train score:  0.999956167784394
