# Imports

In [252]:
import numpy as np
import pandas as pd
from yaml import safe_load
import os
from tqdm import tqdm
import pickle

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_absolute_error

import warnings
warnings.filterwarnings("ignore")

# Make DataFrame 

In [163]:
# filenames = []
# for file in os.listdir("Data"):
#     filenames.append(os.path.join("Data",file))

In [164]:
# filenames[0:5]

In [165]:
# final_df = pd.DataFrame()
# counter = 1

# data_frames = []

# for file in tqdm(filenames):
#     with open(file, 'r') as f:
#         df = pd.json_normalize(safe_load(f))
#         df["match_id"] = counter
#         data_frames.append(df)
#         counter += 1

# final_df = pd.concat(data_frames, ignore_index=True)
# print("Final DataFrame:\n", final_df.head())  # Print the first few rows of the final dataframe
# final_df

In [166]:
# backup = final_df.copy()

In [167]:
# final_df.head()

In [168]:
# pickle.dump(final_df,open('dataset.pkl','wb'))

In [169]:
df = pickle.load(open('dataset.pkl','rb'))

In [170]:
df

Unnamed: 0,innings,meta.data_version,meta.created,meta.revision,info.dates,info.gender,info.match_type,info.outcome.by.wickets,info.outcome.winner,info.overs,...,info.outcome.by.runs,info.match_type_number,info.neutral_venue,info.outcome.method,info.outcome.result,info.outcome.eliminator,info.supersubs.New Zealand,info.supersubs.South Africa,info.bowl_out,info.outcome.bowl_out
0,"[{'1st innings': {'team': 'Australia', 'delive...",0.9,2017-02-18,2,[2017-02-17],male,T20,5.0,Sri Lanka,20,...,,,,,,,,,,
1,"[{'1st innings': {'team': 'Australia', 'delive...",0.9,2017-02-19,2,[2017-02-19],male,T20,2.0,Sri Lanka,20,...,,,,,,,,,,
2,"[{'1st innings': {'team': 'Australia', 'delive...",0.9,2017-02-23,1,[2017-02-22],male,T20,,Australia,20,...,41.0,,,,,,,,,
3,"[{'1st innings': {'team': 'Hong Kong', 'delive...",0.9,2016-09-12,1,[2016-09-05],male,T20,,Hong Kong,20,...,40.0,,,,,,,,,
4,"[{'1st innings': {'team': 'Zimbabwe', 'deliver...",0.9,2016-06-19,1,[2016-06-18],male,T20,,Zimbabwe,20,...,2.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1427,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",0.9,2016-03-05,2,[2016-03-04],male,T20,6.0,Pakistan,20,...,,,1.0,,,,,,,
1428,"[{'1st innings': {'team': 'Bangladesh', 'deliv...",0.9,2016-03-08,1,[2016-03-06],male,T20,8.0,India,20,...,,,,,,,,,,
1429,"[{'1st innings': {'team': 'Netherlands', 'deli...",0.9,2016-02-03,1,[2016-02-03],male,T20,,Netherlands,20,...,84.0,,,,,,,,,
1430,"[{'1st innings': {'team': 'Australia', 'delive...",0.9,2016-09-12,1,[2016-09-06],male,T20,,Australia,20,...,85.0,,,,,,,,,


In [171]:
df.shape

(1432, 28)

## Remove unneccesary columns

In [172]:
df.columns

Index(['innings', 'meta.data_version', 'meta.created', 'meta.revision',
       'info.dates', 'info.gender', 'info.match_type',
       'info.outcome.by.wickets', 'info.outcome.winner', 'info.overs',
       'info.player_of_match', 'info.teams', 'info.toss.decision',
       'info.toss.winner', 'info.umpires', 'info.venue', 'match_id',
       'info.city', 'info.outcome.by.runs', 'info.match_type_number',
       'info.neutral_venue', 'info.outcome.method', 'info.outcome.result',
       'info.outcome.eliminator', 'info.supersubs.New Zealand',
       'info.supersubs.South Africa', 'info.bowl_out',
       'info.outcome.bowl_out'],
      dtype='object')

In [173]:
df.drop(columns=[
    'meta.data_version',
    'meta.created',
    'meta.revision',
    'info.outcome.bowl_out',
    'info.bowl_out',
    'info.supersubs.South Africa',
    'info.supersubs.New Zealand',
    'info.outcome.eliminator',
    'info.outcome.result',
    'info.outcome.method',
    'info.neutral_venue',
    'info.match_type_number',
    'info.outcome.by.runs',
    'info.outcome.by.wickets'
],inplace=True)

In [174]:
df.shape

(1432, 14)

## Consider only men's T20 matches

In [175]:
df["info.gender"].value_counts()

info.gender
male      966
female    466
Name: count, dtype: int64

In [176]:
df = df[df["info.gender"]=="male"]

In [177]:
df = df.drop(["info.gender"],axis=1)

In [178]:
df.shape

(966, 13)

## Consider only 20 over matches

In [179]:
df["info.match_type"].value_counts()

info.match_type
T20    966
Name: count, dtype: int64

In [180]:
df["info.overs"].value_counts()

info.overs
20    963
50      3
Name: count, dtype: int64

In [181]:
df = df[df["info.overs"]==20]

In [182]:
df = df.drop(["info.match_type","info.overs"],axis=1)

In [183]:
df.shape

(963, 11)

## Save DataFrame

In [184]:
# pickle.dump(df,open("matches.pkl","wb"))

## Create DataFrame for every ball

In [185]:
df

Unnamed: 0,innings,info.dates,info.outcome.winner,info.player_of_match,info.teams,info.toss.decision,info.toss.winner,info.umpires,info.venue,match_id,info.city
0,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-17],Sri Lanka,[DAS Gunaratne],"[Australia, Sri Lanka]",field,Sri Lanka,"[MD Martell, P Wilson]",Melbourne Cricket Ground,1,
1,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-19],Sri Lanka,[DAS Gunaratne],"[Australia, Sri Lanka]",field,Sri Lanka,"[SD Fry, SJ Nogajski]","Simonds Stadium, South Geelong",2,Victoria
2,"[{'1st innings': {'team': 'Australia', 'delive...",[2017-02-22],Australia,[A Zampa],"[Australia, Sri Lanka]",field,Sri Lanka,"[MD Martell, P Wilson]",Adelaide Oval,3,
3,"[{'1st innings': {'team': 'Hong Kong', 'delive...",[2016-09-05],Hong Kong,,"[Ireland, Hong Kong]",bat,Hong Kong,"[R Black, AJ Neill]","Bready Cricket Club, Magheramason",4,Londonderry
4,"[{'1st innings': {'team': 'Zimbabwe', 'deliver...",[2016-06-18],Zimbabwe,[E Chigumbura],"[Zimbabwe, India]",field,India,"[TJ Matibiri, RB Tiffin]",Harare Sports Club,5,
...,...,...,...,...,...,...,...,...,...,...,...
1427,"[{'1st innings': {'team': 'Sri Lanka', 'delive...",[2016-03-04],Pakistan,[Umar Akmal],"[Pakistan, Sri Lanka]",field,Pakistan,"[AK Chaudhary, Enamul Haque]",Shere Bangla National Stadium,1428,Mirpur
1428,"[{'1st innings': {'team': 'Bangladesh', 'deliv...",[2016-03-06],India,[S Dhawan],"[Bangladesh, India]",field,India,"[RSA Palliyaguruge, Shozab Raza]",Shere Bangla National Stadium,1429,Mirpur
1429,"[{'1st innings': {'team': 'Netherlands', 'deli...",[2016-02-03],Netherlands,[Mudassar Bukhari],"[United Arab Emirates, Netherlands]",field,United Arab Emirates,"[CK Nandan, Sarika Prasad]",ICC Academy,1430,Dubai
1430,"[{'1st innings': {'team': 'Australia', 'delive...",[2016-09-06],Australia,[GJ Maxwell],"[Sri Lanka, Australia]",field,Sri Lanka,"[REJ Martinesz, RR Wimalasiri]",Pallekele International Cricket Stadium,1431,


In [186]:
df.iloc[0]["innings"][0]["1st innings"]

{'team': 'Australia',
 'deliveries': [{0.1: {'batsman': 'AJ Finch',
    'bowler': 'SL Malinga',
    'non_striker': 'M Klinger',
    'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
  {0.2: {'batsman': 'AJ Finch',
    'bowler': 'SL Malinga',
    'non_striker': 'M Klinger',
    'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
  {0.3: {'batsman': 'AJ Finch',
    'bowler': 'SL Malinga',
    'non_striker': 'M Klinger',
    'runs': {'batsman': 1, 'extras': 0, 'total': 1}}},
  {0.4: {'batsman': 'M Klinger',
    'bowler': 'SL Malinga',
    'non_striker': 'AJ Finch',
    'runs': {'batsman': 2, 'extras': 0, 'total': 2}}},
  {0.5: {'batsman': 'M Klinger',
    'bowler': 'SL Malinga',
    'non_striker': 'AJ Finch',
    'runs': {'batsman': 0, 'extras': 0, 'total': 0}}},
  {0.6: {'batsman': 'M Klinger',
    'bowler': 'SL Malinga',
    'non_striker': 'AJ Finch',
    'runs': {'batsman': 3, 'extras': 0, 'total': 3}}},
  {1.1: {'batsman': 'M Klinger',
    'bowler': 'KMDN Kulasekara',
    'non_striker

In [187]:
count = 1
delivery_df_list = []  # Initialize a list to store DataFrames

for index, row in df.iterrows():
    if count in [75, 108, 150, 180, 268, 360, 443, 458, 584, 748, 982, 1052, 1111, 1226, 1345]:
        count += 1
        continue
    count += 1
    ball_of_match = []
    batsman = []
    bowler = []
    runs = []
    player_of_dismissed = []
    teams = []
    batting_team = []
    match_id = []
    city = []
    venue = []
    for ball in row['innings'][0]['1st innings']['deliveries']:
        for key in ball.keys():
            match_id.append(count)
            batting_team.append(row['innings'][0]['1st innings']['team'])
            teams.append(row['info.teams'])
            ball_of_match.append(key)
            batsman.append(ball[key]['batsman'])
            bowler.append(ball[key]['bowler'])
            runs.append(ball[key]['runs']['total'])
            city.append(row['info.city'])
            venue.append(row['info.venue'])
            try:
                player_of_dismissed.append(ball[key]['wicket']['player_out'])
            except KeyError:
                player_of_dismissed.append('0')
    loop_df = pd.DataFrame({
        'match_id': match_id,
        'teams': teams,
        'batting_team': batting_team,
        'ball': ball_of_match,
        'batsman': batsman,
        'bowler': bowler,
        'runs': runs,
        'player_dismissed': player_of_dismissed,
        'city': city,
        'venue': venue
    })
    delivery_df_list.append(loop_df)  # Append the loop_df to the list

# Concatenate all DataFrames in the list into a single DataFrame
delivery_df = pd.concat(delivery_df_list, ignore_index=True)

In [188]:
delivery_df

Unnamed: 0,match_id,teams,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue
0,2,"[Australia, Sri Lanka]",Australia,0.1,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground
1,2,"[Australia, Sri Lanka]",Australia,0.2,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground
2,2,"[Australia, Sri Lanka]",Australia,0.3,AJ Finch,SL Malinga,1,0,,Melbourne Cricket Ground
3,2,"[Australia, Sri Lanka]",Australia,0.4,M Klinger,SL Malinga,2,0,,Melbourne Cricket Ground
4,2,"[Australia, Sri Lanka]",Australia,0.5,M Klinger,SL Malinga,0,0,,Melbourne Cricket Ground
...,...,...,...,...,...,...,...,...,...,...
115320,964,"[Sri Lanka, Australia]",Sri Lanka,19.3,SMSM Senanayake,MA Starc,1,0,Colombo,R Premadasa Stadium
115321,964,"[Sri Lanka, Australia]",Sri Lanka,19.4,DM de Silva,MA Starc,0,0,Colombo,R Premadasa Stadium
115322,964,"[Sri Lanka, Australia]",Sri Lanka,19.5,DM de Silva,MA Starc,0,DM de Silva,Colombo,R Premadasa Stadium
115323,964,"[Sri Lanka, Australia]",Sri Lanka,19.6,SMSM Senanayake,MA Starc,2,0,Colombo,R Premadasa Stadium


## Add bowling_team

In [189]:
def bowlingteam(row):
    for team in row["teams"]:
        if team != row["batting_team"]:
            return team

In [190]:
delivery_df["bowling_team"] = delivery_df.apply(bowlingteam,axis=1)
delivery_df

Unnamed: 0,match_id,teams,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
0,2,"[Australia, Sri Lanka]",Australia,0.1,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
1,2,"[Australia, Sri Lanka]",Australia,0.2,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
2,2,"[Australia, Sri Lanka]",Australia,0.3,AJ Finch,SL Malinga,1,0,,Melbourne Cricket Ground,Sri Lanka
3,2,"[Australia, Sri Lanka]",Australia,0.4,M Klinger,SL Malinga,2,0,,Melbourne Cricket Ground,Sri Lanka
4,2,"[Australia, Sri Lanka]",Australia,0.5,M Klinger,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
...,...,...,...,...,...,...,...,...,...,...,...
115320,964,"[Sri Lanka, Australia]",Sri Lanka,19.3,SMSM Senanayake,MA Starc,1,0,Colombo,R Premadasa Stadium,Australia
115321,964,"[Sri Lanka, Australia]",Sri Lanka,19.4,DM de Silva,MA Starc,0,0,Colombo,R Premadasa Stadium,Australia
115322,964,"[Sri Lanka, Australia]",Sri Lanka,19.5,DM de Silva,MA Starc,0,DM de Silva,Colombo,R Premadasa Stadium,Australia
115323,964,"[Sri Lanka, Australia]",Sri Lanka,19.6,SMSM Senanayake,MA Starc,2,0,Colombo,R Premadasa Stadium,Australia


In [191]:
delivery_df = delivery_df.drop("teams",axis=1)

In [192]:
delivery_df.shape

(115325, 10)

## Take only top 10 teams

In [193]:
delivery_df["batting_team"].unique()

array(['Australia', 'Hong Kong', 'Zimbabwe', 'India', 'Bangladesh',
       'New Zealand', 'South Africa', 'England', 'West Indies', 'Ireland',
       'Afghanistan', 'Pakistan', 'United Arab Emirates', 'Scotland',
       'Oman', 'Papua New Guinea', 'Sri Lanka', 'Netherlands', 'Nepal',
       'Vanuatu', 'Philippines', 'United States of America', 'Germany',
       'Ghana', 'Uganda', 'Kenya', 'Namibia', 'Nigeria', 'Botswana',
       'Guernsey', 'Denmark', 'Jersey', 'Italy', 'Norway', 'Thailand',
       'Malaysia', 'Maldives', 'Singapore', 'Kuwait', 'Bermuda', 'Canada',
       'Cayman Islands', 'Portugal', 'Gibraltar', 'Spain', 'Bhutan',
       'Qatar', 'Iran', 'Belgium', 'Isle of Man', 'Bulgaria', 'Romania'],
      dtype=object)

In [194]:
teams = [
    'Australia',
    'India',
    'Bangladesh',
    'New Zealand',
    'South Africa',
    'England',
    'West Indies',
    'Afghanistan',
    'Pakistan',
    'Sri Lanka'    
]

In [195]:
delivery_df = delivery_df[delivery_df["batting_team"].isin(teams)]
delivery_df = delivery_df[delivery_df["bowling_team"].isin(teams)]

In [196]:
delivery_df

Unnamed: 0,match_id,batting_team,ball,batsman,bowler,runs,player_dismissed,city,venue,bowling_team
0,2,Australia,0.1,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
1,2,Australia,0.2,AJ Finch,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
2,2,Australia,0.3,AJ Finch,SL Malinga,1,0,,Melbourne Cricket Ground,Sri Lanka
3,2,Australia,0.4,M Klinger,SL Malinga,2,0,,Melbourne Cricket Ground,Sri Lanka
4,2,Australia,0.5,M Klinger,SL Malinga,0,0,,Melbourne Cricket Ground,Sri Lanka
...,...,...,...,...,...,...,...,...,...,...
115320,964,Sri Lanka,19.3,SMSM Senanayake,MA Starc,1,0,Colombo,R Premadasa Stadium,Australia
115321,964,Sri Lanka,19.4,DM de Silva,MA Starc,0,0,Colombo,R Premadasa Stadium,Australia
115322,964,Sri Lanka,19.5,DM de Silva,MA Starc,0,DM de Silva,Colombo,R Premadasa Stadium,Australia
115323,964,Sri Lanka,19.6,SMSM Senanayake,MA Starc,2,0,Colombo,R Premadasa Stadium,Australia


In [197]:
df = delivery_df[['match_id','batting_team','bowling_team','ball','runs','player_dismissed','city','venue']]

In [198]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
0,2,Australia,Sri Lanka,0.1,0,0,,Melbourne Cricket Ground
1,2,Australia,Sri Lanka,0.2,0,0,,Melbourne Cricket Ground
2,2,Australia,Sri Lanka,0.3,1,0,,Melbourne Cricket Ground
3,2,Australia,Sri Lanka,0.4,2,0,,Melbourne Cricket Ground
4,2,Australia,Sri Lanka,0.5,0,0,,Melbourne Cricket Ground
...,...,...,...,...,...,...,...,...
115320,964,Sri Lanka,Australia,19.3,1,0,Colombo,R Premadasa Stadium
115321,964,Sri Lanka,Australia,19.4,0,0,Colombo,R Premadasa Stadium
115322,964,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,R Premadasa Stadium
115323,964,Sri Lanka,Australia,19.6,2,0,Colombo,R Premadasa Stadium


In [199]:
# pickle.dump(df,open("extracted_data.pkl","wb"))

# Feature Extraction

## What we want to train a model

* batting_team
* bowling_team
* current_score
* wickects_left
* current_run_rate
* city
* balls_left
* last_five

In [200]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,venue
0,2,Australia,Sri Lanka,0.1,0,0,,Melbourne Cricket Ground
1,2,Australia,Sri Lanka,0.2,0,0,,Melbourne Cricket Ground
2,2,Australia,Sri Lanka,0.3,1,0,,Melbourne Cricket Ground
3,2,Australia,Sri Lanka,0.4,2,0,,Melbourne Cricket Ground
4,2,Australia,Sri Lanka,0.5,0,0,,Melbourne Cricket Ground
...,...,...,...,...,...,...,...,...
115320,964,Sri Lanka,Australia,19.3,1,0,Colombo,R Premadasa Stadium
115321,964,Sri Lanka,Australia,19.4,0,0,Colombo,R Premadasa Stadium
115322,964,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,R Premadasa Stadium
115323,964,Sri Lanka,Australia,19.6,2,0,Colombo,R Premadasa Stadium


In [201]:
df.isna().sum()

match_id               0
batting_team           0
bowling_team           0
ball                   0
runs                   0
player_dismissed       0
city                8548
venue                  0
dtype: int64

## Fill city with help of venue

In [202]:
df[df["city"].isnull()]["venue"].value_counts()

venue
Dubai International Cricket Stadium        2969
Pallekele International Cricket Stadium    2066
Melbourne Cricket Ground                   1453
Sydney Cricket Ground                       749
Adelaide Oval                               498
Harare Sports Club                          372
Sharjah Cricket Stadium                     249
Sylhet International Cricket Stadium        128
Carrara Oval                                 64
Name: count, dtype: int64

In [203]:
# np.where(condition, true, false)
cities = np.where(df["city"].isnull(),df["venue"].str.split().apply(lambda x:x[0]),df["city"])

In [204]:
cities

array(['Melbourne', 'Melbourne', 'Melbourne', ..., 'Colombo', 'Colombo',
       'Colombo'], dtype=object)

In [205]:
df["city"] = cities

In [206]:
df.isna().sum()

match_id            0
batting_team        0
bowling_team        0
ball                0
runs                0
player_dismissed    0
city                0
venue               0
dtype: int64

In [207]:
df = df.drop("venue",axis=1)

In [208]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne
...,...,...,...,...,...,...,...
115320,964,Sri Lanka,Australia,19.3,1,0,Colombo
115321,964,Sri Lanka,Australia,19.4,0,0,Colombo
115322,964,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo
115323,964,Sri Lanka,Australia,19.6,2,0,Colombo


In [209]:
df.city.value_counts()

city
Colombo          4086
Mirpur           3420
Johannesburg     3331
Dubai            2969
Auckland         2532
                 ... 
Nairobi           123
Potchefstroom     122
Dharamsala        122
Ahmedabad         121
Carrara            64
Name: count, Length: 86, dtype: int64

## Drop cities(rows) having less than or equal to 5 matches 

In [210]:
eligible_cities = df["city"].value_counts()[df["city"].value_counts() > 600].index.tolist()

In [211]:
df = df[df["city"].isin(eligible_cities)]

In [212]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne
...,...,...,...,...,...,...,...
115320,964,Sri Lanka,Australia,19.3,1,0,Colombo
115321,964,Sri Lanka,Australia,19.4,0,0,Colombo
115322,964,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo
115323,964,Sri Lanka,Australia,19.6,2,0,Colombo


## Extract other features

In [213]:
df["current_score"] = df.groupby("match_id")["runs"].cumsum()

In [214]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3
...,...,...,...,...,...,...,...,...
115320,964,Sri Lanka,Australia,19.3,1,0,Colombo,125
115321,964,Sri Lanka,Australia,19.4,0,0,Colombo,125
115322,964,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,125
115323,964,Sri Lanka,Australia,19.6,2,0,Colombo,127


In [215]:
df["over"] = df["ball"].apply(lambda x: str(x).split(".")[0])
df["ball_no"] = df["ball"].apply(lambda x: str(x).split(".")[1])

In [216]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5
...,...,...,...,...,...,...,...,...,...,...
115320,964,Sri Lanka,Australia,19.3,1,0,Colombo,125,19,3
115321,964,Sri Lanka,Australia,19.4,0,0,Colombo,125,19,4
115322,964,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,125,19,5
115323,964,Sri Lanka,Australia,19.6,2,0,Colombo,127,19,6


In [217]:
df['balls_bowled'] = (df['over'].astype('int')*6) + df['ball_no'].astype('int')
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1,1
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2,2
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3,3
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4,4
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5,5
...,...,...,...,...,...,...,...,...,...,...,...
115320,964,Sri Lanka,Australia,19.3,1,0,Colombo,125,19,3,117
115321,964,Sri Lanka,Australia,19.4,0,0,Colombo,125,19,4,118
115322,964,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,125,19,5,119
115323,964,Sri Lanka,Australia,19.6,2,0,Colombo,127,19,6,120


In [218]:
df['balls_left'] = 120 - df['balls_bowled']
df['balls_left'] = df['balls_left'].apply(lambda x:0 if x<0 else x)
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled,balls_left
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1,1,119
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2,2,118
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3,3,117
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4,4,116
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5,5,115
...,...,...,...,...,...,...,...,...,...,...,...,...
115320,964,Sri Lanka,Australia,19.3,1,0,Colombo,125,19,3,117,3
115321,964,Sri Lanka,Australia,19.4,0,0,Colombo,125,19,4,118,2
115322,964,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,125,19,5,119,1
115323,964,Sri Lanka,Australia,19.6,2,0,Colombo,127,19,6,120,0


In [219]:
df["match_id"].dtype

dtype('int64')

In [220]:
# # Convert 'player_dismissed' to 0 or 1
# df['player_dismissed'] = df['player_dismissed'].apply(lambda x: 0 if x == '0' else 1)

# # Ensure 'player_dismissed' is of integer type
# df['player_dismissed'] = df['player_dismissed'].astype(int)

# # Group by 'match_id' and calculate the cumulative sum of 'player_dismissed'
# df['player_dismissed_cumsum'] = df.groupby('match_id')['player_dismissed'].cumsum()

# # Calculate 'wickets_left'
# df['wickets_left'] = 10 - df['player_dismissed_cumsum']

# # Drop the intermediate 'player_dismissed_cumsum' column
# df.drop(columns=['player_dismissed_cumsum'], inplace=True)


In [221]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled,balls_left
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1,1,119
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2,2,118
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3,3,117
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4,4,116
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5,5,115
...,...,...,...,...,...,...,...,...,...,...,...,...
115320,964,Sri Lanka,Australia,19.3,1,0,Colombo,125,19,3,117,3
115321,964,Sri Lanka,Australia,19.4,0,0,Colombo,125,19,4,118,2
115322,964,Sri Lanka,Australia,19.5,0,DM de Silva,Colombo,125,19,5,119,1
115323,964,Sri Lanka,Australia,19.6,2,0,Colombo,127,19,6,120,0


In [224]:
unique_values = df['player_dismissed'].unique()
print(unique_values)


[0 1]


In [223]:
df['player_dismissed'] = df['player_dismissed'].apply(lambda x: 0 if x == '0' else 1)


In [225]:
df["player_dismissed"].dtype

dtype('int64')

In [226]:
df['player_dismissed'] = df['player_dismissed'].astype(int)


In [227]:
df['player_dismissed_cumsum'] = df.groupby('match_id')['player_dismissed'].cumsum()


In [229]:
df['wickets_left'] = 10 - df['player_dismissed_cumsum']


In [231]:
df.drop(columns=['player_dismissed_cumsum'], inplace=True)


In [234]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled,balls_left,wickets_left
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1,1,119,10
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2,2,118,10
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3,3,117,10
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4,4,116,10
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5,5,115,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
115320,964,Sri Lanka,Australia,19.3,1,0,Colombo,125,19,3,117,3,2
115321,964,Sri Lanka,Australia,19.4,0,0,Colombo,125,19,4,118,2,2
115322,964,Sri Lanka,Australia,19.5,0,1,Colombo,125,19,5,119,1,1
115323,964,Sri Lanka,Australia,19.6,2,0,Colombo,127,19,6,120,0,1


In [235]:
df['crr'] = (df['current_score']*6)/df['balls_bowled']

In [236]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled,balls_left,wickets_left,crr
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1,1,119,10,0.000000
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2,2,118,10,0.000000
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3,3,117,10,2.000000
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4,4,116,10,4.500000
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5,5,115,10,3.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115320,964,Sri Lanka,Australia,19.3,1,0,Colombo,125,19,3,117,3,2,6.410256
115321,964,Sri Lanka,Australia,19.4,0,0,Colombo,125,19,4,118,2,2,6.355932
115322,964,Sri Lanka,Australia,19.5,0,1,Colombo,125,19,5,119,1,1,6.302521
115323,964,Sri Lanka,Australia,19.6,2,0,Colombo,127,19,6,120,0,1,6.350000


In [239]:
# Ensure 'runs' column is numeric
df['runs'] = pd.to_numeric(df['runs'], errors='coerce')

# Group by 'match_id'
groups = df.groupby('match_id')

# Get unique match IDs
match_ids = df['match_id'].unique()

# Initialize list to store rolling sums
last_five = []

# Calculate the rolling sum for each match
for id in match_ids:
    match_data = groups.get_group(id)
    rolling_sums = match_data['runs'].rolling(window=30).sum().values.tolist()
    last_five.extend(rolling_sums)


In [240]:
df['last_five'] = last_five

In [241]:
df

Unnamed: 0,match_id,batting_team,bowling_team,ball,runs,player_dismissed,city,current_score,over,ball_no,balls_bowled,balls_left,wickets_left,crr,last_five
0,2,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1,1,119,10,0.000000,
1,2,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2,2,118,10,0.000000,
2,2,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3,3,117,10,2.000000,
3,2,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4,4,116,10,4.500000,
4,2,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5,5,115,10,3.600000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115320,964,Sri Lanka,Australia,19.3,1,0,Colombo,125,19,3,117,3,2,6.410256,32.0
115321,964,Sri Lanka,Australia,19.4,0,0,Colombo,125,19,4,118,2,2,6.355932,32.0
115322,964,Sri Lanka,Australia,19.5,0,1,Colombo,125,19,5,119,1,1,6.302521,32.0
115323,964,Sri Lanka,Australia,19.6,2,0,Colombo,127,19,6,120,0,1,6.350000,33.0


In [242]:
final_df = df.groupby('match_id').sum()['runs'].reset_index().merge(df,on='match_id')

In [243]:
final_df

Unnamed: 0,match_id,runs_x,batting_team,bowling_team,ball,runs_y,player_dismissed,city,current_score,over,ball_no,balls_bowled,balls_left,wickets_left,crr,last_five
0,2,168,Australia,Sri Lanka,0.1,0,0,Melbourne,0,0,1,1,119,10,0.000000,
1,2,168,Australia,Sri Lanka,0.2,0,0,Melbourne,0,0,2,2,118,10,0.000000,
2,2,168,Australia,Sri Lanka,0.3,1,0,Melbourne,1,0,3,3,117,10,2.000000,
3,2,168,Australia,Sri Lanka,0.4,2,0,Melbourne,3,0,4,4,116,10,4.500000,
4,2,168,Australia,Sri Lanka,0.5,0,0,Melbourne,3,0,5,5,115,10,3.600000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50496,964,128,Sri Lanka,Australia,19.3,1,0,Colombo,125,19,3,117,3,2,6.410256,32.0
50497,964,128,Sri Lanka,Australia,19.4,0,0,Colombo,125,19,4,118,2,2,6.355932,32.0
50498,964,128,Sri Lanka,Australia,19.5,0,1,Colombo,125,19,5,119,1,1,6.302521,32.0
50499,964,128,Sri Lanka,Australia,19.6,2,0,Colombo,127,19,6,120,0,1,6.350000,33.0


In [244]:
final_df=final_df[['batting_team','bowling_team','city','current_score','balls_left','wickets_left','crr','last_five','runs_x']]

In [245]:
final_df.dropna(inplace=True)

In [246]:
final_df.isnull().sum()

batting_team     0
bowling_team     0
city             0
current_score    0
balls_left       0
wickets_left     0
crr              0
last_five        0
runs_x           0
dtype: int64

In [247]:
final_df = final_df.sample(final_df.shape[0])

In [248]:
final_df

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five,runs_x
26139,Pakistan,Australia,St Lucia,87,57,9,8.285714,49.0,191
9320,Australia,Sri Lanka,Adelaide,233,0,8,11.553719,54.0,233
28218,India,South Africa,Durban,107,46,8,8.675676,40.0,168
17452,Bangladesh,Pakistan,Cape Town,57,77,9,7.953488,42.0,140
21466,Sri Lanka,New Zealand,Nottingham,104,38,7,7.609756,35.0,158
...,...,...,...,...,...,...,...,...,...
50418,Sri Lanka,Australia,Colombo,51,77,7,7.116279,37.0,128
30655,Sri Lanka,Pakistan,Abu Dhabi,61,85,8,10.457143,47.0,141
47457,Pakistan,India,Kolkata,106,21,5,6.424242,46.0,118
18455,New Zealand,South Africa,Johannesburg,76,47,6,6.246575,26.0,129


In [249]:
# pickle.dump(final_df,open("final_df.pkl","wb"))

In [253]:
X = final_df.drop("runs_x",axis=1)
y = final_df["runs_x"]

X.shape,y.shape

((38477, 8), (38477,))

In [254]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((30781, 8), (7696, 8), (30781,), (7696,))

In [257]:
trf = ColumnTransformer([
    ('trf',OneHotEncoder(sparse_output=False,drop='first'),['batting_team','bowling_team','city'])
]
,remainder='passthrough')

In [258]:
pipe = Pipeline(steps=[
    ('step1',trf),
    ('step2',StandardScaler()),
    ('step3',XGBRegressor(n_estimators=1000,learning_rate=0.2,max_depth=12,random_state=1))
])

In [259]:
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print(r2_score(y_test,y_pred))
print(mean_absolute_error(y_test,y_pred))

0.9884939493793695
1.627498237605898


In [260]:
# pickle.dump(pipe,open('pipe.pkl','wb'))