In [2]:
import pandas as pd
import numpy as np
import re


In [4]:
# Directory holding the two raw CSV files. Kept in a single constant so the
# location only has to be changed in one place when the notebook is run
# outside this workspace.
DATASET_DIR = "/workspaces/SuperApp/datasets"

# One row per delivery.
ball_df = pd.read_csv(f"{DATASET_DIR}/each_ball_records.csv")
# One row per match.
match_df = pd.read_csv(f"{DATASET_DIR}/each_match_records.csv")

print("Ball-level data shape:", ball_df.shape)
print("Match-level data shape:", match_df.shape)

Ball-level data shape: (17863, 9)
Match-level data shape: (74, 18)


In [5]:
# Show the first five rows of each raw frame, ball-level first.
display(ball_df.head(), match_df.head())

Unnamed: 0,match_no,ballnumber,inningno,over,outcome,batter,bowler,comment,score
0,1,1,1,0.1,0,Devon Conway,Mohammed Shami,"Mohammed Shami to Devon Conway, no run,",0
1,1,2,1,0.2,1lb,Devon Conway,Mohammed Shami,"Mohammed Shami to Devon Conway, 1 leg bye,",1
2,1,3,1,0.3,0,Ruturaj Gaikwad,Mohammed Shami,"Mohammed Shami to Ruturaj Gaikwad, no run,",0
3,1,4,1,0.4,1,Ruturaj Gaikwad,Mohammed Shami,"Mohammed Shami to Ruturaj Gaikwad, 1 run,",1
4,1,5,1,0.5,0,Devon Conway,Mohammed Shami,"Mohammed Shami to Devon Conway, no run,",0


Unnamed: 0,season,date,match_number,match_type,venue,location,team1,team2,toss_won,toss_decision,umpire1,umpire2,reserve_umpire,match_referee,winner,winner_runs,winner_wickets,man_of_match
0,2023,31-03-2023,1,Group,Narendra Modi Stadium,Ahmedabad,Chennai Super Kings,Gujarat Titans,Gujarat Titans,field,Nitin Menon,HAS Khalid,A Bengeri,J Srinath,Gujarat Titans,,5.0,Rashid Khan
1,2023,01-04-2023,2,Group,Punjab Cricket Association IS Bindra Stadium,Chandigarh,Punjab Kings,Kolkata Knight Riders,Kolkata Knight Riders,field,BNJ Oxenford,YC Barde,PM Joshi,M Nayyar,Punjab Kings,7.0,,Arshdeep Singh
2,2023,01-04-2023,3,Group,Bharat Ratna Shri Atal Bihari Vajpayee Ekana C...,Lucknow,Lucknow Super Giants,Delhi Capitals,Delhi Capitals,field,AK Chaudhary,NA Patwardhan,M Kuppuraj,DS Manohar,Lucknow Super Giants,50.0,,MA Wood
3,2023,02-04-2023,4,Group,Rajiv Gandhi International Stadium,Hyderabad,Rajasthan Royals,Sunrisers Hyderabad,Sunrisers Hyderabad,field,KN Ananthapadmanabhan,R Pandit,Abhijit Bhattacharya,V Narayan Kutty,Rajasthan Royals,72.0,,JC Buttler
4,2023,02-04-2023,5,Group,M Chinnaswamy Stadium,Bengaluru,Mumbai Indians,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Nitin Menon,Tapan Sharma,A Bengeri,J Srinath,Royal Challengers Bangalore,,8.0,F du Plessis


In [8]:
# A missing victory margin is replaced by 0 in both margin columns, then
# matches without a recorded winner are discarded.
match_df = (
    match_df
    .fillna({"winner_runs": 0, "winner_wickets": 0})
    .dropna(subset=["winner"])
)

In [9]:
def parse_outcome(outcome):
    """
    Convert a ball-outcome code into ``(runs, wicket)``.

    Parameters
    ----------
    outcome : str, number, or missing value
        Raw outcome code for one delivery, e.g. ``"4"``, ``"1lb"`` or ``"W"``.
        Non-string values (for example integers from a numeric column) are
        converted with ``str()`` before being inspected.

    Returns
    -------
    tuple of (int, int)
        ``runs`` is the first run of digits found in the code (0 when there
        are none); ``wicket`` is 1 for a wicket code and 0 otherwise.
        A wicket code always reports 0 runs. Missing values give ``(0, 0)``.
    """
    if pd.isna(outcome):
        return 0, 0

    # Normalise once so that ints/floats, stray whitespace and lower-case
    # codes are all handled the same way.
    code = str(outcome).strip().upper()

    # A leading "W" marks a wicket. Codes starting with "WD" are excluded:
    # they are assumed to denote a wide (an extra, not a dismissal) --
    # NOTE(review): confirm against the outcome codes present in the data.
    if code.startswith("W") and not code.startswith("WD"):
        return 0, 1

    # Extract the leading number of runs (like 1, 2, 4, 6); extras such as
    # "1lb" still carry their digit.
    digits = re.search(r"\d+", code)
    runs = int(digits.group()) if digits else 0

    return runs, 0

# Parse every outcome code once and expand the resulting (runs, wicket)
# tuples into two columns. Building one DataFrame from the list of tuples
# avoids constructing a separate pd.Series object for every delivery.
parsed_outcomes = pd.DataFrame(
    ball_df["outcome"].map(parse_outcome).tolist(),
    index=ball_df.index,  # keep row alignment with ball_df
    columns=["runs", "wicket"],
)
ball_df[["runs", "wicket"]] = parsed_outcomes

In [10]:
# Split the textual form of "over" (e.g. "0.1") at the decimal point: the
# piece before the point is the over number, the piece after it is the ball
# within that over.
over_parts = ball_df["over"].astype(str).str.split(".")
ball_df["over_number"] = over_parts.str[0].astype(int)
ball_df["ball_in_over"] = over_parts.str[1].astype(int)

# Running totals within each (match, innings) pair, accumulated in the
# current row order of ball_df.
innings_groups = ball_df.groupby(["match_no", "inningno"])
ball_df["cumulative_runs"] = innings_groups["runs"].cumsum()
ball_df["cumulative_wickets"] = innings_groups["wicket"].cumsum()

# 1 when the side that won the toss also won the match, otherwise 0.
toss_matches_winner = match_df["toss_won"].eq(match_df["winner"])
match_df["toss_winner_won"] = toss_matches_winner.astype(int)


In [11]:
# Attach the match-level columns to every delivery. A left join keeps all
# ball rows; deliveries whose match is absent from match_df get NaN in the
# match columns.
# validate="many_to_one" makes pandas raise a MergeError if match_number is
# duplicated in match_df, instead of silently multiplying ball rows.
merged_df = ball_df.merge(
    match_df,
    how="left",
    left_on="match_no",
    right_on="match_number",
    validate="many_to_one",
)

print("Merged dataset shape:", merged_df.shape)
display(merged_df.head())


Merged dataset shape: (17863, 34)


Unnamed: 0,match_no,ballnumber,inningno,over,outcome,batter,bowler,comment,score,runs,...,toss_decision,umpire1,umpire2,reserve_umpire,match_referee,winner,winner_runs,winner_wickets,man_of_match,toss_winner_won
0,1,1,1,0.1,0,Devon Conway,Mohammed Shami,"Mohammed Shami to Devon Conway, no run,",0,0,...,field,Nitin Menon,HAS Khalid,A Bengeri,J Srinath,Gujarat Titans,0.0,5.0,Rashid Khan,1
1,1,2,1,0.2,1lb,Devon Conway,Mohammed Shami,"Mohammed Shami to Devon Conway, 1 leg bye,",1,1,...,field,Nitin Menon,HAS Khalid,A Bengeri,J Srinath,Gujarat Titans,0.0,5.0,Rashid Khan,1
2,1,3,1,0.3,0,Ruturaj Gaikwad,Mohammed Shami,"Mohammed Shami to Ruturaj Gaikwad, no run,",0,0,...,field,Nitin Menon,HAS Khalid,A Bengeri,J Srinath,Gujarat Titans,0.0,5.0,Rashid Khan,1
3,1,4,1,0.4,1,Ruturaj Gaikwad,Mohammed Shami,"Mohammed Shami to Ruturaj Gaikwad, 1 run,",1,1,...,field,Nitin Menon,HAS Khalid,A Bengeri,J Srinath,Gujarat Titans,0.0,5.0,Rashid Khan,1
4,1,5,1,0.5,0,Devon Conway,Mohammed Shami,"Mohammed Shami to Devon Conway, no run,",0,0,...,field,Nitin Menon,HAS Khalid,A Bengeri,J Srinath,Gujarat Titans,0.0,5.0,Rashid Khan,1


In [13]:
import os

# Single source of truth for the output location: the directory to create,
# the file that is written and the confirmation message are all derived from
# this one path, so they cannot drift apart.
CLEANED_CSV_PATH = "data/cleaned_match_data.csv"

# Create the target directory if needed; no error when it already exists.
os.makedirs(os.path.dirname(CLEANED_CSV_PATH), exist_ok=True)
# index=False: the row index is not written to the file.
merged_df.to_csv(CLEANED_CSV_PATH, index=False)
print(f"✅ Cleaned dataset saved: {CLEANED_CSV_PATH}")

✅ Cleaned dataset saved: data/cleaned_match_data.csv
