In [24]:
from pathlib import Path
import pandas as pd

# Build a tidy match-metadata table from the raw metadata text file and
# persist it as CSV under the project's processed-data directory.

# Project root is taken to be the parent of the current working directory
# (NOTE(review): assumes the notebook is launched from a sub-folder of the project).
root_dir = Path.cwd().parent
metadata_path = Path(root_dir, "data/raw_data/metadata.txt")
if not metadata_path.exists():
    raise FileNotFoundError(f"Missing metadata file: {metadata_path}")

# Declared up front so that an empty metadata file still produces a frame
# (and CSV header) with the expected columns instead of a KeyError below.
METADATA_COLUMNS = [
    "match_date",
    "competition_level",
    "competition_name",
    "gender",
    "match_id",
    "team1",
    "team2",
]

records = []
with metadata_path.open("r", encoding="utf-8") as fp:
    for raw_line in fp:
        line = raw_line.strip()
        if not line:
            continue
        # Every non-blank line must hold six " - "-separated fields; maxsplit=5
        # keeps any further " - " inside the final (matchup) field.
        parts = [segment.strip() for segment in line.split(" - ", 5)]
        if len(parts) != 6:
            raise ValueError(f"Unexpected metadata format: {line}")
        match_date, competition_level, competition_name, gender, match_id, matchup = parts
        # Normalise the " v " separator to " vs " before splitting the two sides.
        team_tokens = matchup.replace(" v ", " vs ").split(" vs ", 1)
        team1 = team_tokens[0].strip()
        # A matchup without a separator yields a single token; team2 is then unknown.
        team2 = team_tokens[1].strip() if len(team_tokens) > 1 else None
        records.append(
            {
                "match_date": match_date,
                "competition_level": competition_level,
                "competition_name": competition_name,
                "gender": gender,
                "match_id": int(match_id),  # raises ValueError on a non-numeric id
                "team1": team1,
                "team2": team2,
            }
        )

metadata_df = pd.DataFrame(records, columns=METADATA_COLUMNS)
metadata_df["match_date"] = pd.to_datetime(metadata_df["match_date"], format="%Y-%m-%d")

# Anchor the output at root_dir, like the input above, so the CSV does not
# depend on which directory the kernel happens to be running in.
processed_dir = Path(root_dir, "data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)
metadata_out_path = processed_dir / "metadata.csv"
metadata_df.to_csv(metadata_out_path, index=False)
metadata_df.head()


Unnamed: 0,match_date,competition_level,competition_name,gender,match_id,team1,team2
0,2025-06-03,club,IPL,male,1473511,Royal Challengers Bengaluru,Punjab Kings
1,2025-06-01,club,IPL,male,1473510,Mumbai Indians,Punjab Kings
2,2025-05-30,club,IPL,male,1473509,Mumbai Indians,Gujarat Titans
3,2025-05-29,club,IPL,male,1473508,Punjab Kings,Royal Challengers Bengaluru
4,2025-05-27,club,IPL,male,1473507,Lucknow Super Giants,Royal Challengers Bengaluru


In [None]:
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import json

# Flatten every match JSON in the raw-data directory into one row per
# delivery and write one CSV per match under data/processed/balls.

raw_dir = Path(root_dir, "data/raw_data")
if not raw_dir.exists():
    raise FileNotFoundError(f"Missing raw data directory: {raw_dir}")

match_files = sorted(raw_dir.glob("*.json"))
if not match_files:
    raise FileNotFoundError(f"No match JSON files found in {raw_dir}")
print(f"Found {len(match_files)} match JSON files.")

# Explicit column order: matches with no deliveries still get a CSV with a
# header row rather than a column-less file.
BALL_COLUMNS = [
    "match_id",
    "season",
    "date",
    "city",
    "venue",
    "match_type",
    "team1",
    "team2",
    "toss_winner",
    "toss_decision",
    "winner",
    "by_runs",
    "by_wickets",
    "innings",
    "batting_team",
    "bowling_team",
    "over",
    "ball_in_over",
    "batter",
    "non_striker",
    "bowler",
    "runs_batter",
    "runs_extras",
    "runs_total",
    "wides",
    "legbyes",
    "byes",
    "noballs",
    "penalty",
    "wicket",
    "wicket_kind",
    "player_out",
    "fielders",
]

# The output directory does not change between matches, so create it once.
processed_dir = Path(root_dir, "data/processed/balls")
processed_dir.mkdir(parents=True, exist_ok=True)

for match_path in tqdm(match_files, desc="Processing matches"):
    with match_path.open("r", encoding="utf-8") as f:
        match = json.load(f)

    # Match-level fields, repeated on every delivery row.
    info = match.get("info", {})
    season = info.get("season")
    city = info.get("city")
    venue = info.get("venue")
    match_type = info.get("match_type")
    match_dates = info.get("dates", [])
    match_date = match_dates[0] if match_dates else None  # first listed date only
    teams = info.get("teams", [])
    team1, team2 = (teams + [None, None])[:2]  # pad so unpacking never fails

    # Guard before the first .get so a non-dict outcome cannot raise.
    outcome = info.get("outcome", {})
    if not isinstance(outcome, dict):
        outcome = {}
    winner = outcome.get("winner")
    margin = outcome.get("by") or {}
    by_runs = margin.get("runs")
    by_wickets = margin.get("wickets")

    toss = info.get("toss") or {}
    toss_winner = toss.get("winner")
    toss_decision = toss.get("decision")

    rows = []
    innings_list = match.get("innings", [])
    for innings_idx, inns in enumerate(innings_list, start=1):
        batting_team = inns.get("team")
        # The bowling side is whichever listed team is not batting; it is
        # fixed for the whole innings, so derive it once here.
        if batting_team and team1 and team2:
            bowling_team = team2 if batting_team == team1 else team1
        else:
            bowling_team = None

        for over_obj in inns.get("overs", []):
            over_num = over_obj.get("over")
            # ball_in_over is the 1-based position in the deliveries list.
            for ball_idx, delivery in enumerate(over_obj.get("deliveries", []), start=1):
                runs = delivery.get("runs") or {}
                extras_detail = delivery.get("extras") or {}
                wickets = delivery.get("wickets") or []
                # Only the first wicket on a delivery is recorded.
                first_wicket = wickets[0] if wickets else {}
                fielders_list = first_wicket.get("fielders") or []
                if fielders_list:
                    # Entries may be dicts carrying a "name" or plain values;
                    # fall back to the string form when no name is available.
                    fielders = ", ".join(
                        fielder.get("name", str(fielder)) if isinstance(fielder, dict) else str(fielder)
                        for fielder in fielders_list
                    )
                else:
                    fielders = None

                runs_batter = runs.get("batter", 0)
                runs_extras = runs.get("extras", 0)

                rows.append(
                    {
                        "match_id": match_path.stem,
                        "season": season,
                        "date": match_date,
                        "city": city,
                        "venue": venue,
                        "match_type": match_type,
                        "team1": team1,
                        "team2": team2,
                        "toss_winner": toss_winner,
                        "toss_decision": toss_decision,
                        "winner": winner,
                        "by_runs": by_runs,
                        "by_wickets": by_wickets,
                        "innings": innings_idx,
                        "batting_team": batting_team,
                        "bowling_team": bowling_team,
                        "over": over_num,
                        "ball_in_over": ball_idx,
                        "batter": delivery.get("batter"),
                        "non_striker": delivery.get("non_striker"),
                        "bowler": delivery.get("bowler"),
                        "runs_batter": runs_batter,
                        "runs_extras": runs_extras,
                        # Fall back to batter + extras when no total is given.
                        "runs_total": runs.get("total", runs_batter + runs_extras),
                        "wides": extras_detail.get("wides", 0),
                        "legbyes": extras_detail.get("legbyes", 0),
                        "byes": extras_detail.get("byes", 0),
                        "noballs": extras_detail.get("noballs", 0),
                        "penalty": extras_detail.get("penalty", 0),
                        "wicket": 1 if wickets else 0,
                        "wicket_kind": first_wicket.get("kind") if wickets else None,
                        "player_out": first_wicket.get("player_out") if wickets else None,
                        "fielders": fielders,
                    }
                )

    balls_df = pd.DataFrame(rows, columns=BALL_COLUMNS)
    if not balls_df.empty:
        balls_df["date"] = pd.to_datetime(balls_df["date"], errors="coerce")
        # Nullable integers keep these columns integral even if a value is missing.
        balls_df["innings"] = balls_df["innings"].astype("Int64")
        balls_df["over"] = balls_df["over"].astype("Int64")
        balls_df["ball_in_over"] = balls_df["ball_in_over"].astype("Int64")

    out_path = processed_dir / f"{match_path.stem}.csv"
    balls_df.to_csv(out_path, index=False)