In [24]:
from pathlib import Path
import pandas as pd

# Parse the raw match metadata listing into a DataFrame and save it as CSV.
#
# Each non-blank line of the input is expected to hold six " - "-separated
# fields: date, competition level, competition name, gender, match id, matchup.

# Output column order. Passing this explicitly to the DataFrame constructor
# keeps the columns present even when the metadata file yields no records,
# so the date conversion below does not fail with a KeyError.
_METADATA_COLUMNS = [
    "match_date",
    "competition_level",
    "competition_name",
    "gender",
    "match_id",
    "team1",
    "team2",
]


def _parse_metadata_line(line):
    """Turn one stripped, non-empty metadata line into a record dict.

    Args:
        line: A line of the form
            "<date> - <level> - <name> - <gender> - <match id> - <team1> vs <team2>".
            The matchup separator may be " v " or " vs ".

    Returns:
        A dict keyed by the names in ``_METADATA_COLUMNS``. ``team2`` is
        ``None`` when the matchup contains no " v "/" vs " separator.

    Raises:
        ValueError: If the line does not split into six fields, or the match
            id field is not an integer.
    """
    # maxsplit=5 so any " - " inside the trailing matchup field is preserved.
    parts = [segment.strip() for segment in line.split(" - ", 5)]
    if len(parts) != 6:
        raise ValueError(f"Unexpected metadata format: {line}")
    match_date, competition_level, competition_name, gender, match_id, matchup = parts

    try:
        match_id_value = int(match_id)
    except ValueError as err:
        # Re-raise with the offending line so the bad row can be located.
        raise ValueError(f"Unexpected metadata format: {line}") from err

    # Normalise " v " to " vs " first, then split on the first separator only.
    team1, separator, team2 = matchup.replace(" v ", " vs ").partition(" vs ")
    return {
        "match_date": match_date,
        "competition_level": competition_level,
        "competition_name": competition_name,
        "gender": gender,
        "match_id": match_id_value,
        "team1": team1.strip(),
        "team2": team2.strip() if separator else None,
    }


# The raw data is looked up relative to the parent of the working directory.
root_dir = Path.cwd().parent
metadata_path = Path(root_dir, "data/raw_data/metadata.txt")
if not metadata_path.exists():
    raise FileNotFoundError(f"Missing metadata file: {metadata_path}")

with metadata_path.open("r", encoding="utf-8") as fp:
    stripped_lines = (raw_line.strip() for raw_line in fp)
    # Blank lines are skipped rather than treated as malformed.
    records = [_parse_metadata_line(line) for line in stripped_lines if line]

metadata_df = pd.DataFrame(records, columns=_METADATA_COLUMNS)
metadata_df["match_date"] = pd.to_datetime(metadata_df["match_date"], format="%Y-%m-%d")

# NOTE(review): the output directory is relative to the current working
# directory, while the input above is relative to its parent (root_dir).
# Kept as-is so existing consumers of this path keep working; confirm whether
# root_dir / "data/processed" was intended.
processed_dir = Path("data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)
metadata_out_path = processed_dir / "metadata.csv"
metadata_df.to_csv(metadata_out_path, index=False)
metadata_df.head()


Unnamed: 0,match_date,competition_level,competition_name,gender,match_id,team1,team2
0,2025-06-03,club,IPL,male,1473511,Royal Challengers Bengaluru,Punjab Kings
1,2025-06-01,club,IPL,male,1473510,Mumbai Indians,Punjab Kings
2,2025-05-30,club,IPL,male,1473509,Mumbai Indians,Gujarat Titans
3,2025-05-29,club,IPL,male,1473508,Punjab Kings,Royal Challengers Bengaluru
4,2025-05-27,club,IPL,male,1473507,Lucknow Super Giants,Royal Challengers Bengaluru
