In [1]:
import requests
import json
import pandas as pd
import os
import re
import time
from tqdm import tqdm
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
import matplotlib.pyplot as plt
import numpy as np
from glob import glob

In [6]:
# --- Constants ---
# Event types that carry no on-ball information and are dropped before the
# per-team sequences are built.
# "Referee Ball-Drop" is the hyphenated spelling used by the StatsBomb event
# specification; the space-separated variant is kept so existing behaviour is
# unchanged for any data that uses it.
SKIP_TYPES = {
    "Half End",
    "Half Start",
    "Referee Ball Drop",
    "Referee Ball-Drop",
    "Starting XI",
}

# --- Helpers ---
def safe_key(s: str) -> str:
    """Return ``s`` stripped, with every run of non-word characters replaced by "_".

    A falsy ``s`` (``None`` or an empty string) yields an empty string.
    """
    trimmed = s.strip() if s else ""
    return re.sub(r"\W+", "_", trimmed)

def parse_match_id_from_filename(path: str) -> str:
    """Extract the numeric id from a file name ending in ``_<digits>.json``.

    Only the last path component is inspected. Returns the digits as a string,
    or "unknown" when the file name does not follow that pattern.
    """
    filename = os.path.basename(path)
    found = re.search(r"_([0-9]+)\.json$", filename)
    if found is None:
        return "unknown"
    return found.group(1)

# --- Core logic (updated to also track players) ---
def build_team_sequences(events):
    """
    Split the event stream into consecutive same-team runs.

    Events without a team id or team name, without a usable ``type.name``, or
    whose type is listed in SKIP_TYPES are ignored and do not interrupt the
    run in progress. "Ball Receipt*" is recorded as "Ball Receipt". An event
    with no player name is recorded with the player "Unknown".

    Returns:
      sequences: { team_id: [ {"words":[...], "players":[...]}, ... ] }
                 where "words" holds the event type names of one run and
                 "players" the matching player names, position by position.
      id_to_name: { team_id: team_name } for every event that carries both
                  a team id and a team name (skipped types included).
    """
    id_to_name = {}
    sequences = {}
    active_team = None
    active_run = None  # the run dict currently being extended

    for event in events:
        team_info = event.get("team") or {}
        team_id, team_name = team_info.get("id"), team_info.get("name")
        if team_id is None or team_name is None:
            continue
        id_to_name[team_id] = team_name

        type_info = event.get("type")
        label = type_info.get("name") if isinstance(type_info, dict) else None
        # Normalize "Ball Receipt*" before the skip check
        if label == "Ball Receipt*":
            label = "Ball Receipt"
        if label is None or label in SKIP_TYPES:
            continue

        player = (event.get("player") or {}).get("name") or "Unknown"

        # Open a new run on the first kept event and on every team switch.
        # Registering the run as soon as it starts keeps runs (and team keys)
        # in the order in which they occur in the stream.
        if active_run is None or team_id != active_team:
            active_team = team_id
            active_run = {"words": [], "players": []}
            sequences.setdefault(team_id, []).append(active_run)

        active_run["words"].append(label)
        active_run["players"].append(player)

    return sequences, id_to_name

def runs_to_df(runs, match_id, team_name, sep="-"):
    """
    Turn a list of runs (each run has 'words' and 'players') into a DataFrame.

    Parameters:
      runs: iterable of dicts with a 'words' list and a 'players' list.
      match_id: value copied into the 'match_id' column of every row.
      team_name: value copied into the 'team_name' column of every row.
      sep: separator used to join the tokens of both 'words' and
           'word_players' (default "-", the original behaviour). A token
           that itself contains the separator cannot be recovered by
           splitting the joined string, so pass a different separator when
           the columns must be split back into aligned tokens.

    Returns:
      DataFrame with columns words, word_players, match_id, team_name and one
      row per non-empty run. Runs without words are dropped. 'word_players'
      always has exactly as many tokens as 'words': missing players are
      filled with "Unknown" and surplus players are discarded.
    """
    rows = []
    for run in runs:
        words = list(run.get("words") or [])
        if not words:
            continue
        players = list(run.get("players") or [])
        # Enforce a strict one-to-one alignment with the words in BOTH
        # directions: trim surplus players, then pad what is still missing.
        players = players[:len(words)]
        players.extend(["Unknown"] * (len(words) - len(players)))
        rows.append({
            "words": sep.join(words),
            "word_players": sep.join(players),
            "match_id": match_id,
            "team_name": team_name,
        })
    # Explicit columns keep the schema stable even when there are no rows.
    return pd.DataFrame(rows, columns=["words", "word_players", "match_id", "team_name"])

def process_match_file(path):
    """
    Load one StatsBomb JSON file and build one DataFrame per team.

    The file may contain either a bare list of events or an object with an
    "events" key. Each DataFrame has: words, word_players, match_id, team_name.

    Returns a 2-tuple:
      - (df_a, df_b) when sequences were found for exactly two teams,
      - (df_a, None) when only one team produced sequences,
      - (None, None) when the file holds no events (a message is printed)
        or when the number of teams is neither one nor two.
    """
    with open(path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    if isinstance(payload, list):
        events = payload
    else:
        events = payload.get("events", [])

    if not events:
        print(f"Nessun evento trovato in {path}")
        return None, None

    sequences, id_to_name = build_team_sequences(events)
    match_id = parse_match_id_from_filename(path)

    # One frame per team, in the order the teams appear in `sequences`.
    frames = [
        runs_to_df(runs, match_id, id_to_name.get(team_id, f"Team{team_id}"))
        for team_id, runs in sequences.items()
    ]

    if len(frames) not in (1, 2):
        return None, None

    second = frames[1] if len(frames) == 2 else None
    return frames[0], second

In [7]:
json_path = "Serie_A_24_25_matches/AC Milan_AS Roma_3945135.json"

# The path is relative, so it only resolves when the kernel's working
# directory contains the matches folder. Check up front and report the
# absolute location that was tried, instead of surfacing a bare
# FileNotFoundError from inside process_match_file.
if not os.path.isfile(json_path):
    raise FileNotFoundError(
        f"Match file not found: {os.path.abspath(json_path)!r} "
        f"(current working directory: {os.getcwd()!r}). "
        "Start the kernel from the folder that contains the matches directory "
        "or update json_path."
    )

team1_df, team2_df = process_match_file(json_path)

FileNotFoundError: [Errno 2] No such file or directory: 'Serie_A_24_25_matches/AC Milan_AS Roma_3945135.json'