In [13]:
import os, re, json
import pandas as pd
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)
pd.set_option("display.max_colwidth",None)
from pathlib import Path
from glob import glob
from pathlib import Path
from tqdm import tqdm

In [7]:
# ----------------------------
# CONFIG
# ----------------------------
# Event type names that never become a token in a possession sequence:
# build_possession_sequences() drops any event whose (normalized) type name
# is a member of this set.
SKIP_TYPES = {"Half End", "Half Start", "Referee Ball Drop", "Starting XI"}

# ----------------------------
# Helpers
# ----------------------------
def parse_match_id_from_filename(path: str) -> str:
    """
    Extract the numeric match id from a file name shaped like ``..._<digits>.json``.

    Only the base name of ``path`` is inspected. Returns the digit string, or
    the literal ``"unknown"`` when the name does not end in ``_<digits>.json``.
    """
    filename = os.path.basename(str(path))
    match = re.search(r"_([0-9]+)\.json$", filename)
    if match is None:
        return "unknown"
    return match.group(1)

def norm_type_name(tname: str) -> str:
    """Map the starred label ``"Ball Receipt*"`` to ``"Ball Receipt"``; return any other name unchanged."""
    if tname == "Ball Receipt*":
        return "Ball Receipt"
    return tname

# ----------------------------
# Core logic
# ----------------------------
def build_possession_sequences(events):
    """
    Group a match's events into per-team possession runs.

    An event is kept only if it has a possession number, ids and names for
    both ``team`` and ``possession_team``, is attributed to the team in
    possession (``team.id == possession_team.id``), has a non-empty type name,
    and that name (after ``norm_type_name``) is not in ``SKIP_TYPES``.
    A new run starts whenever the possession number or the possession team
    changes between two kept events.

    Returns
    -------
    (sequences, id_to_name)
        ``sequences`` maps team id -> list of runs, each a dict with
        ``"words"`` (event type names), ``"players"`` (player names, with
        ``"Unknown"`` when missing) and ``"possession"`` (int).
        ``id_to_name`` maps team id -> team name.
    """
    sequences = {}
    id_to_name = {}

    active_possession = None
    active_team_id = None
    words_acc, players_acc = [], []

    def close_run():
        # Store the open run (if any) under its team and reset the accumulators.
        nonlocal words_acc, players_acc
        if words_acc and active_team_id is not None and active_possession is not None:
            sequences.setdefault(active_team_id, []).append({
                "words": list(words_acc),
                "players": list(players_acc),
                "possession": int(active_possession),
            })
            words_acc, players_acc = [], []

    for ev in events:
        poss_num = ev.get("possession")
        if poss_num is None:
            continue

        owner = ev.get("possession_team") or {}
        actor = ev.get("team") or {}

        owner_id = owner.get("id")
        owner_name = owner.get("name")
        actor_id = actor.get("id")
        actor_name = actor.get("name")

        if any(v is None for v in (owner_id, owner_name, actor_id, actor_name)):
            continue

        # Events by the team that is NOT in possession are ignored entirely.
        if actor_id != owner_id:
            continue

        # Recorded before the type checks, so a team is named even if all of
        # its events end up skipped.
        id_to_name[owner_id] = owner_name

        type_info = ev.get("type")
        tname = type_info.get("name") if isinstance(type_info, dict) else None
        if not tname:
            continue
        tname = norm_type_name(tname)

        if tname in SKIP_TYPES:
            continue

        player_name = (ev.get("player") or {}).get("name") or "Unknown"

        # The very first kept event and every possession/team switch both open
        # a new run; closing an empty run is a no-op.
        if (
            active_possession is None
            or poss_num != active_possession
            or owner_id != active_team_id
        ):
            close_run()
            active_possession = poss_num
            active_team_id = owner_id

        words_acc.append(tname)
        players_acc.append(player_name)

    close_run()
    return sequences, id_to_name

def runs_to_df(runs, match_id, team_name, sep="-"):
    """
    Flatten one team's possession runs into a DataFrame, one row per run.

    Parameters
    ----------
    runs : iterable of dict
        Each run carries ``"words"`` (list of event type names), ``"players"``
        (list of player names) and ``"possession"`` (possession number).
    match_id, team_name
        Stored unchanged on every row.
    sep : str, default "-"
        Separator used to join both token lists. The default keeps the
        historical output format, but it is ambiguous when a token contains a
        hyphen itself (e.g. a hyphenated player name): the joined string can
        then no longer be split back into the original tokens. Pass a
        separator that cannot occur inside a token (for example ``"|"``) when
        the columns must be re-tokenized later.

    Returns
    -------
    pandas.DataFrame
        Columns ``words, possession, word_players, match_id, team_name``.
        Runs without words are dropped; an empty input still yields a frame
        with these five columns.
    """
    rows = []
    for run in runs:
        words = run["words"]
        players = run["players"]
        poss_num = run["possession"]

        if not words:
            continue

        missing = len(words) - len(players)
        if missing > 0:
            # Pad on a new list so the caller's run dict is left untouched.
            players = players + ["Unknown"] * missing

        rows.append({
            "words": sep.join(words),
            "possession": poss_num,        # kept as its own column
            "word_players": sep.join(players),
            "match_id": match_id,
            "team_name": team_name,
        })

    return pd.DataFrame(
        rows,
        columns=["words", "possession", "word_players", "match_id", "team_name"]
    )

def process_match_file(path):
    """
    Load one match JSON file and return up to two per-team possession frames.

    The file may contain either a bare list of events or an object with an
    ``"events"`` entry. The match id is parsed from the file name.

    Returns
    -------
    (DataFrame | None, DataFrame | None)
        Frames are ordered as the teams appear in the sequences dict, which is
        not necessarily the order of the team names in the file name.
        ``(None, None)`` is returned when the file has no events or when the
        number of teams found is neither 1 nor 2.
    """
    with open(path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    if isinstance(payload, list):
        events = payload
    else:
        events = payload.get("events", [])

    if not events:
        print(f"Nessun evento trovato in {path}")
        return None, None

    sequences, id_to_name = build_possession_sequences(events)
    match_id = parse_match_id_from_filename(path)

    frames = [
        runs_to_df(team_runs, match_id, id_to_name.get(team_id, f"Team{team_id}"))
        for team_id, team_runs in sequences.items()
    ]

    if len(frames) == 1:
        return frames[0], None
    if len(frames) == 2:
        return frames[0], frames[1]
    return None, None


In [8]:
# Try the pipeline on a single match. The relative file name is opened as-is,
# so it is resolved against the notebook's working directory.
# NOTE(review): "team1"/"team2" follow the order in which the teams' runs were
# collected, not the order of the names in the file name.
df_team1, df_team2 = process_match_file("AC Milan_AS Roma_3945135.json")

In [9]:
# Render the first returned frame as the cell output.
# NOTE(review): display.max_rows / max_colwidth are set to None in the imports
# cell, so the whole frame is printed untruncated.
df_team1

Unnamed: 0,words,possession,word_players,match_id,team_name
0,Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Pass-Ball Receipt,2,Paulo Bruno Exequiel Dybala-Leandro Daniel Paredes-Leandro Daniel Paredes-Leandro Daniel Paredes-Gianluca Mancini-Gianluca Mancini-Gianluca Mancini-Alexis Saelemaekers-Alexis Saelemaekers-Alexis Saelemaekers-Leandro Daniel Paredes-Leandro Daniel Paredes-Paulo Bruno Exequiel Dybala,3945135,AS Roma
1,Pass-Ball Receipt-Carry-Pass-Ball Receipt,3,Alexis Saelemaekers-Leandro Daniel Paredes-Leandro Daniel Paredes-Leandro Daniel Paredes-Alexis Saelemaekers,3945135,AS Roma
2,Pass-Ball Receipt-Carry-Foul Won-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Pass-Pressure-Ball Receipt,6,Leandro Daniel Paredes-Alexis Saelemaekers-Alexis Saelemaekers-Alexis Saelemaekers-Alexis Saelemaekers-Leandro Daniel Paredes-Leandro Daniel Paredes-Leandro Daniel Paredes-Paulo Bruno Exequiel Dybala-Paulo Bruno Exequiel Dybala-Paulo Bruno Exequiel Dybala-Gianluca Mancini-Gianluca Mancini-Gianluca Mancini-Mile Svilar-Mile Svilar-Obite Evan Ndicka-Obite Evan Ndicka-Obite Evan Ndicka-José Ángel Esmoris Tasende-José Ángel Esmoris Tasende-José Ángel Esmoris Tasende-Niccolò Pisilli-Niccolò Pisilli,3945135,AS Roma
3,Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Shot,7,José Ángel Esmoris Tasende-Obite Evan Ndicka-Obite Evan Ndicka-Obite Evan Ndicka-Mile Svilar-Mile Svilar-Mile Svilar-Leandro Daniel Paredes-Leandro Daniel Paredes-Leandro Daniel Paredes-Obite Evan Ndicka-Obite Evan Ndicka-Obite Evan Ndicka-Niccolò Pisilli-Niccolò Pisilli-Niccolò Pisilli-Obite Evan Ndicka-Obite Evan Ndicka-Obite Evan Ndicka-Kouadio Emmanuel Koné-Kouadio Emmanuel Koné-Kouadio Emmanuel Koné-Obite Evan Ndicka-Obite Evan Ndicka-Obite Evan Ndicka-Leandro Daniel Paredes-Leandro Daniel Paredes-Leandro Daniel Paredes-Obite Evan Ndicka-Obite Evan Ndicka-Obite Evan Ndicka-José Ángel Esmoris Tasende-José Ángel Esmoris Tasende-José Ángel Esmoris Tasende-Paulo Bruno Exequiel Dybala-Paulo Bruno Exequiel Dybala,3945135,AS Roma
4,Pass-Ball Receipt-Pass-Ball Receipt-Pass-Ball Receipt,9,Obite Evan Ndicka-Leandro Daniel Paredes-Leandro Daniel Paredes-Niccolò Pisilli-Niccolò Pisilli-José Ángel Esmoris Tasende,3945135,AS Roma
5,Goal Keeper-Pass-Ball Receipt-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Pass-Ball Receipt,12,Mile Svilar-Mile Svilar-José Ángel Esmoris Tasende-José Ángel Esmoris Tasende-Kouadio Emmanuel Koné-Kouadio Emmanuel Koné-Kouadio Emmanuel Koné-Gianluca Mancini-Gianluca Mancini-Gianluca Mancini-Leandro Daniel Paredes-Leandro Daniel Paredes-Leandro Daniel Paredes-Mats Hummels-Mats Hummels-Mats Hummels-Niccolò Pisilli,3945135,AS Roma
6,Block-Ball Recovery-Carry-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Foul Won,14,Mats Hummels-Niccolò Pisilli-Niccolò Pisilli-Niccolò Pisilli-Kouadio Emmanuel Koné-Kouadio Emmanuel Koné-Kouadio Emmanuel Koné-Paulo Bruno Exequiel Dybala-Paulo Bruno Exequiel Dybala-Paulo Bruno Exequiel Dybala-Alexis Saelemaekers-Alexis Saelemaekers-Alexis Saelemaekers,3945135,AS Roma
7,Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Carry-Pass-Ball Receipt-Shot,15,Paulo Bruno Exequiel Dybala-Leandro Daniel Paredes-Leandro Daniel Paredes-Leandro Daniel Paredes-Alexis Saelemaekers-Alexis Saelemaekers-Alexis Saelemaekers-Paulo Bruno Exequiel Dybala-Paulo Bruno Exequiel Dybala-Paulo Bruno Exequiel Dybala-Mats Hummels-Mats Hummels,3945135,AS Roma
8,Ball Recovery-Carry-Dispossessed,17,Kouadio Emmanuel Koné-Kouadio Emmanuel Koné-Kouadio Emmanuel Koné,3945135,AS Roma
9,Interception-Carry-Pass-Ball Receipt,20,José Ángel Esmoris Tasende-José Ángel Esmoris Tasende-José Ángel Esmoris Tasende-Niccolò Pisilli,3945135,AS Roma


In [19]:
# Event type names excluded from possession sequences (checked in
# build_possession_sequences). Same value as the definition in the earlier cell.
SKIP_TYPES = {"Half End", "Half Start", "Referee Ball Drop", "Starting XI"}

# Input folder scanned for match JSON files, and output folder for the
# generated CSVs; both are built from the current user's home directory.
INPUT_DIR  = Path.home() / "Desktop" / "Podemi_and_Football-main" / "Bundesliga_24_25_matches"
OUTPUT_DIR = Path.home() / "Desktop" / "Podemi_and_Football-main" / "Linguistic" / "Bundesliga_24_25_possession_texts"
# Side effect at cell execution: creates the output folder (and parents) if missing.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): not referenced anywhere in the visible part of this notebook —
# presumably the name of the sequence column for later cells; confirm or remove.
WORDS_COL = "words"

# ----------------------------
# Helpers
# ----------------------------
def safe_key(s: str) -> str:
    """
    Turn a label into a file-name-friendly key.

    Surrounding whitespace is stripped, every run of non-word characters is
    replaced by a single underscore, and leading/trailing underscores are
    removed. A falsy input (``None`` or ``""``) yields ``""``.
    """
    text = (s or "").strip()
    collapsed = re.sub(r"\W+", "_", text)
    return collapsed.strip("_")

def parse_match_id_from_filename(path: str) -> str:
    """
    Extract the match id from a file name.

    Two patterns are tried in order on the base name of ``path``:
    1. a trailing ``_<digits>.json``;
    2. as a fallback, the first run of at least five digits anywhere in the name.
    Returns the captured digits, or ``"unknown"`` when neither pattern matches.
    """
    filename = os.path.basename(str(path))
    for pattern in (r"_([0-9]+)\.json$", r"([0-9]{5,})"):
        found = re.search(pattern, filename)
        if found:
            return found.group(1)
    return "unknown"

def norm_type_name(tname: str) -> str:
    """Normalize an event type name: ``"Ball Receipt*"`` becomes ``"Ball Receipt"``, anything else passes through."""
    is_starred_receipt = tname == "Ball Receipt*"
    if is_starred_receipt:
        tname = "Ball Receipt"
    return tname

# ----------------------------
# Core logic
# ----------------------------
def build_possession_sequences(events):
    """
    Split a match's events into possession runs, grouped by team.

    Only events attributed to the team in possession
    (``team.id == possession_team.id``) contribute; events lacking a
    possession number, team ids/names or a type name, and events whose
    normalized type is in ``SKIP_TYPES``, are ignored. Consecutive kept
    events belong to the same run as long as neither the possession number
    nor the possession team changes.

    Returns a tuple ``(sequences, id_to_name)``:
    ``sequences`` maps team id -> list of ``{"words", "players", "possession"}``
    dicts (``possession`` cast to int, missing player names stored as
    ``"Unknown"``); ``id_to_name`` maps team id -> team name.
    """
    sequences = {}
    id_to_name = {}

    current_possession = None
    current_team_id = None
    open_run = None  # run currently being filled; already stored in `sequences`

    for ev in events:
        poss_num = ev.get("possession")
        if poss_num is None:
            continue

        poss_team = ev.get("possession_team") or {}
        team = ev.get("team") or {}

        poss_team_id, poss_team_name = poss_team.get("id"), poss_team.get("name")
        team_id, team_name = team.get("id"), team.get("name")

        incomplete = (
            poss_team_id is None
            or poss_team_name is None
            or team_id is None
            or team_name is None
        )
        if incomplete:
            continue

        # Keep only what the possessing team itself does.
        if team_id != poss_team_id:
            continue

        id_to_name[poss_team_id] = poss_team_name

        type_obj = ev.get("type")
        if not isinstance(type_obj, dict):
            continue
        tname = type_obj.get("name")
        if not tname:
            continue
        tname = norm_type_name(tname)
        if tname in SKIP_TYPES:
            continue

        player_name = (ev.get("player") or {}).get("name") or "Unknown"

        # First kept event, or possession/team changed: open a fresh run and
        # register it right away under the team that owns it.
        if (
            open_run is None
            or poss_num != current_possession
            or poss_team_id != current_team_id
        ):
            open_run = {"words": [], "players": [], "possession": int(poss_num)}
            sequences.setdefault(poss_team_id, []).append(open_run)
            current_possession = poss_num
            current_team_id = poss_team_id

        open_run["words"].append(tname)
        open_run["players"].append(player_name)

    return sequences, id_to_name

def runs_to_df(runs, match_id, team_name, sep="-"):
    """
    Build a one-row-per-run DataFrame from a team's possession runs.

    Parameters
    ----------
    runs : iterable of dict
        Runs with the keys ``"words"`` (event type names), ``"players"``
        (player names) and ``"possession"`` (possession number).
    match_id, team_name
        Copied verbatim onto every row.
    sep : str, default "-"
        String placed between tokens in the ``words`` and ``word_players``
        columns. ``"-"`` reproduces the existing file format; because player
        names can themselves contain hyphens, choose a separator that never
        appears inside a token (e.g. ``"|"``) if the columns need to be split
        back into aligned token lists.

    Returns
    -------
    pandas.DataFrame
        Columns ``words, possession, word_players, match_id, team_name``;
        runs with an empty word list are skipped.
    """
    columns = ["words", "possession", "word_players", "match_id", "team_name"]
    rows = []
    for run in runs:
        words = run["words"]
        players = run["players"]
        poss_num = run["possession"]

        if not words:
            continue

        # If fewer players than words were recorded, pad with "Unknown" so the
        # two joined strings have the same number of tokens (on a new list, so
        # the input run is not modified).
        shortfall = len(words) - len(players)
        if shortfall > 0:
            players = players + ["Unknown"] * shortfall

        rows.append({
            "words": sep.join(words),
            "possession": poss_num,
            "word_players": sep.join(players),
            "match_id": match_id,
            "team_name": team_name,
        })

    return pd.DataFrame(rows, columns=columns)

def process_match_file(path: Path):
    """
    Read a match JSON file and build one possession DataFrame per team.

    Accepts either a top-level list of events or an object holding them under
    ``"events"``. Returns a 2-tuple: ``(df_a, df_b)`` when two teams were
    found, ``(df_a, None)`` when only one was, and ``(None, None)`` when the
    file has no events or any other number of teams. Frame order follows the
    order of the teams in the sequences dict.
    """
    with open(path, "r", encoding="utf-8") as fh:
        loaded = json.load(fh)

    events = loaded if isinstance(loaded, list) else loaded.get("events", [])
    if not events:
        return None, None

    sequences, id_to_name = build_possession_sequences(events)
    match_id = parse_match_id_from_filename(str(path))

    team_frames = []
    for team_id in sequences:
        label = id_to_name.get(team_id, f"Team{team_id}")
        team_frames.append(runs_to_df(sequences[team_id], match_id, label))

    if len(team_frames) not in (1, 2):
        return None, None

    first, *others = team_frames
    second = others[0] if others else None
    return first, second

# ----------------------------
# Save helper
# ----------------------------
def save_team_df(df: pd.DataFrame) -> bool:
    """
    Write ``df`` to ``OUTPUT_DIR`` as ``<TEAM>_<MATCHID>.csv``.

    The team name and match id are taken from the first row; the team name is
    passed through ``safe_key`` for the file name. Returns ``False`` without
    writing anything when ``df`` is ``None`` or empty, ``True`` after the CSV
    has been written (UTF-8, no index column).
    """
    if df is None or df.empty:
        return False

    first_row = df.iloc[0]
    team_part = safe_key(str(first_row["team_name"]))
    match_part = str(first_row["match_id"])

    destination = OUTPUT_DIR / f"{team_part}_{match_part}.csv"
    df.to_csv(destination, index=False, encoding="utf-8")
    return True

# ----------------------------
# RUN batch
# ----------------------------
# Collect every *.json directly inside INPUT_DIR, in sorted order so the run
# is deterministic.
json_files = sorted(INPUT_DIR.glob("*.json"))

# Echo the resolved locations and the number of inputs before starting.
print("CWD (notebook):", Path.cwd())
print("INPUT_DIR:", INPUT_DIR, "exists:", INPUT_DIR.exists())
print("OUTPUT_DIR:", OUTPUT_DIR, "exists:", OUTPUT_DIR.exists())
print("Found JSON files:", len(json_files))

# processed: files parsed without raising; saved: CSV files written;
# skipped: files that raised or produced no non-empty frame.
processed = saved = skipped = 0

for path in tqdm(json_files, desc="Processing matches", unit="file"):
    try:
        df1, df2 = process_match_file(path)
        processed += 1

        # save_team_df returns False for a None/empty frame, so a match can
        # contribute 0, 1 or 2 CSV files.
        ok1 = save_team_df(df1)
        ok2 = save_team_df(df2)
        
        if ok1: saved += 1
        if ok2: saved += 1
        if not ok1 and not ok2:
            skipped += 1
            # tqdm.write keeps the message from breaking the progress bar.
            tqdm.write(f"‚ö†Ô∏è  Nessun DF valido per: {path.name}")

    except Exception as e:
        # Best-effort batch: report the failing file and continue with the
        # next one (such a file is counted as skipped, not as processed).
        skipped += 1
        tqdm.write(f"‚ùå Errore su {path.name}: {e}")

# Final summary of the batch.
print("\n‚úÖ FATTO")
print(f"JSON processati: {processed}")
print(f"CSV salvati    : {saved}")
print(f"File saltati   : {skipped}")
print("Output in:", OUTPUT_DIR)

CWD (notebook): /Users/lucasantagata/Desktop/Podemi_and_Football-main/Linguistic
INPUT_DIR: /Users/lucasantagata/Desktop/Podemi_and_Football-main/Bundesliga_24_25_matches exists: True
OUTPUT_DIR: /Users/lucasantagata/Desktop/Podemi_and_Football-main/Linguistic/Bundesliga_24_25_possession_texts exists: True
Found JSON files: 306


Processing matches: 100%|██████████| 306/306 [00:06<00:00, 45.70file/s]


‚úÖ FATTO
JSON processati: 306
CSV salvati    : 612
File saltati   : 0
Output in: /Users/lucasantagata/Desktop/Podemi_and_Football-main/Linguistic/Bundesliga_24_25_possession_texts



