In [None]:
import json
import glob
import os

# glob returns paths in an arbitrary, filesystem-dependent order. Sorting makes
# the load order (and therefore the "first race" shown in the preview) stable
# from one run to the next.
athletes_files = sorted(glob.glob(os.path.join(data_dir, "*.athletes.json")))
issues_files = sorted(glob.glob(os.path.join(data_dir, "*.issues.json")))

# Parsed JSON content of each file, keyed by race id.
all_athletes = {}
all_issues = {}

def get_race_id(filepath):
    """Return the race id: the file name of ``filepath`` up to its first dot."""
    race_id, _, _ = os.path.basename(filepath).partition(".")
    return race_id

# Load every athletes file into all_athletes, keyed by race id.
for a_file in athletes_files:
    race_id = get_race_id(a_file)
    with open(a_file, "r", encoding="utf-8") as f:
        all_athletes[race_id] = json.load(f)

# Load every issues file into all_issues, keyed by race id.
for i_file in issues_files:
    race_id = get_race_id(i_file)
    with open(i_file, "r", encoding="utf-8") as f:
        all_issues[race_id] = json.load(f)

print("Carreras cargadas:", list(all_athletes.keys()))

# The previews index the first loaded race. Guard them so the cell reports a
# missing data set instead of failing with IndexError when no file matched.
if all_athletes:
    first_athletes_race = next(iter(all_athletes))
    print("Ejemplo atletas primera carrera:", all_athletes[first_athletes_race][:3])
else:
    print("No se cargó ningún archivo de atletas.")

if all_issues:
    first_issues_race = next(iter(all_issues))
    print("Ejemplo issues primera carrera:", all_issues[first_issues_race][:3])
else:
    print("No se cargó ningún archivo de issues.")

In [None]:
import os
import glob
import json
from collections import defaultdict


def merge_structures(base, new, max_unique=0):
    """Merge two structure summaries into one.

    A structure summary is one of:

    * a ``dict`` mapping keys to nested summaries,
    * a ``set`` of observed scalar values,
    * the bare string ``"*"``: more than ``max_unique`` distinct values seen,
    * the bare string ``"*|dict_or_list"``: the same position held a
      dict/list in one place and something else in another.

    Both marker strings are absorbing: once a position has collapsed to a
    marker it stays collapsed, with ``"*|dict_or_list"`` taking precedence
    over ``"*"``. Observed data values are expected to arrive wrapped in sets,
    so a bare ``"*"`` is always read as the marker, never as a data value.

    Args:
        base: Summary accumulated so far.
        new: Summary to fold into ``base``.
        max_unique: Largest number of distinct values kept in a set before it
            collapses to ``"*"``. With the default of 0 any non-empty merge of
            values collapses.

    Returns:
        The merged summary. ``base`` and ``new`` are not modified.
    """
    wildcard = "*"
    mixed = "*|dict_or_list"

    def is_marker(value, marker):
        # Only a bare string can be a marker; sets are compared by content later.
        return isinstance(value, str) and value == marker

    # Two dicts: merge key by key, recursing on keys present in both.
    if isinstance(base, dict) and isinstance(new, dict):
        merged = dict(base)
        for key, val in new.items():
            if key in merged:
                merged[key] = merge_structures(merged[key], val, max_unique)
            else:
                merged[key] = val
        return merged

    # A dict/list on only one side (or any list at all) cannot be combined
    # with a value set: flag the position as having mixed types.
    if isinstance(base, (dict, list)) or isinstance(new, (dict, list)):
        return mixed

    # Markers are absorbing, so a collapsed position is never re-expanded.
    if is_marker(base, mixed) or is_marker(new, mixed):
        return mixed
    if is_marker(base, wildcard) or is_marker(new, wildcard):
        return wildcard

    # Remaining operands are sets or bare scalars. Normalise both to sets so
    # the size limit applies uniformly, including the scalar/scalar case.
    base_values = base if isinstance(base, set) else {base}
    new_values = new if isinstance(new, set) else {new}
    merged_set = base_values | new_values
    if len(merged_set) > max_unique:
        return wildcard
    return merged_set


def extract_structure_unique(data, max_unique=0):
    """Build a structure summary of a parsed-JSON value.

    * A dict becomes a dict with the same keys and summarised values.
    * A list becomes ``{"[]": summary}``, where ``summary`` is the merge of
      the summaries of all its elements; an empty list gives ``{"[]": {}}``.
    * Any other value is returned as a one-element set ``{data}``.

    Args:
        data: Value to summarise (dict, list or scalar).
        max_unique: Forwarded to ``merge_structures`` when the elements of a
            list are merged; the default of 0 keeps the previous behaviour.

    Returns:
        The structure summary of ``data``.
    """
    if isinstance(data, dict):
        return {k: extract_structure_unique(v, max_unique) for k, v in data.items()}

    if isinstance(data, list):
        if not data:
            return {"[]": {}}  # empty list: nothing known about its elements
        # Start from the first element and fold in every following one.
        struct = extract_structure_unique(data[0], max_unique)
        for item in data[1:]:
            struct = merge_structures(
                struct, extract_structure_unique(item, max_unique), max_unique
            )
        return {"[]": struct}

    # Scalar value (str, int, bool, None, ...).
    return {data}


def sets_to_lists(obj):
    """Return a copy of ``obj`` in which every set is replaced by a list.

    Dicts and lists are traversed recursively; any other value is returned
    unchanged. Set elements are emitted in sorted order so that serialising
    the result gives the same text on every run (plain set iteration order
    can change between interpreter runs).

    Args:
        obj: Nested combination of dicts, lists, sets and scalars.

    Returns:
        The same nesting with sets converted to sorted lists.
    """
    if isinstance(obj, dict):
        return {k: sets_to_lists(v) for k, v in obj.items()}
    if isinstance(obj, set):
        try:
            ordered = sorted(obj)
        except TypeError:
            # Elements of different types (e.g. None and int) cannot be
            # compared directly; order them by type name, then by repr.
            ordered = sorted(obj, key=lambda v: (type(v).__name__, repr(v)))
        return [sets_to_lists(v) for v in ordered]
    if isinstance(obj, list):
        return [sets_to_lists(v) for v in obj]
    return obj



# Structure summaries are written to the "estructuras" subfolder of data_dir.
output_dir = os.path.join(data_dir, "estructuras")
os.makedirs(output_dir, exist_ok=True)

# Athlete files to process, matched by the "*.athletes.json" pattern.
athletes_files = glob.glob(os.path.join(data_dir, "*.athletes.json"))

def get_race_id(filepath):
    """Return the race id, i.e. the part of the file name before its first dot."""
    file_name = os.path.basename(filepath)
    race_id = file_name.split(".", 1)[0]
    return race_id


ATHLETES_SUFFIX = ".athletes.json"
ISSUES_SUFFIX = ".issues.json"


def _load_json(path):
    """Return the parsed content of the UTF-8 JSON file at ``path``."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _save_structure(records, output_path):
    """Merge the structure of every record and write it to ``output_path`` as JSON."""
    structure = {}
    for record in records:
        structure = merge_structures(structure, extract_structure_unique(record))
    with open(output_path, "w", encoding="utf-8") as f:
        # Sets are not JSON-serialisable, so they are converted to lists first.
        json.dump(sets_to_lists(structure), f, indent=4, ensure_ascii=False)


for a_file in athletes_files:
    race_id = get_race_id(a_file)
    print(f"\nProcesando carrera: {race_id}")

    # ATHLETES: summarise the structure of all records and save it.
    output_path_ath = os.path.join(output_dir, f"{race_id}_structure_athletes.json")
    _save_structure(_load_json(a_file), output_path_ath)
    print(f"Estructura ATHLETES guardada en: {output_path_ath}")

    # ISSUES: same race, sibling file. Only the trailing suffix is swapped;
    # str.replace would also rewrite a matching directory name, and would
    # return the athletes path unchanged if the suffix differed in case.
    i_file = a_file[: -len(ATHLETES_SUFFIX)] + ISSUES_SUFFIX

    if os.path.exists(i_file):
        output_path_iss = os.path.join(output_dir, f"{race_id}_structure_issues.json")
        _save_structure(_load_json(i_file), output_path_iss)
        print(f"Estructura ISSUES guardada en: {output_path_iss}")
    else:
        print("No se encontró archivo ISSUES para esta carrera.")


In [None]:
## TODO: change this cell so the export can also be done per race

import json
import os
import glob

ATHLETE_ID = 'L6554352'  # change to the id of the athlete to export

# Locate the input files.
athletes_files = glob.glob(os.path.join(data_dir, "*.athletes.json"))
issues_files = glob.glob(os.path.join(data_dir, "*.issues.json"))

if not athletes_files:
    raise FileNotFoundError("No se encontró ningún archivo de atletas en el directorio.")
if not issues_files:
    raise FileNotFoundError("No se encontró ningún archivo de issues en el directorio.")

# Load the data. Both accumulators are created here so the cell runs on a
# fresh kernel and does not pile up duplicates when it is re-run.
athletes_data = []
for fpath in athletes_files:
    with open(fpath, "r", encoding="utf-8") as f:
        athletes_data.extend(json.load(f))  # add every athlete of every file

issues_data = []
for fpath in issues_files:
    with open(fpath, "r", encoding="utf-8") as f:
        issues_data.extend(json.load(f))  # add every top-level issue entry

# Find the athlete; ids are compared as strings so 123 and "123" match.
athlete = next((a for a in athletes_data if str(a.get("id")) == str(ATHLETE_ID)), None)
if athlete is None:
    raise ValueError(f"No se encontró ningún atleta con id {ATHLETE_ID}")

# Collect the athlete's issues. A top-level entry may be a list of issues or
# a single issue dict; both are checked with the same matching rule.
athlete_issues = []
for entry in issues_data:
    candidates = entry if isinstance(entry, list) else [entry]
    for issue in candidates:
        if not isinstance(issue, dict):
            continue  # nothing to match on; skip instead of failing on .get
        issue_athlete_id = issue.get("athlete_id")
        if str(issue_athlete_id) == str(ATHLETE_ID) or issue_athlete_id == athlete.get("id"):
            athlete_issues.append(issue)

output_athlete = os.path.join(data_dir, f"athlete_{ATHLETE_ID}.json")
output_issues = os.path.join(data_dir, f"athlete_{ATHLETE_ID}_issues.json")

with open(output_athlete, "w", encoding="utf-8") as f:
    json.dump(athlete, f, ensure_ascii=False, indent=4)

with open(output_issues, "w", encoding="utf-8") as f:
    json.dump(athlete_issues, f, ensure_ascii=False, indent=4)

print(f"Archivos guardados:\n - {output_athlete}\n - {output_issues}")

In [None]:
athletes_time_df = dfs['athletes_time_df']

import pandas as pd
import numpy as np


def _report_pair_agreement(df, pairs):
    """Print, for each column pair, whether the columns agree and by how much they differ.

    Only rows where both columns are non-null are compared. For each pair the
    report shows whether all compared rows are equal and the mean of the
    absolute differences.

    Args:
        df: DataFrame holding every column named in ``pairs``.
        pairs: Iterable of ``(col1, col2)`` column-name tuples.
    """
    for col1, col2 in pairs:
        df_pair = df[[col1, col2]].dropna()
        if df_pair.empty:
            # .all() of an empty comparison is True and the mean is NaN, which
            # would read as a perfect match; report the lack of data instead.
            print(f"{col1} vs {col2}: no rows with both values present, nothing to compare")
            continue
        exact_match = (df_pair[col1] == df_pair[col2]).all()
        diff_mean = (df_pair[col1] - df_pair[col2]).abs().mean()
        print(f"{col1} vs {col2}: Exact match? {exact_match}, Mean absolute difference: {diff_mean}")


# Column pairs compared as numeric values.
numeric_pairs = [
    ("netTime", "time"),
    ("originalTime", "raw_times_official"),
    ("originalTime", "raw_times_real"),
    ("raw_backupOffset", "offset"),
    ("raw_times_official", "raw_times_real"),
    ("raw_times_rawTime", "raw_rawTime"),
]

# Column pairs compared as datetimes (presumably already parsed; verify upstream).
datetime_pairs = [
    ("rawTime", "raw_rawTime"),
    ("raw_originalTime", "raw_times_rawTime"),
]

_report_pair_agreement(athletes_time_df, numeric_pairs)
_report_pair_agreement(athletes_time_df, datetime_pairs)

In [None]:
import os

# Destination folder. The default is the original machine-specific location;
# set the CALCULAR_PESO_OUTPUT_DIR environment variable to run the notebook
# on another machine without editing this cell.
output_folder = os.environ.get(
    "CALCULAR_PESO_OUTPUT_DIR",
    r"C:\Users\mario\Desktop\MasterCienciadeDatos\TFM\TFM_MarioSoto\raiz\datos\historicos\calcular_peso",
)
os.makedirs(output_folder, exist_ok=True)

# Each DataFrame is written twice: as Parquet and as a gzip-compressed CSV.
for name, df in dfs_final.items():
    print(f"Guardando {name}...")

    parquet_path = os.path.join(output_folder, f"{name}.parquet")
    df.to_parquet(parquet_path, index=False)

    csv_path = os.path.join(output_folder, f"{name}.csv.gz")
    df.to_csv(csv_path, index=False, compression='gzip')

print("\n✅ Todos los DataFrames se han guardado correctamente en:")
print(output_folder)