In [1]:

import os, time, pandas as pd, numpy as np

def to_int(df, cols):
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce").astype("Int64")
    return df

def to_datetime(df, cols):
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_datetime(df[c], errors="coerce")
    return df

def drop_dups(df, subset):
    if all(c in df.columns for c in subset):
        return df.drop_duplicates(subset=subset)
    return df.drop_duplicates()

def validate_fk(df_from, col_from, df_to, col_to):
    if col_from not in df_from.columns or col_to not in df_to.columns:
        return False, f"Columns not found: {col_from} or {col_to}"
    missing = ~df_from[col_from].isin(df_to[col_to])
    return missing.sum() == 0, f"Unmatched rows: {missing.sum()}"

start = time.time()
print("Start: CLEAN -> FINAL (joins first, then advanced cleaning)")

cwd = os.getcwd()
files = {
    "player": "player_clean.csv",
    "team": "team_clean.csv",
    "game": "game_clean.csv",
    "summary": "game_summary_clean.csv",
    "stats": "other_stats_clean.csv"
}

dfs = {}
for k, fname in files.items():
    p = os.path.join(cwd, fname)
    if os.path.exists(p):
        dfs[k] = pd.read_csv(p, low_memory=False)
        print(f"Loaded {fname} with shape {dfs[k].shape}")
    else:
        print(f"Warning: {fname} not found")

player = dfs.get("player", pd.DataFrame())
team = dfs.get("team", pd.DataFrame())
game = dfs.get("game", pd.DataFrame())
summary = dfs.get("summary", pd.DataFrame())
stats = dfs.get("stats", pd.DataFrame())

# Ensure key types for joining
for df, ids in [(player, ["player_id"]), (team, ["team_id"]), (game, ["game_id","home_team_id","visitor_team_id","season"]), (summary, ["game_id","season"]), (stats, ["game_id","player_id","team_id","season"])]:
    if not df.empty:
        to_int(df, [c for c in ids if c in df.columns])
        to_datetime(df, [c for c in df.columns if "date" in c])

# Stage 1: Joins and FK checks
if not game.empty and not team.empty and "team_id" in team.columns:
    for col in ["home_team_id","visitor_team_id"]:
        if col in game.columns:
            ok, msg = validate_fk(game, col, team, "team_id")
            print(f"FK {col} -> team.team_id: {ok}. {msg}")

if not stats.empty and not player.empty and "player_id" in player.columns:
    ok_p, msg_p = validate_fk(stats, "player_id", player, "player_id")
    print(f"FK stats.player_id -> player.player_id: {ok_p}. {msg_p}")
if not stats.empty and not game.empty and "game_id" in game.columns:
    ok_g, msg_g = validate_fk(stats, "game_id", game, "game_id")
    print(f"FK stats.game_id -> game.game_id: {ok_g}. {msg_g}")

if not summary.empty and not game.empty and "game_id" in game.columns:
    ok_s, msg_s = validate_fk(summary, "game_id", game, "game_id")
    print(f"Coverage summary.game_id in game.game_id: {ok_s}. {msg_s}")

# Stage 2: Advanced cleaning
if not team.empty:
    team = drop_dups(team, ["team_id"])
if not player.empty:
    player = drop_dups(player, ["player_id"])
if not game.empty:
    game = drop_dups(game, ["game_id"])
if not stats.empty and all(c in stats.columns for c in ["game_id","player_id"]):
    stats = drop_dups(stats, ["game_id","player_id"])
if not summary.empty:
    summary = drop_dups(summary, ["game_id"])

if not game.empty and "home_team_id" in game.columns and "visitor_team_id" in game.columns:
    bad = game[game["home_team_id"] == game["visitor_team_id"]]
    if not bad.empty:
        print(f"Dropping {len(bad)} rows where home_team_id == visitor_team_id")
        game = game[game["home_team_id"] != game["visitor_team_id"]]

to_int(game, ["game_id","home_team_id","visitor_team_id","season"])
to_int(team, ["team_id"])
to_int(player, ["player_id"])
to_int(stats, ["game_id","player_id","team_id","season"])

# ============================================================
# EXTRA: Correcciones y normalizaciÃ³n (corregida)
# ============================================================
print("Starting deduplication, mapping and normalization for other_stats...")

# 1. Deduplicar other_stats por game_id
if not stats.empty and "game_id" in stats.columns:
    before = len(stats)
    stats = stats.drop_duplicates(subset=["game_id"], keep="first")
    after = len(stats)
    print(f"Removed {before - after} duplicate rows by game_id from other_stats")

# 2. Mapear team IDs si existe un archivo team_xref.csv
xref_path = os.path.join(cwd, "team_xref.csv")
if os.path.exists(xref_path):
    xref = pd.read_csv(xref_path)
    print(f"Loaded team_xref.csv with {xref.shape[0]} mappings")
    mapping = dict(zip(xref["id_source"], xref["id_canonical"]))
    for df_name, df in [("game", game), ("stats", stats)]:
        for col in ["team_id_home", "team_id_away"]:
            if col in df.columns:
                df[col] = df[col].replace(mapping)
                print(f"Applied team ID mapping to {df_name}.{col}")
else:
    print("No team_xref.csv found, skipping team ID mapping step.")

# 3. Normalizar other_stats a nivel equipo-partido (corregido)
home_cols = [c for c in stats.columns if c.endswith("_home")]
away_cols = [c for c in stats.columns if c.endswith("_away")]
base_cols = [c for c in stats.columns if not (c.endswith("_home") or c.endswith("_away"))]

if home_cols and away_cols:
    df_home = stats[base_cols + home_cols].copy()
    df_home.columns = base_cols + [c.replace("_home", "") for c in home_cols]
    df_home["venue"] = "home"
    if "team_id" not in df_home.columns and "team_id_home" in stats.columns:
        df_home["team_id"] = stats["team_id_home"].values

    df_away = stats[base_cols + away_cols].copy()
    df_away.columns = base_cols + [c.replace("_away", "") for c in away_cols]
    df_away["venue"] = "away"
    if "team_id" not in df_away.columns and "team_id_away" in stats.columns:
        df_away["team_id"] = stats["team_id_away"].values

    fact_team_game = pd.concat([df_home, df_away], ignore_index=True)
    assert "game_id" in fact_team_game.columns, "game_id missing after normalization"
    assert "team_id" in fact_team_game.columns, "team_id missing after normalization"
    fact_team_game = fact_team_game.dropna(subset=["team_id","game_id"])
    fact_team_game["team_id"] = pd.to_numeric(fact_team_game["team_id"], errors="coerce").astype("Int64")
    fact_team_game["game_id"] = pd.to_numeric(fact_team_game["game_id"], errors="coerce").astype("Int64")

    out_team_game = os.path.join(cwd, "fact_team_game_final.csv")
    fact_team_game.to_csv(out_team_game, index=False, encoding="utf-8-sig")
    print(f"fact_team_game_final.csv created with {fact_team_game.shape[0]} rows, {fact_team_game.shape[1]} cols")
else:
    print("No *_home / *_away columns detected, skipping normalization.")

# ============================================================
# Export final tables
# ============================================================
outputs = {
    "team_final.csv": team,
    "player_final.csv": player,
    "game_final.csv": game,
    "game_summary_final.csv": summary,
    "other_stats_final.csv": stats
}
for fname, df in outputs.items():
    outp = os.path.join(cwd, fname)
    df.to_csv(outp, index=False, encoding="utf-8-sig")
    print(f"Wrote {fname} with shape {df.shape}")

print("Primary keys prepared: dim_team(team_id), dim_player(player_id), fact_game(game_id), fact_team_game(game_id, team_id)")
print("Foreign keys validated: game.home_team_id -> team.team_id, game.visitor_team_id -> team.team_id, stats.game_id -> game.game_id, stats.player_id -> player.player_id")

end = time.time()
print(f"Completed CLEAN -> FINAL in {end - start:.2f} seconds")

Start: CLEAN -> FINAL (joins first, then advanced cleaning)
Loaded player_clean.csv with shape (4831, 5)
Loaded team_clean.csv with shape (30, 7)
Loaded game_clean.csv with shape (65698, 55)
Loaded game_summary_clean.csv with shape (58110, 14)
Loaded other_stats_clean.csv with shape (28271, 26)
FK stats.game_id -> game.game_id: True. Unmatched rows: 0
Coverage summary.game_id in game.game_id: True. Unmatched rows: 0
Starting deduplication, mapping and normalization for other_stats...
Removed 10 duplicate rows by game_id from other_stats
No team_xref.csv found, skipping team ID mapping step.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = pd.to_numeric(df[c], errors="coerce").astype("Int64")


fact_team_game_final.csv created with 56522 rows, 16 cols
Wrote team_final.csv with shape (30, 7)
Wrote player_final.csv with shape (4831, 5)
Wrote game_final.csv with shape (65642, 55)
Wrote game_summary_final.csv with shape (58021, 14)
Wrote other_stats_final.csv with shape (28261, 26)
Primary keys prepared: dim_team(team_id), dim_player(player_id), fact_game(game_id), fact_team_game(game_id, team_id)
Foreign keys validated: game.home_team_id -> team.team_id, game.visitor_team_id -> team.team_id, stats.game_id -> game.game_id, stats.player_id -> player.player_id
Completed CLEAN -> FINAL in 4.20 seconds
