In [None]:
#Importar librerias
import pandas as pd
from datetime import datetime

# Read the two raw player tables from disk.
player = pd.read_csv("data/raw/player.csv")
common = pd.read_csv("data/raw/common_player_info.csv")

# Left-join the detailed player info onto the base player list
# (player.id <-> common.person_id). Players without a match are kept and
# get NaN in every column that comes from `common`.
player_enriched = pd.merge(
    player, common, how='left', left_on='id', right_on='person_id'
)

In [None]:
# Columns treated as redundant in the merged table (name variants, slugs,
# team codes and a season flag).
cols_drop = [
    'first_name',
    'last_name',
    'display_first_last',
    'display_last_comma_first',
    'display_fi_last',
    'player_slug',
    'last_affiliation',
    'team_code',
    'team_abbreviation',
    'playercode',
    'games_played_current_season_flag',
]

# errors='ignore' skips any listed column that is not present, so the cell
# can be re-run without raising.
player_enriched = player_enriched.drop(columns=cols_drop, errors='ignore')

In [None]:
# 1.3 Age column ("edad"): age in completed calendar years as of today.
# Unparseable or missing birthdates become NaT and yield NaN ages.
player_enriched['birthdate'] = pd.to_datetime(player_enriched['birthdate'], errors='coerce')
_birth = player_enriched['birthdate']
_today = datetime.now()
# True when this year's birthday has not happened yet. Comparing month/day
# directly (instead of days // 365) avoids the drift caused by leap days,
# which made ages one year too high shortly before a birthday.
# Comparisons against NaT evaluate to False, so NaT rows stay NaN through
# the year subtraction below.
_birthday_pending = (
    (_birth.dt.month > _today.month)
    | ((_birth.dt.month == _today.month) & (_birth.dt.day > _today.day))
)
player_enriched['edad'] = _today.year - _birth.dt.year - _birthday_pending.astype(int)

# 1.4 Career-span column: to_year minus from_year.
player_enriched['a√±os_jugando'] = player_enriched['to_year'] - player_enriched['from_year']

# 1.5 "undrafted" flag: 1 where draft_year is null, 0 otherwise.
# NOTE(review): this only detects missing values. If the raw data marks
# undrafted players with a literal value (e.g. the text "Undrafted") rather
# than NaN, they will be flagged 0 -- confirm against the raw file.
player_enriched['undrafted'] = player_enriched['draft_year'].isna().astype(int)

# 1.6 Drop birthdate now that the age has been derived from it.
player_enriched = player_enriched.drop(columns=['birthdate'], errors='ignore')

# Export the cleaned table with lower-cased column names.
player_enriched.columns = player_enriched.columns.str.lower()
player_enriched.to_csv("data/clean/player_enriched.csv", index=False)
print("‚úÖ player_enriched creado con columnas limpias y calculadas")

In [None]:
def _clean_aux_table(name, drop_cols):
    """Clean one auxiliary table and persist it.

    Reads data/raw/<name>.csv, drops the columns in `drop_cols` (columns that
    are not present are ignored), lower-cases the column names, writes the
    result to data/clean/<name>.csv without the index and returns the frame.
    """
    table = pd.read_csv(f"data/raw/{name}.csv")
    table = table.drop(columns=drop_cols, errors='ignore')
    table.columns = table.columns.str.lower()
    table.to_csv(f"data/clean/{name}.csv", index=False)
    return table


# Same three tables, same order and same dropped columns as before; each
# result stays available under its usual name.
team = _clean_aux_table("team", ['abbreviation'])
other_stats = _clean_aux_table("other_stats", ['league_id', 'team_abbreviation_home'])
game_summary = _clean_aux_table("game_summary", ['wh_status'])

print("‚úÖ Tablas auxiliares limpiadas correctamente")

In [None]:
# Reload the cleaned player table from disk so this cell does not depend on
# the in-memory frame built earlier.
player_enriched = pd.read_csv("data/clean/player_enriched.csv")

# Keep only the rows whose nba_flag equals 1.
# NOTE(review): this assumes nba_flag is numeric. If the source encodes it
# as text (e.g. 'Y'/'N') the comparison matches nothing and the output file
# will be empty -- confirm the column's values.
player_enrichedF = player_enriched.loc[player_enriched['nba_flag'].eq(1)]

player_enrichedF.to_csv("data/clean/player_enrichedF.csv", index=False)
print("‚úÖ Jugadores filtrados (solo con nba_flag = 1)")

In [None]:
# --- GAME + TEAM -> analysis table ---
# Attach the cleaned team attributes of the home team to every game row
# (game.team_id_home <-> team.id). Games without a matching team are kept.
game = pd.read_csv("data/raw/game.csv")
team = pd.read_csv("data/clean/team.csv")

game_team = game.merge(team, how='left', left_on='team_id_home', right_on='id')
game_team.columns = game_team.columns.str.lower()
game_team.to_csv("data/clean/game_team.csv", index=False)
print("‚úÖ game_team creado correctamente")

# --- GAME_TEAM + OTHER_STATS -> analysis table ---
other_stats = pd.read_csv("data/clean/other_stats.csv")

# Join on the single shared key. The key used to be listed twice
# (['game_id', 'game_id']), which is redundant at best.
# Columns present in both frames (other than game_id) receive pandas'
# default _x/_y suffixes.
# NOTE(review): if other_stats has more than one row per game_id the left
# join will duplicate game rows -- confirm key uniqueness.
game_team_other_stats = game_team.merge(
    other_stats,
    how='left',
    on='game_id'
)
game_team_other_stats.columns = game_team_other_stats.columns.str.lower()
game_team_other_stats.to_csv("data/clean/game_team_other_stats.csv", index=False)

print("‚úÖ game_team_other_stats creado correctamente")

In [None]:
#Tratamiento de Nulos 

import pandas as pd, os

# Directory holding the intermediate tables to be null-cleaned.
path = "data/clean/"

# File names to process, in order.
# NOTE(review): the visible cells only read game.csv from data/raw/ and never
# write data/clean/game.csv -- confirm that file exists before running.
files = [
    f"{tabla}.csv"
    for tabla in (
        "game_team",
        "game_summary",
        "other_stats",
        "player_enriched",
        "team",
        "game_team_other_stats",
        "game",
    )
]

# ------------------------------------------------------
# Generic null-cleaning function
# ------------------------------------------------------
def limpiar_nulos(df):
    """Return a copy of `df` with missing values removed or imputed.

    Steps, applied in order:
      1. Drop columns whose share of nulls is 80% or more.
      2. Drop rows with a null in any identifier column. A column counts as
         an identifier when ``id`` is one of its underscore-separated name
         tokens (``id``, ``game_id``, ``team_id_home`` ...), compared
         case-insensitively.
      3. Numeric columns: fill nulls with the column median when fewer than
         30% of the values are null, otherwise with 0.
      4. Object (text) columns: fill nulls with ``'Desconocido'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the steps above applied.
    """
    # 1. Keep sufficiently populated columns. The explicit copy guarantees
    #    the column assignments below never write into the caller's frame.
    df = df.loc[:, df.isna().mean() < 0.8].copy()

    # 2. Identifier columns. Matching whole name tokens instead of the bare
    #    substring 'id' avoids false positives such as 'video_available',
    #    which previously caused valid rows to be dropped.
    id_cols = [c for c in df.columns if 'id' in str(c).lower().split('_')]
    if id_cols:
        df = df.dropna(subset=id_cols)

    # 3. Numeric columns: median for lightly affected columns, 0 otherwise.
    #    The ratio is measured after the row drop in step 2.
    for col in df.select_dtypes(include='number').columns:
        null_ratio = df[col].isna().mean()
        if null_ratio > 0:
            fill_value = df[col].median() if null_ratio < 0.3 else 0
            df[col] = df[col].fillna(fill_value)

    # 4. Text columns: explicit placeholder instead of NaN.
    for col in df.select_dtypes(include='object').columns:
        df[col] = df[col].fillna('Desconocido')

    return df

# ------------------------------------------------------
# Load, clean and save every table listed in `files`
# ------------------------------------------------------
# Make sure the output directory exists; to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
out_dir = "data/final"
os.makedirs(out_dir, exist_ok=True)

for f in files:
    file_path = os.path.join(path, f)

    # For game/stats tables, read the team id columns as nullable integers
    # (Int64) so that missing ids do not turn them into floats.
    if "game" in f.lower() or "stats" in f.lower():
        df = pd.read_csv(
            file_path,
            dtype={
                'team_id_home': 'Int64',
                'team_id_away': 'Int64'
            },
            low_memory=False
        )
    else:
        df = pd.read_csv(file_path, low_memory=False)

    # Apply the shared null-cleaning rules.
    df = limpiar_nulos(df)

    # Save next to the other final tables as <name>_clean.csv.
    out_path = os.path.join(out_dir, f.replace('.csv', '_clean.csv'))
    df.to_csv(out_path, index=False)

    print(f"‚úÖ {f} limpiado ‚Üí {out_path}")

In [None]:
import pandas as pd
import os

# Directory holding the null-cleaned tables.
path = "data/final/"

# Cleaned files to summarise, in order.
files = [
    f"{tabla}_clean.csv"
    for tabla in (
        "game",
        "game_team",
        "game_summary",
        "other_stats",
        "player_enriched",
        "team",
        "game_team_other_stats",
    )
]

# Print descriptive statistics for a table, one numeric column at a time.
def mostrar_estadisticas(df, tabla):
    """Print ``describe()`` for the numeric columns of `df`.

    A banner with the upper-cased table name is printed first. If the frame
    has no numeric columns a warning is printed and the function returns.
    Otherwise every numeric column is described, and then the first five
    numeric columns are described a second time using a while loop.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to summarise.
    tabla : str
        Label shown in the banner.

    Returns
    -------
    None
    """
    separador = '=' * 70
    print(f"\n{separador}")
    print(f"üìä ESTAD√çSTICAS DESCRIPTIVAS ‚Äì {tabla.upper()}")
    print(separador)

    columnas_num = df.select_dtypes(include='number').columns

    # Guard clause: nothing to describe.
    if columnas_num.empty:
        print("‚ö†Ô∏è No hay columnas num√©ricas en esta tabla.")
        return

    # Full pass over every numeric column.
    for nombre in columnas_num:
        print(f"\n‚û°Ô∏è Columna: {nombre}")
        print(df[nombre].describe())

    # Second pass with a while loop, limited to the first five columns.
    guiones = '-' * 50
    print(f"\n{guiones}\nPrimeras 5 columnas num√©ricas (while loop):")
    limite = min(5, len(columnas_num))
    pos = 0
    while pos < limite:
        nombre = columnas_num[pos]
        print(f"\nüîπ Columna (while): {nombre}")
        print(df[nombre].describe())
        pos += 1

# Load each cleaned file and print its descriptive statistics; the file name
# doubles as the table label in the report banner.
for f in files:
    df = pd.read_csv(os.path.join(path, f))
    mostrar_estadisticas(df=df, tabla=f)


REVISAR NULOS


In [None]:
#REVISION DE NULOS RESULTANTES

import pandas as pd
import os

# Directory holding the null-cleaned tables.
path = "data/final/"

# Cleaned files whose remaining nulls are reported, in order.
files = [
    f"{tabla}_clean.csv"
    for tabla in (
        "game_team",
        "game_summary",
        "other_stats",
        "player_enriched",
        "team",
        "game_team_other_stats",
        "game",
    )
]

# For each file, print its row count and the percentage of nulls per column,
# highest first.
for f in files:
    df = pd.read_csv(os.path.join(path, f))
    print(f"\nüìä --- {f} ---")
    print("Filas:", len(df))
    print("Porcentaje de nulos por columna:")
    pct_nulos = df.isna().mean() * 100
    pct_nulos = pct_nulos.round(2).sort_values(ascending=False)
    # head(10000) caps the listing at 10000 columns.
    print(pct_nulos.head(10000))

In [None]:
# Debug check: list the column names of the player_enriched frame currently
# in memory (last rebound when data/clean/player_enriched.csv was re-read in
# the NBA-filter cell).
print(player_enriched.columns)
