In [66]:
#Importar librerias
import pandas as pd
from datetime import datetime

# Cargar datasets
player = pd.read_csv("data/raw/player.csv")
common = pd.read_csv("data/raw/common_player_info.csv")

# Unir tablas
player_enriched = player.merge(
    common,
    how='left',
    left_on='id',
    right_on='person_id'
)

In [67]:
#Eliminar columnas redundantes de la nueva tabla 
cols_drop = [
    'first_name','last_name','display_first_last','display_last_comma_first',
    'display_fi_last','player_slug','last_affiliation','team_code',
    'team_abbreviation','playercode','games_played_current_season_flag'
]

player_enriched.drop(columns=cols_drop, inplace=True, errors='ignore')

In [68]:
# 1.3 Crear columna edad
player_enriched['birthdate'] = pd.to_datetime(player_enriched['birthdate'], errors='coerce')
player_enriched['edad'] = (datetime.now() - player_enriched['birthdate']).dt.days // 365

# 1.4 Crear columna "años_jugando"
player_enriched['años_jugando'] = player_enriched['to_year'] - player_enriched['from_year']

# 1.5 Crear columna "undrafted"
player_enriched['undrafted'] = player_enriched['draft_year'].isna().astype(int)

# 1.6 Eliminar birthdate por redundancia
player_enriched = player_enriched.drop(columns=['birthdate'], errors='ignore')

# Exportar tabla limpia
player_enriched.columns = player_enriched.columns.str.lower()
player_enriched.to_csv("data/clean/player_enriched.csv", index=False)
print("✅ player_enriched creado con columnas limpias y calculadas")

✅ player_enriched creado con columnas limpias y calculadas


In [69]:
# --- Tabla TEAM ---
team = pd.read_csv("data/raw/team.csv")
team = team.drop(columns=['abbreviation'], errors='ignore')
team.columns = team.columns.str.lower()
team.to_csv("data/clean/team.csv", index=False)

# --- Tabla OTHER_STATS ---
other_stats = pd.read_csv("data/raw/other_stats.csv")
other_stats = other_stats.drop(columns=['league_id','team_abbreviation_home'], errors='ignore')
other_stats.columns = other_stats.columns.str.lower()
other_stats.to_csv("data/clean/other_stats.csv", index=False)

# --- Tabla GAME_SUMMARY ---
game_summary = pd.read_csv("data/raw/game_summary.csv")
game_summary = game_summary.drop(columns=['wh_status'], errors='ignore')
game_summary.columns = game_summary.columns.str.lower()
game_summary.to_csv("data/clean/game_summary.csv", index=False)

print("✅ Tablas auxiliares limpiadas correctamente")

✅ Tablas auxiliares limpiadas correctamente


In [70]:
#Filtramos jugadorees de NBA 
player_enriched = pd.read_csv("data/clean/player_enriched.csv")

# Mantener solo jugadores con experiencia NBA
player_enrichedF = player_enriched[player_enriched['nba_flag'] == 1]

player_enrichedF.to_csv("data/clean/player_enrichedF.csv", index=False)
print("✅ Jugadores filtrados (solo con nba_flag = 1)")

✅ Jugadores filtrados (solo con nba_flag = 1)


In [71]:
# --- GAME + TEAM -> Tabla para análisis  ---
game = pd.read_csv("data/raw/game.csv")
team = pd.read_csv("data/clean/team.csv")

game_team = game.merge(team, how='left', left_on='team_id_home', right_on='id')
game_team.columns = game_team.columns.str.lower()
game_team.to_csv("data/clean/game_team.csv", index=False)
print("✅ game_team creado correctamente")

# --- GAME_TEAM + OTHER_STATS -> Tabla para análisis ---
other_stats = pd.read_csv("data/clean/other_stats.csv")

game_team_other_stats = game_team.merge(
    other_stats,
    how='left',
    on=['game_id','game_id']
)
game_team_other_stats.columns = game_team_other_stats.columns.str.lower()
game_team_other_stats.to_csv("data/clean/game_team_other_stats.csv", index=False)

print("✅ game_team_other_stats creado correctamente")

✅ game_team creado correctamente
✅ game_team_other_stats creado correctamente


In [72]:
#Tratamiento de Nulos 

import pandas as pd, os

# Ruta base
path = "data/clean/"
files = [
    "game_team.csv",
    "game_summary.csv",
    "other_stats.csv",
    "player_enriched.csv",
    "team.csv",
    "game_team_other_stats.csv",
    "game.csv"
]

# ------------------------------------------------------
# Función general de limpieza de nulos
# ------------------------------------------------------
def limpiar_nulos(df):
    # 1. Eliminar columnas con más del 80% de nulos
    df = df.loc[:, df.isna().mean() < 0.8]

    # 2. Identificadores críticos → eliminar filas con nulos
    id_cols = [c for c in df.columns if 'id' in c]
    if id_cols:
        df = df.dropna(subset=id_cols)

    # 3. Numéricos → rellenar con mediana o 0
    num_cols = df.select_dtypes(include='number').columns
    for col in num_cols:
        if df[col].isna().mean() > 0:
            if df[col].isna().mean() < 0.3:
                df[col] = df[col].fillna(df[col].median())
            else:
                df[col] = df[col].fillna(0)

    # 4. Categóricos → “Desconocido”
    cat_cols = df.select_dtypes(include='object').columns
    for col in cat_cols:
        df[col] = df[col].fillna('Desconocido')

    return df

# ------------------------------------------------------
# Bucle de carga y limpieza de todas las tablas
# ------------------------------------------------------
for f in files:
    file_path = os.path.join(path, f)

    # Lectura robusta: tipificar columnas problemáticas
    if "game" in f.lower() or "stats" in f.lower():
        df = pd.read_csv(
            file_path,
            dtype={
                'team_id_home': 'Int64',
                'team_id_away': 'Int64'
            },
            low_memory=False
        )
    else:
        df = pd.read_csv(file_path, low_memory=False)

    # Aplicar limpieza
    df = limpiar_nulos(df)

    # Guardar resultado limpio
    out_path = os.path.join("data/final", f.replace('.csv', '_clean.csv'))
    df.to_csv(out_path, index=False)

    print(f"✅ {f} limpiado → {out_path}")

✅ game_team.csv limpiado → data/final\game_team_clean.csv
✅ game_summary.csv limpiado → data/final\game_summary_clean.csv
✅ other_stats.csv limpiado → data/final\other_stats_clean.csv
✅ player_enriched.csv limpiado → data/final\player_enriched_clean.csv
✅ team.csv limpiado → data/final\team_clean.csv
✅ game_team_other_stats.csv limpiado → data/final\game_team_other_stats_clean.csv
✅ game.csv limpiado → data/final\game_clean.csv


In [74]:
import pandas as pd
import os

path = "data/final/"

# Lista de archivos con los que estás trabajando
files = [
    "game_clean.csv",
    "game_team_clean.csv",
    "game_summary_clean.csv",
    "other_stats_clean.csv",
    "player_enriched_clean.csv",
    "team_clean.csv",
    "game_team_other_stats_clean.csv"
]

# Función para imprimir estadísticas descriptivas columna por columna
def mostrar_estadisticas(df, tabla):
    print(f"\n{'='*70}")
    print(f"📊 ESTADÍSTICAS DESCRIPTIVAS – {tabla.upper()}")
    print(f"{'='*70}")

    # Seleccionar solo columnas numéricas
    num_cols = df.select_dtypes(include='number').columns

    if len(num_cols) == 0:
        print("⚠️ No hay columnas numéricas en esta tabla.")
        return

    # For loop completo: muestra describe() de todas las numéricas
    for col in num_cols:
        print(f"\n➡️ Columna: {col}")
        print(df[col].describe())

    # While loop de ejemplo: solo primeras 5 columnas numéricas
    print(f"\n{'-'*50}\nPrimeras 5 columnas numéricas (while loop):")
    i = 0
    while i < min(5, len(num_cols)):
        col = num_cols[i]
        print(f"\n🔹 Columna (while): {col}")
        print(df[col].describe())
        i += 1

# Ejecutar el análisis sobre cada archivo
for f in files:
    df = pd.read_csv(os.path.join(path, f))
    mostrar_estadisticas(df, f)



📊 ESTADÍSTICAS DESCRIPTIVAS – GAME_CLEAN.CSV

➡️ Columna: season_id
count    65698.000000
mean     22949.338747
std       5000.305500
min      12005.000000
25%      21981.000000
50%      21997.000000
75%      22011.000000
max      42022.000000
Name: season_id, dtype: float64

➡️ Columna: team_id_home
count    6.569800e+04
mean     1.609926e+09
std      3.324313e+07
min      4.500000e+01
25%      1.610613e+09
50%      1.610613e+09
75%      1.610613e+09
max      1.610617e+09
Name: team_id_home, dtype: float64

➡️ Columna: game_id
count    6.569800e+04
mean     2.584747e+07
std      6.303760e+06
min      1.050000e+07
25%      2.130053e+07
50%      2.630007e+07
75%      2.880069e+07
max      4.980009e+07
Name: game_id, dtype: float64

➡️ Columna: min
count    65698.000000
mean       221.003486
std         67.903521
min          0.000000
25%        240.000000
50%        240.000000
75%        240.000000
max        365.000000
Name: min, dtype: float64

➡️ Columna: fgm_home
count    65698.000

REVISAR NULOS


In [None]:
#REVISION DE NULOS RESULTANTES

import pandas as pd
import os

path = "data/final/"
files = [ 
    "game_team_clean.csv",
    "game_summary_clean.csv",
    "other_stats_clean.csv",
    "player_enriched_clean.csv",
    "team_clean.csv",
    "game_team_other_stats_clean.csv",
    "game_clean.csv"
]

for f in files:
    df = pd.read_csv(os.path.join(path, f))
    print(f"\n📊 --- {f} ---")
    print("Filas:", len(df))
    print("Porcentaje de nulos por columna:")
    print((df.isna().mean() * 100).round(2).sort_values(ascending=False).head(10000))


📊 --- game_team_clean.csv ---
Filas: 64413
Porcentaje de nulos por columna:
season_id                 0.0
team_id_home              0.0
team_abbreviation_home    0.0
team_name_home            0.0
game_id                   0.0
                         ... 
full_name                 0.0
nickname                  0.0
city                      0.0
state                     0.0
year_founded              0.0
Length: 61, dtype: float64

📊 --- game_summary_clean.csv ---
Filas: 58110
Porcentaje de nulos por columna:
game_date_est             0.0
game_sequence             0.0
game_id                   0.0
game_status_id            0.0
game_status_text          0.0
gamecode                  0.0
home_team_id              0.0
visitor_team_id           0.0
season                    0.0
live_period               0.0
live_period_time_bcast    0.0
dtype: float64

📊 --- other_stats_clean.csv ---
Filas: 28271
Porcentaje de nulos por columna:
game_id                   0.0
team_id_home              0.0
te

In [None]:
print(player_enriched.columns)


Index(['id', 'full_name', 'first_name_x', 'last_name_x', 'is_active',
       'person_id', 'first_name_y', 'last_name_y', 'school', 'country',
       'height', 'weight', 'season_exp', 'jersey', 'position', 'rosterstatus',
       'team_id', 'team_name', 'team_city', 'from_year', 'to_year',
       'dleague_flag', 'nba_flag', 'games_played_flag', 'draft_year',
       'draft_round', 'draft_number', 'greatest_75_flag', 'edad',
       'años_jugando', 'undrafted'],
      dtype='object')
