In [27]:
import pandas as pd

# 1️⃣ Cargar archivos
games = pd.read_csv("../data/Games.csv", parse_dates=["gameDate"])
team_stats = pd.read_csv("../data/TeamStatistics.csv", parse_dates=["gameDate"])

# 2️⃣ Filtrar juegos desde 2015
games = games[games["gameDate"].dt.year >= 2015]
team_stats = team_stats[team_stats["gameDate"].dt.year >= 2015]

# 3️⃣ Mantener solo columnas importantes
games = games[["gameId", "gameDate", "hometeamName", "awayteamName", "homeScore", "awayScore"]]
team_stats = team_stats[[
    "gameId", "gameDate", "teamName", "opponentTeamName", "home", "teamScore", "opponentScore",
    "assists", "reboundsTotal", "steals", "blocks", "turnovers", "fieldGoalsPercentage",
    "threePointersPercentage", "freeThrowsPercentage", "seasonWins", "seasonLosses"
]]

# 4️⃣ Crear etiqueta home_win
games["home_win"] = (games["homeScore"] > games["awayScore"]).astype(int)

# 5️⃣ Revisar resultados
print("Games shape:", games.shape)
print("TeamStatistics shape:", team_stats.shape)
print("Primeros juegos:\n", games.tail())


  games = pd.read_csv("../data/Games.csv", parse_dates=["gameDate"])


Games shape: (14414, 7)
TeamStatistics shape: (28828, 17)
Primeros juegos:
          gameId            gameDate  hometeamName awayteamName  homeScore  \
14409  21400487 2015-01-02 19:30:00        Knicks      Pistons         81   
14410  21400484 2015-01-02 19:00:00       Hornets    Cavaliers         87   
14411  21400485 2015-01-02 19:00:00         Magic         Nets         98   
14412  21400482 2015-01-01 20:00:00         Bulls      Nuggets        106   
14413  21400483 2015-01-01 20:00:00  Timberwolves        Kings        107   

       awayScore  home_win  
14409         97         0  
14410         91         0  
14411        100         0  
14412        101         1  
14413        110         0  


In [28]:
# Stats de home team
home_stats = team_stats[team_stats["home"] == 1].copy()
home_stats = home_stats.rename(columns={
    "teamScore": "home_teamScore",
    "opponentScore": "home_opponentScore",
    "assists": "home_assists",
    "reboundsTotal": "home_reboundsTotal",
    "steals": "home_steals",
    "blocks": "home_blocks",
    "turnovers": "home_turnovers",
    "fieldGoalsPercentage": "home_FG%",
    "threePointersPercentage": "home_3P%",
    "freeThrowsPercentage": "home_FT%",
    "seasonWins": "home_seasonWins",
    "seasonLosses": "home_seasonLosses"
})

# Stats de away team
away_stats = team_stats[team_stats["home"] == 0].copy()
away_stats = away_stats.rename(columns={
    "teamScore": "away_teamScore",
    "opponentScore": "away_opponentScore",
    "assists": "away_assists",
    "reboundsTotal": "away_reboundsTotal",
    "steals": "away_steals",
    "blocks": "away_blocks",
    "turnovers": "away_turnovers",
    "fieldGoalsPercentage": "away_FG%",
    "threePointersPercentage": "away_3P%",
    "freeThrowsPercentage": "away_FT%",
    "seasonWins": "away_seasonWins",
    "seasonLosses": "away_seasonLosses"
})


# Unir stats de home team (usando 'inner')
games = games.merge(home_stats[[
    "gameId", "home_teamScore", "home_opponentScore", "home_assists", "home_reboundsTotal",
    "home_steals", "home_blocks", "home_turnovers", "home_FG%", "home_3P%", "home_FT%",
    "home_seasonWins", "home_seasonLosses"
]], on="gameId", how="inner") # <--- CAMBIO AQUÍ

# Unir stats de away team (usando 'inner')
games = games.merge(away_stats[[
    "gameId", "away_teamScore", "away_opponentScore", "away_assists", "away_reboundsTotal",
    "away_steals", "away_blocks", "away_turnovers", "away_FG%", "away_3P%", "away_FT%",
    "away_seasonWins", "away_seasonLosses"
]], on="gameId", how="inner") # <--- CAMBIO AQUÍ

# (Opcional) Verifica que ya no hay NaNs
print(games.isnull().sum())
# Winrate acumulado
games["home_winrate"] = (games["home_seasonWins"] / (games["home_seasonWins"] + games["home_seasonLosses"])).fillna(0)
games["away_winrate"] = (games["away_seasonWins"] / (games["away_seasonWins"] + games["away_seasonLosses"])).fillna(0)

# (Opcional) Revisa el resultado para ver las nuevas columnas
print("Vista previa con las nuevas columnas de winrate:")
display(games.tail())


# Eliminar todas las filas donde falte al menos una de las estadísticas clave
# Nos enfocamos en 'home_seasonWins' ya que es la que tiene más valores nulos
games.dropna(subset=['home_seasonWins', 'home_assists'], inplace=True)

# Verificar que la limpieza fue exitosa
print("Conteo de nulos después de la limpieza:")
print(games.isnull().sum())

print("\nNuevas dimensiones del DataFrame:")
print(games.shape)



gameId                    0
gameDate                  0
hometeamName              0
awayteamName              0
homeScore                 0
awayScore                 0
home_win                  0
home_teamScore            0
home_opponentScore        0
home_assists              2
home_reboundsTotal        2
home_steals               2
home_blocks               2
home_turnovers            2
home_FG%                  2
home_3P%                  2
home_FT%                  2
home_seasonWins       13039
home_seasonLosses     13039
away_teamScore            0
away_opponentScore        0
away_assists              2
away_reboundsTotal        2
away_steals               2
away_blocks               2
away_turnovers            2
away_FG%                  2
away_3P%                  2
away_FT%                  3
away_seasonWins       13039
away_seasonLosses     13039
dtype: int64
Vista previa con las nuevas columnas de winrate:


Unnamed: 0,gameId,gameDate,hometeamName,awayteamName,homeScore,awayScore,home_win,home_teamScore,home_opponentScore,home_assists,...,away_steals,away_blocks,away_turnovers,away_FG%,away_3P%,away_FT%,away_seasonWins,away_seasonLosses,home_winrate,away_winrate
14409,21400487,2015-01-02 19:30:00,Knicks,Pistons,81,97,0,81,97,18.0,...,10.0,3.0,17.0,0.5,0.394,0.571,,,0.0,0.0
14410,21400484,2015-01-02 19:00:00,Hornets,Cavaliers,87,91,0,87,91,20.0,...,8.0,5.0,7.0,0.379,0.267,0.7,,,0.0,0.0
14411,21400485,2015-01-02 19:00:00,Magic,Nets,98,100,0,98,100,22.0,...,7.0,5.0,22.0,0.569,0.35,0.611,,,0.0,0.0
14412,21400482,2015-01-01 20:00:00,Bulls,Nuggets,106,101,1,106,101,22.0,...,4.0,7.0,13.0,0.402,0.357,0.815,,,0.0,0.0
14413,21400483,2015-01-01 20:00:00,Timberwolves,Kings,107,110,0,107,110,22.0,...,7.0,7.0,20.0,0.541,0.417,0.862,,,0.0,0.0


Conteo de nulos después de la limpieza:
gameId                0
gameDate              0
hometeamName          0
awayteamName          0
homeScore             0
awayScore             0
home_win              0
home_teamScore        0
home_opponentScore    0
home_assists          0
home_reboundsTotal    0
home_steals           0
home_blocks           0
home_turnovers        0
home_FG%              0
home_3P%              0
home_FT%              0
home_seasonWins       0
home_seasonLosses     0
away_teamScore        0
away_opponentScore    0
away_assists          0
away_reboundsTotal    0
away_steals           0
away_blocks           0
away_turnovers        0
away_FG%              0
away_3P%              0
away_FT%              0
away_seasonWins       0
away_seasonLosses     0
home_winrate          0
away_winrate          0
dtype: int64

Nuevas dimensiones del DataFrame:
(1373, 33)
