In [1]:
# !pip install selenium
# !pip install chromedriver-py==94.0.4606.41
# !pip install requests

# libraries
import os
import pandas as pd
import shutil
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup as bs
from chromedriver_py import binary_path
import requests
import warnings; warnings.filterwarnings("ignore")

# unhide all rows and columns
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

# 1. Ranking of top 15 players in principal stats

In [3]:
years = list(range(1991,2025))

In [2]:
file_path = "raw_data/players.csv"
if os.path.exists(file_path):
    players = pd.read_csv(file_path)

multi_team_players = players[players["Team"].isin(["2TM", "3TM", "4TM", "5TM", "6TM", "7TM", "8TM", "9TM", "10TM"])]

players = players[~players["Team"].isin(["2TM", "3TM", "4TM", "5TM", "6TM", "7TM", "8TM", "9TM", "10TM"])]

for _, player in multi_team_players.iterrows():
    player_name = player["Player"]
    player_year = player["Year"]

    # Filtrar las filas del mismo jugador y año
    player_rows = players[(players["Player"] == player_name) & (players["Year"] == player_year)]
    
    if not player_rows.empty:
        # Obtener el último equipo en el que jugó
        last_team = player_rows.iloc[-1]["Team"]

        # Sumar las estadísticas del jugador en toda la temporada
        total_stats = player_rows.iloc[:, 5:].sum(numeric_only=True)  # Sumar estadísticas numéricas
        total_stats["Player"] = player_name
        total_stats["Team"] = last_team
        total_stats["Year"] = player_year

        # Reemplazar las filas anteriores con la fila consolidada
        players = players[players["Player"] != player_name]  # Eliminar duplicados
        players = pd.concat([players, pd.DataFrame([total_stats])], ignore_index=True)

# Convert the 'G' column to numeric, forcing errors to NaN and then dropping them
players["G"] = pd.to_numeric(players["G"], errors='coerce')
players = players.dropna(subset=["G"])

# Convert 'G' column to integer
players["G"] = players["G"].astype(int)

players = players[players["G"] >= 50]

In [4]:
players

Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards,Year
0,1,Michael Jordan,27,CHI,SG,82,82,37.0,12.1,22.4,.539,0.4,1.1,.312,11.7,21.3,.551,.547,7.0,8.2,.851,1.4,4.6,6.0,5.5,2.7,1.0,2.5,2.8,31.5,"MVP-1,DPOY-7,AS,NBA1",1991
1,2,Karl Malone,27,UTA,PF,82,82,40.3,10.3,19.6,.527,0.0,0.2,.286,10.3,19.4,.529,.528,8.3,10.8,.770,2.9,8.9,11.8,3.3,1.1,1.0,3.0,3.3,29.0,"MVP-5,AS,NBA1",1991
2,3,Bernard King,34,WSB,SF,64,64,37.5,11.1,23.6,.472,0.1,0.6,.216,11.0,23.0,.478,.475,6.0,7.6,.790,1.8,3.2,5.0,4.6,0.9,0.3,4.0,2.9,28.4,"MVP-16,AS,NBA3",1991
3,4,Charles Barkley,27,PHI,SF,67,67,37.3,9.9,17.4,.570,0.7,2.3,.284,9.3,15.1,.614,.589,7.1,9.8,.722,3.9,6.3,10.1,4.2,1.6,0.5,3.1,2.6,27.6,"MVP-4,AS,NBA1",1991
4,5,Patrick Ewing,28,NYK,C,81,81,38.3,10.4,20.3,.514,0.0,0.1,.000,10.4,20.2,.516,.514,5.7,7.7,.745,2.4,8.8,11.2,3.0,1.0,3.2,3.6,3.5,26.6,"MVP-11,DPOY-7,AS,NBA2",1991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7090,586,Zeke Nnaji,23,DEN,PF,58,0,9.9,1.2,2.6,.463,0.1,0.4,.261,1.1,2.2,.500,.483,0.7,1.1,.677,1.1,1.1,2.2,0.6,0.3,0.7,0.5,1.4,3.2,,2024
7091,588,JT Thor,21,CHO,PF,63,3,12.4,1.3,2.9,.437,0.4,1.3,.346,0.8,1.6,.510,.514,0.2,0.3,.550,0.7,1.6,2.3,0.5,0.2,0.4,0.2,1.0,3.2,,2024
7093,591,Dalen Terry,21,CHI,SG,59,2,11.5,1.2,2.7,.439,0.3,1.3,.230,0.9,1.4,.627,.494,0.4,0.7,.581,0.5,1.4,1.9,1.4,0.5,0.3,0.5,1.4,3.1,,2024
7095,597,Johnny Davis,21,WAS,SG,50,6,12.3,1.2,3.1,.403,0.3,0.8,.350,1.0,2.3,.421,.448,0.3,0.5,.583,0.4,1.1,1.4,0.6,0.4,0.2,0.3,1.1,3.0,,2024


In [5]:
# Columnas de estadísticas a rankear
stats_to_rank = ["PTS", "AST", "DRB", "ORB", "FT", "STL", "BLK", "3P"]

# Lista para almacenar los rankings en un DataFrame
ranking_list = []

for year in years:
    for stat in stats_to_rank:
        top_players = players[players["Year"] == year][["Player", "Team", "Year", stat]].sort_values(by=stat, ascending=False).head(15)
        
        for _, row in top_players.iterrows():
            ranking_list.append({
                "Player": row["Player"],
                "Year": row["Year"],
                "Team": row["Team"],
                "Stat": stat,
                "Value-Stat": row[stat]
            })

# Convertir el ranking a DataFrame
ranking_df = pd.DataFrame(ranking_list)

# Guardar en CSV
output_path = "procesed_data/top15_rankings.csv"
ranking_df.to_csv(output_path, index=False)

print(f"Rankings guardados en {output_path}")

Rankings guardados en procesed_data/top15_rankings.csv


# 2. Accounting players in the rank by team

In [7]:
file_path = "procesed_data/top15_rankings.csv"  # Asegúrate de usar el archivo correcto
df = pd.read_csv(file_path)

# Crear un diccionario para contar cuántos jugadores de cada equipo aparecen en cada estadística
team_year_stats = {}

# Iterar sobre las filas del DataFrame de rankings
for _, row in df.iterrows():
    team = row["Team"]
    year = row["Year"]
    stat = row["Stat"]

    # Clave única por equipo y año
    key = (team, year)

    # Si el equipo/año no está en el diccionario, lo inicializamos
    if key not in team_year_stats:
        team_year_stats[key] = {f"top15_{s}": 0 for s in df["Stat"].unique()}  # Inicializar todas las estadísticas en 0
        team_year_stats[key]["TEAM"] = team
        team_year_stats[key]["YEAR"] = year

    # Incrementar el conteo de la estadística correspondiente
    team_year_stats[key][f"top15_{stat}"] += 1

# Convertir el diccionario en un DataFrame
accounting_df = pd.DataFrame(team_year_stats.values())

# Guardar en CSV
output_path = "procesed_data/top15_rankings_by_team..csv"
accounting_df.to_csv(output_path, index=False)

print(f"Accounting guardado en {output_path}")

Accounting guardado en procesed_data/top15_rankings_by_team..csv
