In [None]:
# Function to fetch and parse data (existing code)
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

def extraction_webscrapping(url, output_file, header_tag, keep_columns=None):
    """Scrape the first HTML table found at *url* and save it as a CSV file.

    The CSV is written to ``csv/<output_file>``; the ``csv`` directory is
    created when it does not exist yet.

    Args:
        url: Address of the page to download.
        output_file: File name of the CSV to create inside ``csv``.
        header_tag: Tag name of the header cells (for example ``'th'``).
        keep_columns: Optional list of column names to keep. All columns
            are kept when it is omitted or empty.

    Returns:
        None. Progress and failures are reported with ``print``. When the
        page cannot be fetched or contains no usable table, nothing is
        written and the function returns early.
    """
    # Without a timeout a stalled connection would block the run forever.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        print(f"Failed to fetch the webpage: {response.status_code}")
        # Do not try to parse an error page as if it were the stats table.
        return
    print("Successfully fetched the webpage.")

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    if table is None:
        print(f"No table found at {url}")
        return

    # Prefer <thead>; fall back to the first row for tables without one.
    header_scope = table.find('thead')
    if header_scope is None:
        header_scope = table.find('tr')
    if header_scope is None:
        print(f"Table at {url} has no rows")
        return
    headers = [th.text.strip() for th in header_scope.find_all(header_tag)]
    print(f"Headers found: {headers}")

    # html.parser does not insert a missing <tbody>, so fall back to the
    # table itself when the page omits it.
    row_scope = table.find('tbody')
    if row_scope is None:
        row_scope = table

    rows = []
    for tr in row_scope.find_all('tr'):
        cells = tr.find_all('td')
        if not cells:
            # Header-only or empty rows carry no data; skipping them avoids
            # all-empty records in the output.
            continue
        rows.append([cell.text.strip() for cell in cells])

    df = pd.DataFrame(rows, columns=headers)

    if keep_columns:
        print(f"Filtering to keep columns: {keep_columns}")
        df = df[keep_columns]

    # Make sure the target directory exists before writing into it.
    os.makedirs('csv', exist_ok=True)
    output_path = os.path.join('csv', output_file)
    df.to_csv(output_path, index=False)
    print(f"Data successfully scraped and saved to {output_file}")

# Download every stat page and store its table under csv/<file name>.
# Each entry pairs the page URL with the CSV file name it is saved as.
for stat_url, stat_csv in (
    ("https://www.teamrankings.com/nba/player-stat/points", 'player-stat_2024.csv'),
    ("https://www.teamrankings.com/nba/player-stat/assists", 'assists_2024.csv'),
    ("https://www.teamrankings.com/nba/player-stat/rebounds-offensive", 'rebounds-offensive.csv'),
    ("https://www.teamrankings.com/nba/player-stat/rebounds-defensive", 'rebounds-defensive.csv'),
    ("https://www.teamrankings.com/nba/player-stat/blocks", 'blocks.csv'),
    ("https://www.teamrankings.com/nba/player-stat/steals", 'steals.csv'),
):
    extraction_webscrapping(stat_url, stat_csv, 'th')

# Ensure only top 50 players are considered
def load_and_extract_top_50(csv_file, value_column, col_rename):
    """Load ``csv/<csv_file>`` and return its first 50 rows with one column renamed.

    Rows are taken in file order; no sorting is applied here.

    Args:
        csv_file: File name of the CSV inside the ``csv`` directory.
        value_column: Name of the column to rename.
        col_rename: New name given to ``value_column``.

    Returns:
        A DataFrame holding at most 50 rows.
    """
    source_path = os.path.join('csv', csv_file)
    first_fifty = pd.read_csv(source_path).head(50)
    return first_fifty.rename(columns={value_column: col_rename})

# Load the first 50 rows of every scraped stat file, renaming the generic
# 'Value' column to the abbreviation of the stat it holds.
df_assists = load_and_extract_top_50(
    csv_file='assists_2024.csv', value_column='Value', col_rename='AST')
df_player_stats = load_and_extract_top_50(
    csv_file='player-stat_2024.csv', value_column='Value', col_rename='PTS')
df_OR = load_and_extract_top_50(
    csv_file='rebounds-offensive.csv', value_column='Value', col_rename='ORB')
df_DRB = load_and_extract_top_50(
    csv_file='rebounds-defensive.csv', value_column='Value', col_rename='DRB')
df_BLK = load_and_extract_top_50(
    csv_file='blocks.csv', value_column='Value', col_rename='BLK')
df_STL = load_and_extract_top_50(
    csv_file='steals.csv', value_column='Value', col_rename='STL')

# Merge the DataFrames and calculate PER
# An inner join keeps only players present in all three stat tables; the
# dropna additionally removes rows whose stat value itself is missing.
df_combined = df_assists.merge(df_player_stats, on=["Player"], how='inner')
df_combined = df_combined.merge(df_OR, on=["Player"], how='inner')
df_combined = df_combined.dropna(subset=["PTS", "AST", "ORB"])  # Ensure only rows with complete data

# Calculate Offensive PER (plain mean of the three stats, one decimal)
df_combined['O_PER'] = ((df_combined['PTS'] + df_combined['AST'] + df_combined['ORB']) / 3).round(1)
# Rank by O_PER before truncating; otherwise head(50) keeps whichever rows
# happen to come first in merge order rather than the best 50.
df_combined = (
    df_combined.sort_values('O_PER', ascending=False, kind='stable')
    .head(50)  # Retain top 50
    .reset_index(drop=True)
)
df_combined.to_csv('nba_top_50_offensive_per.csv', index=False)
print("Top 50 Offensive PER data saved to 'nba_top_50_offensive_per.csv'.")

# For Defensive PER
# An inner join keeps only players present in all three stat tables; the
# dropna additionally removes rows whose stat value itself is missing.
df_defense = df_BLK.merge(df_DRB, on=["Player"], how='inner')
df_defense = df_defense.merge(df_STL, on=["Player"], how='inner')
df_defense = df_defense.dropna(subset=["BLK", "DRB", "STL"])  # Ensure only rows with complete data

# Calculate Defensive PER (plain mean of the three stats, one decimal)
df_defense['D_PER'] = ((df_defense['BLK'] + df_defense['DRB'] + df_defense['STL']) / 3).round(1)
# Rank by D_PER before truncating; otherwise head(50) keeps whichever rows
# happen to come first in merge order rather than the best 50.
df_defense = (
    df_defense.sort_values('D_PER', ascending=False, kind='stable')
    .head(50)  # Retain top 50
    .reset_index(drop=True)
)
df_defense.to_csv('nba_top_50_defensive_per.csv', index=False)
print("Top 50 Defensive PER data saved to 'nba_top_50_defensive_per.csv'.")