In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

def extraction_webscrapping(url, output_file, header_tag, keep_columns=None, max_rows=50):
    """Scrape the first HTML table at ``url`` and save it as ``csv/<output_file>``.

    Args:
        url: Address of the page containing the table.
        output_file: File name (relative to the ``csv`` directory) to write.
        header_tag: Tag name of the header cells (e.g. ``'th'``).
        keep_columns: Optional list of column names to keep, in that order.
            When falsy, all scraped columns are kept.
        max_rows: Maximum number of leading rows to keep; ``None`` keeps all.

    Raises:
        requests.HTTPError: If the server does not answer with status 200.
        ValueError: If the page has no ``<table>`` or no header cells.
        KeyError: If ``keep_columns`` names a column that was not scraped.
    """
    # A timeout keeps a stalled server from hanging the whole notebook run.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        print(f"Failed to fetch the webpage: {response.status_code}")
        # Stop here: parsing an error page would either crash obscurely or
        # silently write garbage over a previously scraped CSV.
        raise requests.HTTPError(
            f"Failed to fetch {url}: HTTP {response.status_code}", response=response
        )
    print("Successfully fetched the webpage.")

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    if table is None:
        raise ValueError(f"No <table> element found at {url}")

    # Fall back to the first row / the whole table when the markup omits
    # explicit <thead> / <tbody> wrappers.
    head = table.find('thead')
    if head is None:
        head = table.find('tr')
    headers = [] if head is None else [th.text.strip() for th in head.find_all(header_tag)]
    if not headers:
        raise ValueError(f"No <{header_tag}> header cells found in the table at {url}")
    print(f"Headers found: {headers}")

    body = table.find('tbody')
    if body is None:
        body = table

    rows = []
    for tr in body.find_all('tr'):
        cells = tr.find_all('td')
        if not cells:
            # Header-only or spacer rows carry no data.
            continue
        rows.append([cell.text.strip() for cell in cells])

    df = pd.DataFrame(rows, columns=headers)

    if keep_columns:
        print(f"Filtering to keep columns: {keep_columns}")
        df = df[keep_columns]

    # Keep only the leading rows (the top players as ranked by the page).
    if max_rows is not None:
        df = df.head(max_rows)

    # Drop unnecessary columns; reassign instead of mutating a possible view.
    df = df.drop(columns=['Team', 'Pos'], errors='ignore')

    output_path = os.path.join('csv', output_file)
    # to_csv does not create missing directories.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"Data successfully scraped and saved to {output_file}")

# Scrape every required stat page in turn; each call keeps only the
# 'Player' and 'Value' columns and the function caps the result at 50 rows.
for page_url, csv_name in [
    ("https://www.teamrankings.com/nba/player-stat/points", 'player-stat_2024.csv'),
    ("https://www.teamrankings.com/nba/player-stat/assists", 'assists_2024.csv'),
    ("https://www.teamrankings.com/nba/player-stat/rebounds-offensive", 'rebounds-offensive.csv'),
    ("https://www.teamrankings.com/nba/player-stat/rebounds-defensive", 'rebounds-defensive.csv'),
    ("https://www.teamrankings.com/nba/player-stat/blocks", 'blocks.csv'),
    ("https://www.teamrankings.com/nba/player-stat/steals", 'steals.csv'),
]:
    extraction_webscrapping(page_url, csv_name, 'th', ['Player', 'Value'])

In [None]:
import pandas as pd
import os

def load_and_extract_all(csv_file, value_column, col_rename):
    """Read ``csv/<csv_file>`` and return it with one column renamed.

    Args:
        csv_file: File name inside the ``csv`` directory.
        value_column: Name of the column to rename; if it is absent the
            frame is returned with its columns unchanged.
        col_rename: New name for ``value_column``.

    Returns:
        The loaded DataFrame with ``value_column`` renamed to ``col_rename``.
    """
    csv_path = os.path.join('csv', csv_file)
    return pd.read_csv(csv_path).rename(columns={value_column: col_rename})

# Load each scraped stat file, renaming its generic 'Value' column to the
# stat's abbreviation. Files are read in the order the names are listed.
df_assists, df_player_stats, df_OR, df_DRB, df_BLK, df_STL = (
    load_and_extract_all(stat_file, 'Value', stat_abbr)
    for stat_file, stat_abbr in (
        ('assists_2024.csv', 'AST'),
        ('player-stat_2024.csv', 'PTS'),
        ('rebounds-offensive.csv', 'ORB'),
        ('rebounds-defensive.csv', 'DRB'),
        ('blocks.csv', 'BLK'),
        ('steals.csv', 'STL'),
    )
)

# Normalise player names so every frame uses the same 'Player' values
def ensure_consistent_names(df):
    """Strip surrounding whitespace from the 'Player' column.

    The column is overwritten on the frame that was passed in, and that same
    frame object is returned.
    """
    trimmed_names = df['Player'].str.strip()
    df['Player'] = trimmed_names
    return df

# Run the name clean-up over every stat frame, rebinding each name to the result
df_assists, df_player_stats, df_OR, df_DRB, df_BLK, df_STL = map(
    ensure_consistent_names,
    (df_assists, df_player_stats, df_OR, df_DRB, df_BLK, df_STL),
)

def _rank_by_mean_stat(stat_frames, stat_columns, per_column, top_n=50):
    """Outer-join per-stat frames on 'Player' and rank by the mean of their stats.

    Args:
        stat_frames: Non-empty list of DataFrames, each with a 'Player' column.
        stat_columns: Columns (one per frame) to average.
        per_column: Name of the score column to add.
        top_n: Number of highest-scoring players to return.

    Returns:
        The merged frame restricted to players that have every stat, sorted by
        ``per_column`` in descending order and truncated to ``top_n`` rows.
    """
    merged = stat_frames[0]
    for frame in stat_frames[1:]:
        merged = merged.merge(frame, on="Player", how='outer')

    # skipna=False makes any missing stat yield NaN, so incomplete players are
    # removed by the dropna below. Unlike a row-wise apply on the filtered
    # frame, this also works when no player has all of the stats.
    total = merged[stat_columns].sum(axis=1, skipna=False)
    merged[per_column] = (total / len(stat_columns)).round(1)

    return (
        merged
        .dropna(subset=[per_column])
        .sort_values(by=per_column, ascending=False)
        .head(top_n)
    )


# Offensive PER: unweighted mean of points, assists and offensive rebounds,
# computed only for players present in all three stat files; keep the top 50.
df_combined_offense = _rank_by_mean_stat(
    [df_player_stats, df_assists, df_OR], ['PTS', 'AST', 'ORB'], 'O_PER'
)
df_combined_offense.to_csv('nba_top_50_offensive_per.csv', index=False)
print("Top 50 Offensive PER data saved to 'nba_top_50_offensive_per.csv'.")

# Defensive PER: unweighted mean of defensive rebounds, blocks and steals,
# computed only for players present in all three stat files; keep the top 50.
df_combined_defense = _rank_by_mean_stat(
    [df_DRB, df_BLK, df_STL], ['DRB', 'BLK', 'STL'], 'D_PER'
)
df_combined_defense.to_csv('nba_top_50_defensive_per.csv', index=False)
print("Top 50 Defensive PER data saved to 'nba_top_50_defensive_per.csv'.")