In [1]:
import sys
import os

# Make the project package importable by putting its directory on sys.path.
# The directory is resolved as <parent of the current working directory>/project.
project_root = os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), 'project'))
# Guard the insert so that re-running this cell does not stack duplicate
# entries on sys.path.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Import extraction functions:
from data_extraction.nba import extract_nba_player_salaries, extract_nba_team_salaries
from data_extraction.wnba import extract_wnba_player_salaries, extract_wnba_team_salaries

# Run every salary extraction in a fixed order: NBA players, NBA teams,
# WNBA players, WNBA teams. Each entry pairs an extraction function with the
# source URL and the output CSV filename it is called with.
salary_extractions = [
    (
        extract_nba_player_salaries,
        "https://hoopshype.com/salaries/players/2023-2024/",
        'nba_player_salaries_2024.csv',
    ),
    (
        extract_nba_team_salaries,
        "https://hoopshype.com/salaries/2023-2024/",
        'nba_team_salaries_2024.csv',
    ),
    (
        extract_wnba_player_salaries,
        "https://herhoopstats.com/salary-cap-sheet/wnba/players/salary_2024/stats_2024/",
        'wnba_player_salaries_2024.csv',
    ),
    (
        extract_wnba_team_salaries,
        "https://herhoopstats.com/salary-cap-sheet/wnba/summary/2024/",
        'wnba_team_salaries_2024.csv',
    ),
]

for run_extraction, source_url, output_csv in salary_extractions:
    run_extraction(source_url, output_csv)

Successfully fetched the webpage: https://hoopshype.com/salaries/players/2023-2024/
Headers found: ['', 'Player', '2023/24', '2023/24(*)']
Data successfully scraped and saved to nba_player_salaries_2024.csv
Successfully fetched the webpage: https://hoopshype.com/salaries/2023-2024/
Headers found: ['', 'Team', '2023/24', '2023/24(*)']
Data successfully scraped and saved to nba_team_salaries_2024.csv
Successfully fetched the webpage: https://herhoopstats.com/salary-cap-sheet/wnba/players/salary_2024/stats_2024/
Data successfully scraped and saved to wnba_player_salaries_2024.csv
Successfully fetched the webpage: https://herhoopstats.com/salary-cap-sheet/wnba/summary/2024/
Headers found: ['Team', 'Total Salaries', 'Total Players', 'Cap Room', 'Guaranteed Salaries']
Filtering to keep columns: ['Team', 'Total Salaries']
Data successfully scraped and saved to wnba_team_salaries_2024.csv


In [2]:
# Placeholder (currently disabled): clean the WNBA player salary CSV by
# converting the '2024 Salary' column to numeric and de-duplicating players.
# The code below is wrapped in a triple-quoted string, so none of it executes;
# as the last expression in the cell, the string is only echoed as cell output.

"""import pandas as pd

# Loading CSV:
df = pd.read_csv('wnba_player_salaries_2024.csv')

# Convert Salary column to numeric:
df = df[df['2024 Salary'].str.startswith('$', na=False)]
df['2024 Salary'] = df['2024 Salary'].replace(r'[\$,]', '', regex=True).astype(float)

# Remove duplicates, keep highest salary:
df_cleaned = df.sort_values('2024 Salary', ascending=False).drop_duplicates(subset=['Player'], keep='first')

# Created new "cleaned_" CSV:
df_cleaned.to_csv('cleaned_wnba_player_salaries_2024.csv', index=False)
print("Cleaned data saved to 'cleaned_wnba_player_salaries_2024.csv'.")"""

  """import pandas as pd


'import pandas as pd\n\n# Loading CSV:\ndf = pd.read_csv(\'wnba_player_salaries_2024.csv\')\n\n# Convert Salary column to numeric:\ndf = df[df[\'2024 Salary\'].str.startswith(\'$\', na=False)]\ndf[\'2024 Salary\'] = df[\'2024 Salary\'].replace(r\'[\\$,]\', \'\', regex=True).astype(float)\n\n# Remove duplicates, keep highest salary:\ndf_cleaned = df.sort_values(\'2024 Salary\', ascending=False).drop_duplicates(subset=[\'Player\'], keep=\'first\')\n\n# Created new "cleaned_" CSV:\ndf_cleaned.to_csv(\'cleaned_wnba_player_salaries_2024.csv\', index=False)\nprint("Cleaned data saved to \'cleaned_wnba_player_salaries_2024.csv\'.")'

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the Her Hoop Stats 2024 WNBA player salary/stats sheet. The offensive
# columns used below (PTS, AST, ORB) are scraped from the first table on this page.
offensive_url = "https://herhoopstats.com/salary-cap-sheet/wnba/players/salary_2024/stats_2024/"

# Function to fetch and parse data
def fetch_and_parse(url, timeout=30):
    """Download ``url`` and return its body parsed as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host, which would hang the notebook cell.

    Returns:
        A ``BeautifulSoup`` tree built from the response content with the
        'html.parser' backend.

    Raises:
        Exception: If the response status code is not 200.
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` on connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Check the status first so the success message is only printed for a
    # response that will actually be parsed.
    if response.status_code != 200:
        raise Exception(f"Failed to fetch the webpage: {response.status_code}")
    print(f"Successfully fetched the webpage: {url}")
    return BeautifulSoup(response.content, 'html.parser')

# Function to extract relevant table data and clean it
def extract_and_clean_relevant_data(soup, relevant_columns):
    """Pull the requested columns out of the first HTML table in ``soup``.

    Header names are taken from the ``<th>`` cells of the table's ``<thead>``
    and matched (after whitespace stripping) against ``relevant_columns``.
    Every ``<tbody>`` row with at least ``len(relevant_columns)`` ``<td>``
    cells contributes one record; only the first line of each cell's text is
    kept. All values are returned as scraped strings (no numeric conversion).

    Args:
        soup: Parsed document exposing ``find`` (BeautifulSoup-style).
        relevant_columns: Header names to extract, in the desired column order.

    Returns:
        A ``pandas.DataFrame`` with exactly ``relevant_columns`` as columns.
        A value is ``None`` when a row is too short to reach that column.

    Raises:
        ValueError: If the document has no ``<table>``, or the table lacks a
            ``<thead>`` or ``<tbody>``.
        KeyError: If any requested column is absent from the table headers.
    """
    table = soup.find('table')
    if table is None:
        raise ValueError("No <table> element found in the parsed page")

    thead = table.find('thead')
    tbody = table.find('tbody')
    if thead is None or tbody is None:
        raise ValueError("Table is missing its <thead> or <tbody> section")

    headers = [th.text.strip() for th in thead.find_all('th')]
    columns_indices = {header: index for index, header in enumerate(headers) if header in relevant_columns}

    # Fail fast with a descriptive message instead of a bare KeyError raised
    # from inside the row loop (or no error at all when no row qualifies).
    missing = [header for header in relevant_columns if header not in columns_indices]
    if missing:
        raise KeyError(f"Columns not found in table headers: {missing}")

    rows = []
    for tr in tbody.find_all('tr'):
        cells = tr.find_all('td')
        if len(cells) < len(relevant_columns):
            continue  # Skip rows with incomplete data
        row = {}
        for header in relevant_columns:
            index = columns_indices[header]
            if index < len(cells):
                # Cells may carry extra lines of text; keep only the first one.
                row[header] = cells[index].text.strip().split('\n')[0]
            else:
                row[header] = None
        rows.append(row)

    return pd.DataFrame(rows, columns=relevant_columns)

# Scrape the stats page and keep only the player name plus the offensive stats.
relevant_columns_offensive = ["Player", "PTS", "AST", "ORB"]
soup_offensive = fetch_and_parse(offensive_url)
offensive_df = extract_and_clean_relevant_data(soup_offensive, relevant_columns_offensive)

print("Initial DataFrame after extracting relevant columns (Offensive):")
print(offensive_df.head())

# The scraped values are text: coerce each stat to a number and discard any
# row where a stat is missing or not numeric.
offensive_df = (
    offensive_df
    .dropna(subset=["PTS", "AST", "ORB"])
    .assign(
        PTS=lambda frame: pd.to_numeric(frame["PTS"], errors="coerce"),
        AST=lambda frame: pd.to_numeric(frame["AST"], errors="coerce"),
        ORB=lambda frame: pd.to_numeric(frame["ORB"], errors="coerce"),
    )
    .dropna(subset=["PTS", "AST", "ORB"])
)

# Take the 50 highest scorers and attach O-PER, defined here as the mean of
# PTS, AST and ORB rounded to one decimal place; keep only the final columns.
final_columns_offensive = ["Player", "PTS", "AST", "ORB", "O-PER"]
top_50_offensive_df = (
    offensive_df
    .nlargest(50, "PTS")
    .assign(**{
        "O-PER": lambda frame: ((frame["PTS"] + frame["AST"] + frame["ORB"]) / 3).round(1)
    })[final_columns_offensive]
)

# Persist the result and show a preview.
top_50_offensive_df.to_csv("wnba_top_50_offensive_per.csv", index=False)
print("Top 50 Offensive data with O-PER saved to 'wnba_top_50_offensive_per.csv'")

print("Final Cleaned Offensive DataFrame with O-PER:")
print(top_50_offensive_df.head())

Successfully fetched the webpage: https://herhoopstats.com/salary-cap-sheet/wnba/players/salary_2024/stats_2024/
Initial DataFrame after extracting relevant columns (Offensive):
             Player   PTS  AST  ORB
0       A'ja Wilson  26.9  2.3  2.1
1  Arike Ogunbowale  22.2  5.1  0.8
2    Kahleah Copper  21.1  2.3  0.6
3   Breanna Stewart  20.4  3.5  1.6
4  Napheesa Collier  20.4  3.4  2.2
Top 50 Offensive data with O-PER saved to 'wnba_top_50_offensive_per.csv'
Final Cleaned Offensive DataFrame with O-PER:
             Player   PTS  AST  ORB  O-PER
0       A'ja Wilson  26.9  2.3  2.1   10.4
1  Arike Ogunbowale  22.2  5.1  0.8    9.4
2    Kahleah Copper  21.1  2.3  0.6    8.0
3   Breanna Stewart  20.4  3.5  1.6    8.5
4  Napheesa Collier  20.4  3.4  2.2    8.7


In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the Her Hoop Stats 2024 WNBA player salary/stats sheet. The defensive
# columns used below (DRB, BLK, STL) are scraped from the first table on this page.
defensive_url = "https://herhoopstats.com/salary-cap-sheet/wnba/players/salary_2024/stats_2024/"

# Function to fetch and parse data
def fetch_and_parse(url, timeout=30):
    """Download ``url`` and return its body parsed as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host, which would hang the notebook cell.

    Returns:
        A ``BeautifulSoup`` tree built from the response content with the
        'html.parser' backend.

    Raises:
        Exception: If the response status code is not 200.
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` on connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Check the status first so the success message is only printed for a
    # response that will actually be parsed.
    if response.status_code != 200:
        raise Exception(f"Failed to fetch the webpage: {response.status_code}")
    print(f"Successfully fetched the webpage: {url}")
    return BeautifulSoup(response.content, 'html.parser')

# Function to extract relevant table data and clean it
def extract_and_clean_relevant_data(soup, relevant_columns):
    """Pull the requested columns out of the first HTML table in ``soup``.

    Header names are taken from the ``<th>`` cells of the table's ``<thead>``
    and matched (after whitespace stripping) against ``relevant_columns``.
    Every ``<tbody>`` row with at least ``len(relevant_columns)`` ``<td>``
    cells contributes one record; only the first line of each cell's text is
    kept. All values are returned as scraped strings (no numeric conversion).

    Args:
        soup: Parsed document exposing ``find`` (BeautifulSoup-style).
        relevant_columns: Header names to extract, in the desired column order.

    Returns:
        A ``pandas.DataFrame`` with exactly ``relevant_columns`` as columns.
        A value is ``None`` when a row is too short to reach that column.

    Raises:
        ValueError: If the document has no ``<table>``, or the table lacks a
            ``<thead>`` or ``<tbody>``.
        KeyError: If any requested column is absent from the table headers.
    """
    table = soup.find('table')
    if table is None:
        raise ValueError("No <table> element found in the parsed page")

    thead = table.find('thead')
    tbody = table.find('tbody')
    if thead is None or tbody is None:
        raise ValueError("Table is missing its <thead> or <tbody> section")

    headers = [th.text.strip() for th in thead.find_all('th')]
    columns_indices = {header: index for index, header in enumerate(headers) if header in relevant_columns}

    # Fail fast with a descriptive message instead of a bare KeyError raised
    # from inside the row loop (or no error at all when no row qualifies).
    missing = [header for header in relevant_columns if header not in columns_indices]
    if missing:
        raise KeyError(f"Columns not found in table headers: {missing}")

    rows = []
    for tr in tbody.find_all('tr'):
        cells = tr.find_all('td')
        if len(cells) < len(relevant_columns):
            continue  # Skip rows with incomplete data
        row = {}
        for header in relevant_columns:
            index = columns_indices[header]
            if index < len(cells):
                # Cells may carry extra lines of text; keep only the first one.
                row[header] = cells[index].text.strip().split('\n')[0]
            else:
                row[header] = None
        rows.append(row)

    return pd.DataFrame(rows, columns=relevant_columns)

# Scrape the stats page and keep only the player name plus the defensive stats.
relevant_columns_defensive = ["Player", "DRB", "BLK", "STL"]
soup_defensive = fetch_and_parse(defensive_url)
defensive_df = extract_and_clean_relevant_data(soup_defensive, relevant_columns_defensive)

print("Initial DataFrame after extracting relevant columns (Defensive):")
print(defensive_df.head())

# The scraped values are text: coerce each stat to a number and discard any
# row where a stat is missing or not numeric.
defensive_df = (
    defensive_df
    .dropna(subset=["DRB", "BLK", "STL"])
    .assign(
        DRB=lambda frame: pd.to_numeric(frame["DRB"], errors="coerce"),
        BLK=lambda frame: pd.to_numeric(frame["BLK"], errors="coerce"),
        STL=lambda frame: pd.to_numeric(frame["STL"], errors="coerce"),
    )
    .dropna(subset=["DRB", "BLK", "STL"])
)

# Take the 50 best defensive rebounders and attach D-PER, defined here as the
# mean of DRB, BLK and STL rounded to one decimal place; keep only the final columns.
final_columns_defensive = ["Player", "DRB", "BLK", "STL", "D-PER"]
top_50_defensive_df = (
    defensive_df
    .nlargest(50, "DRB")
    .assign(**{
        "D-PER": lambda frame: ((frame["DRB"] + frame["BLK"] + frame["STL"]) / 3).round(1)
    })[final_columns_defensive]
)

# Persist the result and show a preview.
top_50_defensive_df.to_csv("wnba_top_50_defensive_per.csv", index=False)
print("Top 50 Defensive data with D-PER saved to 'wnba_top_50_defensive_per.csv'")

print("Final Cleaned Defensive DataFrame with D-PER:")
print(top_50_defensive_df.head())

Successfully fetched the webpage: https://herhoopstats.com/salary-cap-sheet/wnba/players/salary_2024/stats_2024/
Initial DataFrame after extracting relevant columns (Defensive):
             Player  DRB  BLK  STL
0       A'ja Wilson  9.8  2.6  1.8
1  Arike Ogunbowale  3.8  0.3  2.1
2    Kahleah Copper  3.9  0.1  0.8
3   Breanna Stewart  6.9  1.3  1.7
4  Napheesa Collier  7.5  1.4  1.9
Top 50 Defensive data with D-PER saved to 'wnba_top_50_defensive_per.csv'
Final Cleaned Defensive DataFrame with D-PER:
              Player  DRB  BLK  STL  D-PER
0        A'ja Wilson  9.8  2.6  1.8    4.7
29       Angel Reese  8.1  0.5  1.3    3.3
15     Dearica Hamby  7.7  0.2  1.7    3.2
4   Napheesa Collier  7.5  1.4  1.9    3.6
26     Jonquel Jones  7.3  1.3  0.8    3.1
