In [1]:
import sys
import os

project_root = os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), 'project'))
sys.path.insert(0, project_root)

from data_extraction.nba import extract_nba_player_salaries, extract_nba_team_salaries
from data_extraction.wnba import extract_wnba_player_salaries, extract_wnba_team_salaries

extract_nba_player_salaries("https://hoopshype.com/salaries/players/2023-2024/", 'nba_player_salaries_2024.csv')
extract_nba_team_salaries("https://hoopshype.com/salaries/2023-2024/", 'nba_team_salaries_2024.csv')
extract_wnba_player_salaries("https://herhoopstats.com/salary-cap-sheet/wnba/players/salary_2024/stats_2024/", 'wnba_player_salaries_2024.csv')
extract_wnba_team_salaries("https://herhoopstats.com/salary-cap-sheet/wnba/summary/2024/", 'wnba_team_salaries_2024.csv')


Successfully fetched the webpage: https://hoopshype.com/salaries/players/2023-2024/
Headers found: ['', 'Player', '2023/24', '2023/24(*)']
Data successfully scraped and saved to nba_player_salaries_2024.csv
Successfully fetched the webpage: https://hoopshype.com/salaries/2023-2024/
Headers found: ['', 'Team', '2023/24', '2023/24(*)']
Data successfully scraped and saved to nba_team_salaries_2024.csv
Successfully fetched the webpage: https://herhoopstats.com/salary-cap-sheet/wnba/players/salary_2024/stats_2024/
Data successfully scraped and saved to wnba_player_salaries_2024.csv
Successfully fetched the webpage: https://herhoopstats.com/salary-cap-sheet/wnba/summary/2024/
Headers found: ['Team', 'Total Salaries', 'Total Players', 'Cap Room', 'Guaranteed Salaries']
Filtering to keep columns: ['Team', 'Total Salaries']
Data successfully scraped and saved to wnba_team_salaries_2024.csv


In [2]:
import pandas as pd

# Loading CSV:
df = pd.read_csv('wnba_player_salaries_2024.csv')

# Convert Salary column to numeric:
df = df[df['2024 Salary'].str.startswith('$', na=False)]
df['2024 Salary'] = df['2024 Salary'].replace(r'[\$,]', '', regex=True).astype(float)

# Remove duplicates, keep highest salary:
df_cleaned = df.sort_values('2024 Salary', ascending=False).drop_duplicates(subset=['Player'], keep='first')

# Created new "cleaned_" CSV:
df_cleaned.to_csv('cleaned_wnba_player_salaries_2024.csv', index=False)
print("Cleaned data saved to 'cleaned_wnba_player_salaries_2024.csv'.")

Cleaned data saved to 'cleaned_wnba_player_salaries_2024.csv'.


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL for WNBA Stats
url = "https://herhoopstats.com/salary-cap-sheet/wnba/players/salary_2024/stats_2024/"

# Function to fetch and parse data
def fetch_and_parse(url):
    response = requests.get(url)
    if response.status_code == 200:
        print(f"Successfully fetched the webpage: {url}")
    else:
        raise Exception(f"Failed to fetch the webpage: {response.status_code}")
    return BeautifulSoup(response.content, 'html.parser')

# Function to extract relevant table data
def extract_relevant_data(soup):
    table = soup.find('table')

    # Extract headers
    headers = [th.text.strip() for th in table.find('thead').find_all('th')]

    # Identify the relevant columns indices
    relevant_columns = {header: index for index, header in enumerate(headers) if header in ["Player", "G", "PTS", "AST"]}
    
    # Extract rows with necessary columns
    rows = []
    for tr in table.find('tbody').find_all('tr'):
        cells = tr.find_all('td')
        row = {header: cells[idx].text.strip() for header, idx in relevant_columns.items()}
        rows.append(row)
    
    # Create DataFrame with extracted columns
    df = pd.DataFrame(rows, columns=relevant_columns.keys())
    return df

# Fetch, parse, and extract data
soup = fetch_and_parse(url)
relevant_df = extract_relevant_data(soup)

print("DataFrame after extracting relevant columns:")
print(relevant_df.head())

# Convert the columns to numeric
relevant_df['G'] = pd.to_numeric(relevant_df['G'], errors='coerce')
relevant_df['PTS'] = pd.to_numeric(relevant_df['PTS'], errors='coerce')
relevant_df['AST'] = pd.to_numeric(relevant_df['AST'], errors='coerce')

# Calculate PER
relevant_df['PER'] = (relevant_df['G'] + relevant_df['PTS'] + relevant_df['AST']) / 3

# Save the DataFrame with PER to CSV
relevant_df.to_csv('wnba_per.csv', index=False)
print("Data with PER saved to 'wnba_per.csv'")

# Confirm the final DataFrame
print("Final DataFrame with PER:")
print(relevant_df.head())

# Checking data types to ensure everything is in numeric format
print("Data types in the final DataFrame:")
print(relevant_df.dtypes)

Successfully fetched the webpage: https://herhoopstats.com/salary-cap-sheet/wnba/players/salary_2024/stats_2024/
DataFrame after extracting relevant columns:
                                              Player   G   PTS  AST
0  A'ja Wilson\n                          \n\n   ...  38  26.9  2.3
1  Arike Ogunbowale\n                          \n...  38  22.2  5.1
2  Kahleah Copper\n                          \n\n...  37  21.1  2.3
3  Breanna Stewart\n                          \n\...  38  20.4  3.5
4  Napheesa Collier\n                          \n...  34  20.4  3.4
Data with PER saved to 'wnba_per.csv'
Final DataFrame with PER:
                                              Player   G   PTS  AST        PER
0  A'ja Wilson\n                          \n\n   ...  38  26.9  2.3  22.400000
1  Arike Ogunbowale\n                          \n...  38  22.2  5.1  21.766667
2  Kahleah Copper\n                          \n\n...  37  21.1  2.3  20.133333
3  Breanna Stewart\n                          \n\...  