In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extraction_webscrapping(url, output_file, header_tag, keep_columns=None, timeout=30):
    """Scrape the first HTML table found at *url* and save it as a CSV file.

    Column names are read from the ``header_tag`` elements inside the table's
    ``<thead>``; data rows are read from the ``<td>`` cells of every ``<tr>``
    inside its ``<tbody>``.

    Args:
        url: Address of the page to download.
        output_file: Path of the CSV file to write (written without the index).
        header_tag: Tag name of the header cells inside ``<thead>``
            (e.g. ``'th'`` or ``'td'``).
        keep_columns: Optional list of column names; when given, only these
            columns are written.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with a 4xx/5xx status.
        ValueError: If the page has no ``<table>``, or the table has no
            ``<thead>`` or ``<tbody>``.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        print("Successfully fetched the webpage.")
    else:
        print(f"Failed to fetch the webpage: {response.status_code}")
        # Do not try to parse an error page: stop on 4xx/5xx responses.
        response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    if table is None:
        raise ValueError(f"No <table> element found at {url}")

    thead = table.find('thead')
    if thead is None:
        raise ValueError(f"The table at {url} has no <thead> section")
    headers = [cell.text.strip() for cell in thead.find_all(header_tag)]
    print(f"Headers found: {headers}")

    tbody = table.find('tbody')
    if tbody is None:
        raise ValueError(f"The table at {url} has no <tbody> section")

    rows = []
    for tr in tbody.find_all('tr'):
        row = [cell.text.strip() for cell in tr.find_all('td')]
        # Rows without any <td> cell would only produce an all-empty CSV line.
        if row:
            rows.append(row)

    df = pd.DataFrame(rows, columns=headers)

    if keep_columns:
        print(f"Filtering to keep columns: {keep_columns}")
        df = df[keep_columns]

    df.to_csv(output_file, index=False)
    print(f"Data successfully scraped and saved to {output_file}")

def extract_wnba_player_salaries(url, output_file, timeout=30):
    """Scrape player names and salaries from the first table at *url* into a CSV.

    For every ``<tr>`` in the table's ``<tbody>`` the first ``<td>`` supplies
    the player name (only its first line of text) and the second ``<td>``
    supplies the salary (only its first whitespace-separated token). The CSV
    is written with the columns ``Player`` and ``2024 Salary``.

    Args:
        url: Address of the page to download.
        output_file: Path of the CSV file to write (written without the index).
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with a 4xx/5xx status.
        ValueError: If the page has no ``<table>`` or the table has no
            ``<tbody>``.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        print("Successfully fetched the webpage.")
    else:
        print(f"Failed to fetch the webpage: {response.status_code}")
        # Do not try to parse an error page: stop on 4xx/5xx responses.
        response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    if table is None:
        raise ValueError(f"No <table> element found at {url}")

    tbody = table.find('tbody')
    if tbody is None:
        raise ValueError(f"The table at {url} has no <tbody> section")

    headers = ["Player", "2024 Salary"]
    rows = []

    for tr in tbody.find_all('tr'):
        cells = tr.find_all('td')
        # Rows with fewer than two cells cannot hold both a name and a salary.
        if len(cells) < 2:
            continue
        # Only the first line of the cell is the name; strip stray whitespace
        # (such as a trailing '\r') left over from the split.
        name = cells[0].text.strip().split('\n')[0].strip()
        # Only the first token is the salary amount; an empty cell gives ''.
        salary_tokens = cells[1].text.split()
        salary = salary_tokens[0] if salary_tokens else ''
        rows.append([name, salary])

    df = pd.DataFrame(rows, columns=headers)
    df.to_csv(output_file, index=False)
    print(f"Data successfully scraped and saved to {output_file}")

# NBA player salaries -- header cells are looked up as <td> inside <thead>.
extraction_webscrapping(
    url="https://hoopshype.com/salaries/players/2023-2024/",
    output_file='NEW_nba_player_salaries.csv',
    header_tag='td',
)

# NBA team salaries -- same header lookup as the player table.
extraction_webscrapping(
    url="https://hoopshype.com/salaries/2023-2024/",
    output_file='NEW_nba_team_salaries.csv',
    header_tag='td',
)

# WNBA player salaries -- dedicated extractor that keeps name and salary only.
extract_wnba_player_salaries(
    url="https://herhoopstats.com/salary-cap-sheet/wnba/players/salary_2024/stats_2024/",
    output_file='NEW_wnba_player_salaries.csv',
)

# WNBA team salaries -- header cells are <th>; only the "Team" and
# "Total Salaries" columns are written.
extraction_webscrapping(
    url="https://herhoopstats.com/salary-cap-sheet/wnba/summary/2024/",
    output_file='NEW_wnba_team_salaries.csv',
    header_tag='th',
    keep_columns=['Team', 'Total Salaries'],
)

Successfully fetched the webpage.
Headers found: ['', 'Player', '2023/24', '2023/24(*)']
Data successfully scraped and saved to NEW_nba_player_salaries.csv
Successfully fetched the webpage.
Headers found: ['', 'Team', '2023/24', '2023/24(*)']
Data successfully scraped and saved to NEW_nba_team_salaries.csv
Successfully fetched the webpage.
Data successfully scraped and saved to NEW_wnba_player_salaries.csv
Successfully fetched the webpage.
Headers found: ['Team', 'Total Salaries', 'Total Players', 'Cap Room', 'Guaranteed Salaries']
Filtering to keep columns: ['Team', 'Total Salaries']
Data successfully scraped and saved to NEW_wnba_team_salaries.csv
