In [1]:
# Install necessary packages (if not already installed)
!pip install pandas requests beautifulsoup4 lxml

# Import necessary libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Function to scrape player statistics from Basketball Reference
def scrape_basketball_reference(url, table_id):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Debug: print all available table IDs
    available_tables = [tbl.get('id') for tbl in soup.find_all('table')]
    print("Available table IDs for URL", url, ":", available_tables)

    # Find the table with the specified ID
    table = soup.find('table', {'id': table_id})
    if table is not None:
        df = pd.read_html(str(table))[0]
        return df

    print(f"No matching table found for the URL: {url}")
    return pd.DataFrame()

# URLs for WNBA and NBA player statistics from Basketball Reference
wnba_url = "https://www.basketball-reference.com/wnba/years/2023_per_game.html"
nba_url = "https://www.basketball-reference.com/leagues/NBA_2023_per_game.html"

# Table IDs to look for
wnba_table_id = "per_game"
nba_table_id = "per_game_stats"

# Scraping data
wnba_data = scrape_basketball_reference(wnba_url, wnba_table_id)
nba_data = scrape_basketball_reference(nba_url, nba_table_id)

# Debug: print column names of the scraped data
print("WNBA data columns:", wnba_data.columns)
print("NBA data columns:", nba_data.columns)

# Check if data was scraped successfully
if wnba_data.empty or nba_data.empty:
    print("Failed to scrape data. Please check the table IDs and URLs.")
else:
    # Function to process data
    def process_data(df, league_name):
        try:
            # Ensure columns match exactly
            columns_needed = ['Player', 'FG%', 'FT%', '3P%']
            df = df[df['Player'] != 'Player']  # Remove any rows where 'Player' column value is 'Player'
            df = df[columns_needed].dropna()

            # Debug: Print the first few rows to inspect the data
            print(f"First few rows of {league_name} data before conversion:")
            print(df.head())

            # Remove '%' and convert to numeric
            df['FG%'] = pd.to_numeric(df['FG%'].str.replace('%','').astype(float) / 100, errors='coerce')
            df['FT%'] = pd.to_numeric(df['FT%'].str.replace('%','').astype(float) / 100, errors='coerce')
            df['3P%'] = pd.to_numeric(df['3P%'].str.replace('%','').astype(float) / 100, errors='coerce')

            # Debug: Print the first few rows after conversion
            print(f"First few rows of {league_name} data after conversion:")
            print(df.head())

            df['League'] = league_name
            df = df.dropna()  # Remove rows with NaN values after conversion
        except KeyError as e:
            print(f"Error processing data for {league_name}: {e}")
            return pd.DataFrame()  # Return an empty DataFrame in case of error
        return df

    # Process the data
    wnba_data = process_data(wnba_data, 'WNBA')
    nba_data = process_data(nba_data, 'NBA')

    # Save the data to CSV files
    wnba_data.to_csv('../data/wnba_data.csv', index=False)
    nba_data.to_csv('../data/nba_data.csv', index=False)

    # Combine both datasets
    data = pd.concat([wnba_data, nba_data])

    # Save the combined data to a CSV file
    data.to_csv('../data/combined_data.csv', index=False)

    print("Data saved to CSV files.")

# Display the first few rows of the combined DataFrame
data.head()


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m




Available table IDs for URL https://www.basketball-reference.com/wnba/years/2023_per_game.html : ['per_game']


  df = pd.read_html(str(table))[0]


Available table IDs for URL https://www.basketball-reference.com/leagues/NBA_2023_per_game.html : ['per_game_stats']
WNBA data columns: Index(['Player', 'Team', 'Pos', 'G', 'MP', 'G.1', 'GS', 'MP.1', 'FG', 'FGA',
       'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%',
       'ORB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')
NBA data columns: Index(['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')
First few rows of WNBA data before conversion:
Empty DataFrame
Columns: [Player, FG%, FT%, 3P%]
Index: []
First few rows of WNBA data after conversion:
Empty DataFrame
Columns: [Player, FG%, FT%, 3P%]
Index: []
First few rows of NBA data before conversion:
             Player   FG%   FT%   3P%
0  Precious Achiuwa  .485  .702  .269
1      Steven Adams  .597  .36

  df = pd.read_html(str(table))[0]


Unnamed: 0,Player,FG%,FT%,3P%,League
0,Precious Achiuwa,0.00485,0.00702,0.00269,NBA
1,Steven Adams,0.00597,0.00364,0.0,NBA
2,Bam Adebayo,0.0054,0.00806,0.00083,NBA
3,Ochai Agbaji,0.00427,0.00812,0.00355,NBA
4,Santi Aldama,0.0047,0.0075,0.00353,NBA
