In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Ensure GUI is off
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Path to the ChromeDriver executable (ensure the path is correct and use a raw string)
webdriver_service = Service(r'C:\Users\cjh05\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe')

# Setup Chrome driver
driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)

# URL of the page to scrape
url = "https://www.spotrac.com/nba/contracts/_/sort/length/dir/desc"

# Access the page
driver.get(url)

# Wait for the table to load
try:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'table')))
except:
    print("Table not found on the page.")
    driver.quit()
    exit()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Close the driver
driver.quit()

# Find the table containing the salary information
table = soup.find('table', {'class': 'table'})

# Check if the table was found
if table is None:
    print("Table not found on the page.")
else:
    # Extract headers
    headers = [th.text.strip() for th in table.find_all('th')]

    # Extract rows
    rows = []
    for tr in table.find_all('tr')[1:]:  # Skip the header row
        cells = tr.find_all('td')
        row = [cell.text.strip() for cell in cells]
        rows.append(row)

    # Create a DataFrame
    df = pd.DataFrame(rows, columns=headers)

    # Save to CSV
    df.to_csv('nba_player_contracts.csv', index=False)
    print("NBA player contracts have been saved to 'nba_player_contracts.csv'")


NBA player contracts have been saved to 'nba_player_contracts.csv'


In [2]:
import pandas as pd
from nba_api.stats.endpoints import leaguedashplayerstats
import time

# Function to fetch player stats for a given season
def fetch_player_stats(season):
    # Fetching data using nba_api
    stats = leaguedashplayerstats.LeagueDashPlayerStats(season=season, season_type_all_star='Regular Season', per_mode_detailed='PerGame')
    
    # Converting to DataFrame
    df = stats.get_data_frames()[0]
    
    return df

# List to hold all season stats
all_stats = []

# Fetch stats for all seasons from 2000 to 2024
for year in range(2000, 2025):
    season = f'{year}-{str(year+1)[2:]}'
    print(f'Fetching data for season {season}...')
    try:
        season_stats = fetch_player_stats(season)
        season_stats['Season'] = season  # Add a column for the season
        all_stats.append(season_stats)
        time.sleep(1)  # Sleep to avoid hitting the API rate limit
    except Exception as e:
        print(f"Error fetching data for season {season}: {e}")

# Concatenate all seasons into a single DataFrame
all_stats_df = pd.concat(all_stats, ignore_index=True)

# Save to CSV
all_stats_df.to_csv('nba_player_stats_2000_2024.csv', index=False)
print('All player stats from 2000 to 2024 have been saved to "nba_player_stats_2000_2024.csv"')


Fetching data for season 2000-01...
Fetching data for season 2001-02...
Fetching data for season 2002-03...
Fetching data for season 2003-04...
Fetching data for season 2004-05...
Fetching data for season 2005-06...
Fetching data for season 2006-07...
Fetching data for season 2007-08...
Fetching data for season 2008-09...
Fetching data for season 2009-10...
Fetching data for season 2010-11...
Fetching data for season 2011-12...
Fetching data for season 2012-13...
Fetching data for season 2013-14...
Fetching data for season 2014-15...
Fetching data for season 2015-16...
Fetching data for season 2016-17...
Fetching data for season 2017-18...
Fetching data for season 2018-19...
Fetching data for season 2019-20...
Fetching data for season 2020-21...
Fetching data for season 2021-22...
Fetching data for season 2022-23...
Fetching data for season 2023-24...
Fetching data for season 2024-25...


  all_stats_df = pd.concat(all_stats, ignore_index=True)


All player stats from 2000 to 2024 have been saved to "nba_player_stats_2000_2024.csv"


In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def fetch_player_data(season):
    url = f'https://www.basketball-reference.com/leagues/NBA_{season}_per_game.html'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the table with player stats
    table = soup.find('table', {'id': 'per_game_stats'})
    
    # Extract column headers
    headers = []
    for th in table.find('thead').findAll('tr')[0].findAll('th'):
        headers.append(th.getText())
    headers.append('Season')  # Add season column
    print(f"Headers for {season}: {headers}")  # Debugging output
    
    # Extract rows
    rows = table.find('tbody').findAll('tr')
    player_data = []
    
    for row in rows:
        if row.find('th', {'scope': 'row'}) is not None:
            player_info = [th.getText() for th in row.findAll('th')] + [td.getText() for td in row.findAll('td')]
            player_info.append(season)  # Add season data
            if len(player_info) == len(headers):  # Ensure row length matches headers length
                player_data.append(player_info)
            else:
                print(f"Row length mismatch for {season}: {player_info}")  # Debugging output
    
    # Create DataFrame and ensure columns match
    if player_data:
        df = pd.DataFrame(player_data, columns=headers)
        return df
    else:
        return pd.DataFrame(columns=headers)

# Loop through each season from 2000 to 2024
all_players = pd.DataFrame()

for year in range(2000, 2025):
    season = f'{year}'
    season_data = fetch_player_data(season)
    if not all_players.empty and not season_data.empty:
        if not all(column in season_data.columns for column in all_players.columns):
            season_data = season_data.reindex(columns=all_players.columns, fill_value='')
    all_players = pd.concat([all_players, season_data], ignore_index=True)
    print(f'Fetched data for {year}')

# Select relevant columns
relevant_columns = ['Player', 'Pos', 'Season']
all_players = all_players[relevant_columns]

# Display the first few rows
print(all_players.head())

# Save to CSV
all_players.to_csv('nba_players_positions_2000_2024.csv', index=False)


Headers for 2000: ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Season']
Fetched data for 2000
Headers for 2001: ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Season']
Fetched data for 2001
Headers for 2002: ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Season']
Fetched data for 2002
Headers for 2003: ['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Season']
Fetche

In [4]:
import requests
import pandas as pd
import time

# Base URL for NBA draft combine player anthropometric data
url = "https://stats.nba.com/stats/draftcombineplayeranthro"

# Headers for the request
headers = {
    "Referer": "https://www.nba.com/",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"
}

# Function to fetch data for a specific season
def fetch_combine_data(season):
    payload = {"LeagueID": "00", "SeasonYear": season}
    response = requests.get(url, params=payload, headers=headers)
    if response.status_code != 200:
        print(f"Failed to fetch data for season {season}")
        return pd.DataFrame()

    data = response.json()
    df = pd.DataFrame(data["resultSets"][0]["rowSet"],
                      columns=data["resultSets"][0]["headers"])
    df['Season'] = season  # Add season column to DataFrame
    return df

# List of seasons to fetch data for
seasons = [f"{year}-{str(year+1)[2:]}" for year in range(2000, 2025)]
all_data = []

# Fetch data for each season
for season in seasons:
    print(f"Fetching data for season {season}")
    season_data = fetch_combine_data(season)
    if not season_data.empty:
        all_data.append(season_data)
    time.sleep(1)  # Be respectful to the server

# Combine all season data into a single DataFrame
combined_df = pd.concat(all_data, ignore_index=True)

# Save to CSV
combined_df.to_csv('nba_draft_combine_anthro_2000_2024.csv', index=False)
print("Data fetching complete and saved to 'nba_draft_combine_anthro_2000_2024.csv'.")


Fetching data for season 2000-01
Fetching data for season 2001-02
Fetching data for season 2002-03
Fetching data for season 2003-04
Fetching data for season 2004-05
Fetching data for season 2005-06
Fetching data for season 2006-07
Fetching data for season 2007-08
Fetching data for season 2008-09
Fetching data for season 2009-10
Fetching data for season 2010-11
Fetching data for season 2011-12
Fetching data for season 2012-13
Fetching data for season 2013-14
Fetching data for season 2014-15
Fetching data for season 2015-16
Fetching data for season 2016-17
Fetching data for season 2017-18
Fetching data for season 2018-19
Fetching data for season 2019-20
Fetching data for season 2020-21
Fetching data for season 2021-22
Fetching data for season 2022-23
Fetching data for season 2023-24
Fetching data for season 2024-25
Data fetching complete and saved to 'nba_draft_combine_anthro_2000_2024.csv'.


  combined_df = pd.concat(all_data, ignore_index=True)
