# Web Scraping NBA Data

The goal of this Notebook is to web scrape NBA player data from the Basketball Reference website (www.basketball-reference.com) to then later perform analysis on.

In [1]:
# Import the necessary libraries for Web Scraping the NBA player data
import sys
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

## Initial Testing

The base url that I use to access the player data is https://www.basketball-reference.com/players. The final code at the end of this notebook loops over each letter of the alphabet, representing the letter that a player's last name starts with, and then adds that to the end of the url. The url for each letter of the alphabet provides biographic data for each player such as birthdate, college attended, and position played. To start, however, I only test for last names starting with the letter 'A'.

In [2]:
# Establish the url with the data
first_url = 'https://www.basketball-reference.com/players/a'

In [3]:
# Perform a get request for the website
first_page = requests.get(first_url)

In [4]:
# Use BeautifulSoup to parse the HTML data
first_soup = BeautifulSoup(first_page.content, 'html.parser')
# The following variable will contain the HTML code that has the NBA player data
first_table = first_soup.find_all('tr')

In [5]:
# Use the BeautifulSoup object to create column names for the DataFrame
first_head = first_soup.find('thead')
first_columns_raw = [first_head.text for item in first_head][0]
# Then clean up the column names and only save the relevant ones
first_columns = first_columns_raw.replace('\n', ',').split(',')[2:-2]
first_columns

['Player', 'From', 'To', 'Pos', 'Ht', 'Wt', 'Birth Date', 'Colleges']

After creating the BeautifulSoup object for the url with last names beginning with the letter 'A', I run a test for a specific player and his url, which provides career statistics like games played, points per game, and Player Efficiency Rating (PER).

In [6]:
# Code is for the specific url for Alaa Abdelnaby
player_url = f'https://www.basketball-reference.com/players/a/abdelal01.html'
player_page = requests.get(player_url)
player_soup = BeautifulSoup(player_page.content, 'html.parser')
player_target = player_soup.find(class_='stats_pullout')
player_career_stats = []
career_columns = []
# Loop through the target HTML data to extract relevant player data
for i, div in enumerate(player_target.find_all('div')):
  if i < 2:
    continue
  else:
    column_names = div.find('strong').text # extract column names
    val = div.find_all('p') # extract career statistics
    stats = val[1].text
  player_career_stats.append(stats)
  career_columns.append(column_names)

After working on the code for one specific player and the corresponding url, I test for each player who's last name begins with the letter 'A'.

As a reminder, I get biographic data from the url with all the players who's last names start with a certain letter, and I get career statistics from each players specific url. I then combine the data from the 2 separate url's into one table for each player.

In [7]:
# Loop through the table to extract the data for each player
first_players = []
for i in range(len(first_table)):
  if i == 0:
    continue
  elif i == 1: # I include the elif statement to only grab the column names once from the players url
    # The following is setting up to grab the career statistics
    player = []
    player_name = []
    player_stats = []
    player_link = first_table[i].find('a', href=True)['href']
    player_url = f'https://www.basketball-reference.com/{player_link}'
    player_page = requests.get(player_url)
    player_soup = BeautifulSoup(player_page.content, 'html.parser')
    player_target = player_soup.find(class_='stats_pullout')
    player_career_stats = []
    career_columns = []
    # The following loop is to actually grab the career statistics and store them
    for j, div in enumerate(player_target.find_all('div')):
      if j < 2:
        continue
      else:
        column_names = div.find('strong').text 
        val = div.find_all('p')
        stats = val[1].text
      player_career_stats.append(stats)
      career_columns.append(column_names)
    # The following loops grab the players names and biographic data
    for th in first_table[i].find_all('th'):
      player_name.append(th.text)
    for td in first_table[i].find_all('td'):
      player_stats.append(td.text)
    player = player_name + player_stats + player_career_stats
    first_players.append(player)
  else: 
    player = []
    player_name = []
    player_stats = []
    player_link = first_table[i].find('a', href=True)['href']
    player_url = f'https://www.basketball-reference.com/{player_link}'
    player_page = requests.get(player_url)
    player_soup = BeautifulSoup(player_page.content, 'html.parser')
    player_target = player_soup.find(class_='stats_pullout')
    player_career_stats = []
    # The following loop is to actually grab the career statistics and store them
    for j, div in enumerate(player_target.find_all('div')):
      if j < 2:
        continue
      else:
        val = div.find_all('p')
        stats = val[1].text
      player_career_stats.append(stats)
    # The following loops grab the players names and biographic data
    for th in first_table[i].find_all('th'):
      player_name.append(th.text)
    for td in first_table[i].find_all('td'):
      player_stats.append(td.text)
    player = player_name + player_stats + player_career_stats
    first_players.append(player)
# Create a Pandas DataFrame of the player data
full_columns = first_columns + career_columns
first_df = pd.DataFrame(first_players, columns=full_columns)
first_df.head()

Unnamed: 0,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges,G,G.1,...,TRB,AST,FG%,FG%.1,FG3%,FT%,eFG%,PER,PER.1,WS
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240,"June 24, 1968",Duke,256,256,...,3.3,0.3,50.2,50.2,0.0,70.1,50.2,13.0,13.0,4.8
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235,"April 7, 1946",Iowa State,505,505,...,8.0,1.2,42.8,42.8,72.8,15.1,15.1,17.5,,
2,Kareem Abdul-Jabbar*,1970,1989,C,7-2,225,"April 16, 1947",UCLA,1560,1560,...,11.2,3.6,55.9,55.9,5.6,72.1,55.9,24.6,24.6,273.4
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162,"March 9, 1969",LSU,586,586,...,1.9,3.5,44.2,44.2,35.4,90.5,47.2,15.4,15.4,25.2
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223,"November 3, 1974","Michigan, San Jose State",236,236,...,3.3,1.1,41.7,41.7,23.7,70.3,42.2,11.4,11.4,3.5


In [8]:
# Remove the duplicate columns
first_df = first_df.loc[:,~first_df.columns.duplicated()]
first_df.head()

Unnamed: 0,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges,G,PTS,TRB,AST,FG%,FG3%,FT%,eFG%,PER,WS
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240,"June 24, 1968",Duke,256,5.7,3.3,0.3,50.2,0.0,70.1,50.2,13.0,4.8
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235,"April 7, 1946",Iowa State,505,9.0,8.0,1.2,42.8,72.8,15.1,15.1,17.5,
2,Kareem Abdul-Jabbar*,1970,1989,C,7-2,225,"April 16, 1947",UCLA,1560,24.6,11.2,3.6,55.9,5.6,72.1,55.9,24.6,273.4
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162,"March 9, 1969",LSU,586,14.6,1.9,3.5,44.2,35.4,90.5,47.2,15.4,25.2
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223,"November 3, 1974","Michigan, San Jose State",236,7.8,3.3,1.1,41.7,23.7,70.3,42.2,11.4,3.5


Since the test on players with a last name starting with the letter 'A' was successful, I move on to run the code for every letter of the alphabet. 

## Collect Data for Every Player

In [9]:
players = [] # initialize list to save the player data to
for x in range(97,123): # loop over every letter of the alphabet
  letter = chr(x)
  url = f'https://www.basketball-reference.com/players/{letter}'
  page = requests.get(url)
  soup = BeautifulSoup(page.content, 'html.parser')
  table = soup.find_all('tr')
  for i in range(len(table)):
    if i == 0:
      continue
    else: 
      player = []
      player_name = []
      player_stats = []
      player_link = table[i].find('a', href=True)['href']
      player_url = f'https://www.basketball-reference.com/{player_link}'
      player_page = requests.get(player_url)
      player_soup = BeautifulSoup(player_page.content, 'html.parser')
      player_target = player_soup.find(class_='stats_pullout')
      player_career_stats = []
      for j, div in enumerate(player_target.find_all('div')):
        if j < 2:
          continue
        else:
          val = div.find_all('p')
          stats = val[1].text
        player_career_stats.append(stats)
      for th in table[i].find_all('th'):
        player_name.append(th.text)
      for td in table[i].find_all('td'):
        player_stats.append(td.text)
      player = player_name + player_stats + player_career_stats
      players.append(player)

players_df = pd.DataFrame(players, columns=full_columns)
players_df.tail()

Unnamed: 0,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges,G,G.1,...,TRB,AST,FG%,FG%.1,FG3%,FT%,eFG%,PER,PER.1,WS
5018,Ante Žižić,2018,2020,F-C,6-10,266,"January 4, 1997",,113,113,...,3.9,0.6,58.1,58.1,-,71.1,58.1,17.4,17.4,3.5
5019,Jim Zoet,1983,1983,C,7-1,240,"December 20, 1953",Kent State University,7,7,...,1.1,0.1,20.0,20.0,-,-,20.0,-0.8,-0.8,-0.1
5020,Bill Zopf,1971,1971,G,6-1,170,"June 7, 1948",Duquesne,53,53,...,0.9,1.4,36.3,36.3,55.6,9.6,9.6,-0.1,,
5021,Ivica Zubac,2017,2022,C,7-0,240,"March 18, 1997",,360,360,...,6.5,1.1,59.7,59.7,10.0,75.4,59.7,19.2,19.2,26.1
5022,Matt Zunic,1949,1949,G-F,6-3,195,"December 19, 1919",George Washington,56,56,...,-,0.9,30.3,30.3,70.6,-,-,2.0,,


In [10]:
# Remove duplicate columns in the DataFrame
players_df = players_df.loc[:,~players_df.columns.duplicated()]
players_df.tail()

Unnamed: 0,Player,From,To,Pos,Ht,Wt,Birth Date,Colleges,G,PTS,TRB,AST,FG%,FG3%,FT%,eFG%,PER,WS
5018,Ante Žižić,2018,2020,F-C,6-10,266,"January 4, 1997",,113,6.0,3.9,0.6,58.1,-,71.1,58.1,17.4,3.5
5019,Jim Zoet,1983,1983,C,7-1,240,"December 20, 1953",Kent State University,7,0.3,1.1,0.1,20.0,-,-,20.0,-0.8,-0.1
5020,Bill Zopf,1971,1971,G,6-1,170,"June 7, 1948",Duquesne,53,2.2,0.9,1.4,36.3,55.6,9.6,9.6,-0.1,
5021,Ivica Zubac,2017,2022,C,7-0,240,"March 18, 1997",,360,8.3,6.5,1.1,59.7,10.0,75.4,59.7,19.2,26.1
5022,Matt Zunic,1949,1949,G-F,6-3,195,"December 19, 1919",George Washington,56,4.9,-,0.9,30.3,70.6,-,-,2.0,


In [11]:
# Save the data to a csv file to access later
players_df.to_csv('/content/drive/MyDrive/NBA_players_data.csv')