In [None]:
import requests
from bs4 import BeautifulSoup
import time

# pass a url and file save path to save all html data to a file
def get_html(url, path):
    response = requests.get(url)
    if response.status_code == 200:
        with open(path, 'w', encoding='utf-8') as file:
            file.write(response.text)
    else: 
        print("Failed to retrieve the page. Status code:", response.status_code)
        
# pass alphabet range and year range for search for player metrics/statistics
def find_players(start_letter, end_letter):
    all_players_url = "https://www.basketball-reference.com/players/"
    # create array of letters
    alphabet_range = [chr(i) for i in range(ord(start_letter), ord(end_letter) + 1)]

    for letter in alphabet_range:
        # concatenate each letter to make the new url's to browse through all players
        new_url = f"https://www.basketball-reference.com/players/{letter}"
        # pass the url information to save the html data
        get_html(new_url, rf'C:\Users\Michael\Code\Python\Data_scraping\player_page_html\letter_{letter}_data.html')
        # sets a delay of a few seconds to try and space the number of requests to avoid 426 error
        time.sleep(10)

In [3]:
import requests
from bs4 import BeautifulSoup
import time

# find players based on the seasons that they have played, pulling from html data already saved using the find_players function
def find_players_by_year(start_letter, end_letter, start_year, end_year):
    # create array of letters
    alphabet_range = [chr(i) for i in range(ord(start_letter), ord(end_letter) + 1)]
    
    # empty array to collect player names
    player_names_with_url = []
    
    # take data from saved location
    for letter in alphabet_range:
        # open file containing the HTML data
        with open(rf'C:\Users\Michael\Code\Python\Data_scraping\alphabetic_players_grouped\letter_{letter}_data', 'r', encoding='utf-8') as file:
            contents = file.read()
        
        soup = BeautifulSoup(contents, "html.parser")   
        table = soup.find("table", id="players").find("tbody")
        
        # iterates through the table data for players of a given last name starting letter
        for element in table.find_all("tr"):
            # gets the years played bounds
            lower_year_bound = int(element.find('td', {'data-stat': 'year_min'}).get_text())
            upper_year_bound = int(element.find('td', {'data-stat': 'year_max'}).get_text())
            
            # checks the years played by the player
            if (lower_year_bound >= start_year or upper_year_bound >= start_year) and ( lower_year_bound <= end_year) :
                # if the player played during the years specified, add to the list
                player_names_with_url.append([element.find('th', {'data-stat': 'player'}).find('a').get_text(), element.find('th', {'data-stat': 'player'}).find('a')['href']])
    return player_names_with_url
                
player_list = find_players_by_year('a', 'b', 1980, 1981)
print(player_list)

[['Kareem Abdul-Jabbar', '/players/a/abdulka01.html'], ['Tom Abernethy', '/players/a/abernto01.html'], ['Alvan Adams', '/players/a/adamsal01.html'], ['Darrell Allums', '/players/a/allumda01.html'], ['Tiny Archibald', '/players/a/architi01.html'], ['Dennis Awtrey', '/players/a/awtrede01.html'], ['Gus Bailey', '/players/b/bailegu01.html'], ['James Bailey', '/players/b/baileja01.html'], ['Greg Ballard', '/players/b/ballagr01.html'], ['Mike Bantom', '/players/b/bantomi01.html'], ['Marvin Barnes', '/players/b/barnema01.html'], ['Rick Barry', '/players/b/barryri01.html'], ['Tim Bassett', '/players/b/basseti01.html'], ['Billy Ray Bates', '/players/b/batesbi01.html'], ['Ron Behagen', '/players/b/behagro01.html'], ['Mel Bennett', '/players/b/benneme01.html'], ['Kent Benson', '/players/b/bensoke01.html'], ['Del Beshore', '/players/b/beshode01.html'], ['Henry Bibby', '/players/b/bibbyhe01.html'], ['Larry Bird', '/players/b/birdla01.html'], ['Otis Birdsong', '/players/b/birdsot01.html'], ['Norman 

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

# find players based on the seasons that they have played, pulling from html data already saved using the find_players function
def find_players_by_year(start_letter: str, end_letter: str, start_year: int, end_year: int) -> list :
    # create array of letters
    alphabet_range = [chr(i) for i in range(ord(start_letter), ord(end_letter) + 1)]
    
    # empty array to collect player names
    player_names_with_url = []
    
    # take data from saved location
    for letter in alphabet_range:
        
        # open file containing the HTML data (with error handles)
        with open(rf'C:\Users\Michael\Code\Python\Data_scraping\alphabetic_players_grouped\letter_{letter}_data', 'r', encoding='utf-8') as file:
            contents = file.read()
        
        soup = BeautifulSoup(contents, "html.parser")
        table = soup.find("table", id="players").find("tbody")
        
        # iterates through the table data for players of a given last name starting letter
        for element in table.find_all("tr"):
            # gets the years played bounds
            lower_year_bound = int(element.find('td', {'data-stat': 'year_min'}).get_text())
            upper_year_bound = int(element.find('td', {'data-stat': 'year_max'}).get_text())
            
            # checks the years played by the player
            if (lower_year_bound >= start_year or upper_year_bound >= start_year) and ( lower_year_bound <= end_year) :
                # if the player played during the years specified, create a list containing the name and url, then add that list to player_names_with_url
                player_names_with_url.append([element.find('th', {'data-stat': 'player'}).find('a').get_text(), element.find('th', {'data-stat': 'player'}).find('a')['href']])
    
    # returns a list containing the player name with part of the url to navigate to their data page
    return player_names_with_url

# retrieve the player metrics by passing the list containing the names and url
def get_player_metrics(player_names_with_url: list) -> DataFrame:
    
    baseline_url = "https://www.basketball-reference.com"
    
    for player in player_names_with_url: 
        # pass the full url after appending to the end of the baseline url from list
        page_data = get_response_data(f'{baseline_url}{player[1]}')
        # make the soup
        soup = BeautifulSoup(page_data, 'html.parser')
        
        # find specific branch of HTML data
        full_player_stats = soup.find('div', id='meta')
        
        # retrieve the str of data, ensure a space between lines/words, replace strange characters 
        player_metric_string = full_player_stats.get_text(separator=" ").replace(u'\xa0', ' ').replace(u'\u25aa', '').strip()
        
        # normalize whitespace to a single space; split() breaks the string into words by whitespace; join() fuses them back together with only a single whitespace between each word
        single_line_output = ' '.join(player_metric_string.split())
        
        # array of strings containing player info, starting with their name
        player_metrics = [player[0]]
        
        # use re to collect the text between "Position:" and "Shoots:" to get player position
        position = re.search(r'Position:\s*(.*?)\s*Shoots:', single_line_output)
        if position:
            # assigns the first instance this pattern is found within the given string
            player_metrics.append(position.group(1))
        
        # use re to collect the text immediately after "Shoots:" for player dominant shooting hand
        shoots = re.search(r'Shoots:\s*(\w+)', single_line_output)
        if shoots:
            player_metrics.append(shoots.group(1))
        
        # use re to collect the number for height
        height = re.search(r'(\d+)cm', single_line_output)
        if height:
            player_metrics.append(height.group(1))
        
        # use re to collect the number for weight
        weight = re.search(r'(\d+)kg', single_line_output)
        if weight:
            player_metrics.append(weight.group(1))
        
        # use re to collect the text for college
        college = re.search(r'College:\s*(\w+)', single_line_output)
        if college:
            player_metrics.append(college.group(1))
        else:
            player_metrics.append("n/a")
        
        player_metric_headers = ['Name','Position','Shoots', 'Height', 'Weight', 'College']
        
        # create the DataFrame containing player info
        player_metrics_df = pd.DataFrame([player_metrics], columns=player_metric_headers)
        
    return player_metrics_df


player_list = find_players_by_year('a', 'b', 1980, 1985)
data_frame = get_player_metrics(player_list)
#print(player_list)

NameError: name 'DataFrame' is not defined