In [53]:
'''
Final Project Tutorial

Joed Quaye
Ronald Chomnou
Mark Spooner
Griffin Araujo
'''
# necessary imports

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import requests
import numpy as np
from functools import reduce
import pandas as pd

# Per-game stats pages for the seven seasons analysed (2017-18 through 2023-24).
# Basketball Reference keys each page by the calendar year the season ends in.
_PER_GAME_URL_TEMPLATE = "https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html"

URL2324 = _PER_GAME_URL_TEMPLATE.format(year=2024)
URL2223 = _PER_GAME_URL_TEMPLATE.format(year=2023)
URL2122 = _PER_GAME_URL_TEMPLATE.format(year=2022)
URL2021 = _PER_GAME_URL_TEMPLATE.format(year=2021)
URL1920 = _PER_GAME_URL_TEMPLATE.format(year=2020)
URL1819 = _PER_GAME_URL_TEMPLATE.format(year=2019)
URL1718 = _PER_GAME_URL_TEMPLATE.format(year=2018)

In [64]:
# Labels for the <td> cells that follow the player-name cell, in cell order:
# _STAT_COLUMNS[0] describes cells[1], _STAT_COLUMNS[1] describes cells[2], ...
_STAT_COLUMNS = (
    "position",
    "age",
    "team",
    "games played",
    "games started",
    "minutes played per game",
    "field goals",
    "field goal attempts",
    "fg percentage",
    "3pt per game",
    "3pt attempts",
    "3pt percentage",
    "2pt per game",
    "2pt attempts",
    "2pt percentage",
    "effective fg percentage",
    "free throws",
    "free throw attempts",
    "free throw percentage",
    "offensive rebounds",
    "defensive rebounds",
    "total rebounds",
    "assists",
    "steals",
    "blocks",
    "turnovers",
    "personal fouls",
    "ppg",
)

# Team label marking a combined (multi-team) season row.
# NOTE(review): confirm the site still uses this label for every season scraped.
_COMBINED_TEAM_LABEL = "TOT"


def _row_to_stats(cells):
    """Map one row's <td> cells (name cell first) to a {label: stripped text} dict."""
    return {
        label: cells[index].text.strip()
        for index, label in enumerate(_STAT_COLUMNS, start=1)
    }


def data_scrape(URL):
    """Scrape a per-game stats page into a ``{player name: {stat label: text}}`` dict.

    The page is loaded in a Safari WebDriver session, and the table with id
    ``per_game_stats`` is parsed row by row. All values are kept as the
    stripped cell text (strings); no numeric conversion is done here.

    Players with a combined 'TOT' row get the stats of that combined row, while
    ``"team"`` is taken from the first individual-team row seen for them.
    Further rows for a player already recorded are ignored. A player who has
    only a 'TOT' row and no individual-team row is not included.

    Rows with fewer than 29 ``<td>`` cells (for example rows without any data
    cells) are skipped entirely, so no partially filled record is returned.

    Args:
        URL: address of the per-game stats page to load.

    Returns:
        dict mapping player name to a dict keyed by the labels in
        ``_STAT_COLUMNS`` (insertion order follows that tuple).

    Raises:
        ValueError: if the page has no ``per_game_stats`` table with a
            ``<tbody>``.
    """
    driver = webdriver.Safari()
    try:
        driver.get(URL)
        html = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Always release the browser session, even if loading the page fails.
        driver.quit()

    table = html.find('table', {'id': 'per_game_stats'})
    body = table.find('tbody') if table is not None else None
    if body is None:
        raise ValueError(f"no 'per_game_stats' table body found at {URL}")

    player_stats = {}
    season_totals = {}  # combined-row stats, keyed by player name

    for row in body.find_all('tr'):
        cells = row.find_all('td')
        # Need the name cell plus one cell per stat label; skip anything shorter
        # instead of recording a half-filled entry.
        if len(cells) <= len(_STAT_COLUMNS):
            continue

        player_name = cells[0].text.strip()
        stats = _row_to_stats(cells)

        if stats["team"] == _COMBINED_TEAM_LABEL:
            season_totals[player_name] = stats
            # If a team row for this player was already recorded, upgrade it to
            # the combined numbers while keeping the team that was stored.
            if player_name in player_stats:
                kept_team = player_stats[player_name]["team"]
                player_stats[player_name] = dict(stats, team=kept_team)
            continue

        # Only the first individual-team row per player is kept.
        if player_name in player_stats:
            continue

        # The lookup is per player, so an unmatched combined row for one player
        # can never affect how another player's row is handled.
        totals = season_totals.get(player_name)
        if totals is None:
            player_stats[player_name] = stats
        else:
            player_stats[player_name] = dict(totals, team=stats["team"])

    return player_stats

# Scrape every season once, newest first; each call drives its own browser session.
(
    first_season,
    second_season,
    third_season,
    fourth_season,
    fifth_season,
    sixth_season,
    seventh_season,
) = (
    data_scrape(season_url)
    for season_url in (
        URL2324,
        URL2223,
        URL2122,
        URL2021,
        URL1920,
        URL1819,
        URL1718,
    )
)


In [76]:
# One DataFrame per season: rows are players (the dict keys become the index),
# columns are the scraped stat labels.
(
    data2324,
    data2223,
    data2122,
    data2021,
    data1920,
    data1819,
    data1718,
) = (
    pd.DataFrame.from_dict(season_stats, orient='index')
    for season_stats in (
        first_season,
        second_season,
        third_season,
        fourth_season,
        fifth_season,
        sixth_season,
        seventh_season,
    )
)

# Lift pandas' display limits so printed frames show every row and column.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# To go back to the default display limits, run:
# pd.reset_option('display.max_rows')
# pd.reset_option('display.max_columns')

# Two-digit ending year of the next season to be labelled when data_display is
# called without an explicit season (24 -> "2023-24").
count = 24


def data_display(dataframe, season_end=None):
    """Print a season header followed by the first and last five rows of a frame.

    Args:
        dataframe: the season DataFrame to preview.
        season_end: two-digit year the season ends in (e.g. 24 for 2023-24,
            expected range 1-99). When omitted, the module-level ``count`` is
            used and then decremented, so consecutive calls label successive
            earlier seasons; in that mode the label depends on call order.
            Passing ``season_end`` leaves ``count`` untouched.
    """
    global count
    if season_end is None:
        season_end = count
        count -= 1

    # Zero-pad both years so single-digit seasons read "2008-09", not "208-9".
    print(f"\n20{season_end - 1:02d}-{season_end:02d} SEASON")
    print(dataframe.head())
    print(dataframe.tail())
    
# Preview every season, newest first; the order matters because data_display
# derives each header from a counter that steps back one season per call.
for season_frame in (
    data2324,
    data2223,
    data2122,
    data2021,
    data1920,
    data1819,
    data1718,
):
    data_display(season_frame)



2023-24 Season
                         position age team games played games started  \
Precious Achiuwa             PF-C  24  TOR           74            18   
Bam Adebayo                     C  26  MIA           71            71   
Ochai Agbaji                   SG  23  UTA           78            28   
Santi Aldama                   PF  23  MEM           61            35   
Nickeil Alexander-Walker       SG  25  MIN           82            20   

                         minutes played per game field goals  \
Precious Achiuwa                            21.9         3.2   
Bam Adebayo                                 34.0         7.5   
Ochai Agbaji                                21.0         2.3   
Santi Aldama                                26.5         4.0   
Nickeil Alexander-Walker                    23.4         2.9   

                         field goal attempts fg percentage 3pt per game  \
Precious Achiuwa                         6.3          .501          0.4   
Bam Adebay

In [None]:
'''
IGNORE THIS BLOCK 
'''
# adding columns to the dataframe manually 

# Add columns to the DataFrame
# data['Player_Name'] = []
# data['Age'] = []
# data['Position'] = []
# data['Team'] = []
# data['Total_Games'] = []
# data['Games_Started'] = []
# data['Minutes_Per_Game'] = []
# data['FG'] = []
# data['FGA'] = []
# data['FGP'] = []
# data['3P'] = []
# data['3PA'] = []
# data['3PP'] = []
# data['2P'] = []
# data['2PA'] = []
# data['2PP'] = []
# data['eFGP'] = []
# data['FT'] = []
# data['FTA'] = []
# data['FTP'] = []
# data['O_Rebounds'] = []
# data['D_Rebounds'] = []
# data['T_Rebounds'] = []
# data['Assists'] = []
# data['Steals'] = []
# data['Blocks'] = []
# data['Turnovers'] = []
# data['Personal_Fouls'] = []
# data['PPG'] = []

# for player, stats in player_stats.items():
#     # Add rows to the DataFrame
#     new_row = pd.DataFrame({'Player_Name': player, 'Age': stats["age"], 'Position': stats["position"], 'Team': stats["team"], 'Total_Games': stats["games played"],
#                'Games_Started': stats["games started"], 'Minutes_Per_Game': stats["minutes played per game"], 'FG': stats["field goals"], 'FGA': stats["field goal attempts"],
#                'FGP': stats["fg percentage"], '3P': stats["3pt per game"], '3PA': stats["3pt attempts"], '3PP': stats["3pt percentage"],
#                '2P': stats["2pt per game"], '2PA': stats["2pt attempts"], '2PP': stats["2pt percentage"], 'eFGP': stats["effective fg percentage"],
#                'FT': stats["free throws"], 'FTA': stats["free throw attempts"], 'FTP': stats["free throw percentage"], 'O_Rebounds': stats["offensive rebounds"],
#                'D_Rebounds': stats["defensive rebounds"], 'T_Rebounds': stats["total rebounds"], 'Assists': stats["assists"], 'Steals': stats["steals"],
#                'Blocks': stats["blocks"], 'Turnovers': stats["turnovers"], 'Personal_Fouls': stats["personal fouls"], 'PPG': stats["ppg"]
#                })
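#     # NOTE: DataFrame.append was removed in pandas 2.0; on newer versions use
#     # data = pd.concat([data, new_row], ignore_index=True) instead.
#     data = data.append(new_row, ignore_index=True)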