In [1]:
import os

import lxml
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup


In [2]:
# Source page: NBA 2022-23 per-game player statistics on Basketball Reference.
url = 'https://www.basketball-reference.com/leagues/NBA_2023_per_game.html'

# A timeout keeps a stalled connection from hanging the kernel indefinitely.
page = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx responses instead of parsing an error page as if it
# were the stats table.
page.raise_for_status()

# Parse the downloaded HTML with the lxml parser.
soup = BeautifulSoup(page.text, "lxml")


In [17]:
# Locate the per-game stats table; stop with a clear message if the page layout
# changed or the download did not contain it.
table = soup.find('table', id='per_game_stats')
if table is None:
    raise ValueError("Table with id 'per_game_stats' was not found in the downloaded page")

# Column titles come from the <th> cells of the table's first row. The first
# title is dropped so the headers line up with the row values, which are read
# from <td> cells only and start at the Player column.
header_cells = table.find_all('tr', limit=2)[0].find_all('th')
headers = [th.getText() for th in header_cells][1:]

#Add underscores to strings that start with a number as those strings are not compatible with BigQuery
# header[:1] (rather than header[0]) keeps a blank header cell from raising IndexError.
headers = ['_' + header if header[:1].isnumeric() else header for header in headers]
print(headers)



['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '_3P', '_3PA', '_3P%', '_2P', '_2PA', '_2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']


In [22]:
# Every <tr> after the first one (the first row holds the column titles).
rows = table.find_all('tr')[1:]

# Collect the text of each row's <td> cells and drop rows that yield no <td>
# values at all.
rows_data = [
    cells
    for cells in ([td.getText() for td in row.find_all('td')] for row in rows)
    if cells
]

# Spot-check a slice of the parsed rows.
print(rows_data[19:26])



[['Mo Bamba', 'C', '24', 'ORL', '7', '0', '13.4', '2.1', '4.9', '.441', '0.7', '2.6', '.278', '1.4', '2.3', '.625', '.515', '0.4', '0.7', '.600', '0.4', '2.7', '3.1', '0.9', '0.1', '0.6', '0.6', '2.1', '5.4'], ['Paolo Banchero', 'PF', '20', 'ORL', '7', '7', '33.1', '7.6', '17.3', '.438', '1.3', '4.4', '.290', '6.3', '12.9', '.489', '.475', '6.3', '8.6', '.733', '1.1', '6.4', '7.6', '3.9', '0.6', '1.1', '2.6', '2.4', '22.7'], ['Desmond Bane', 'SG', '24', 'MEM', '6', '6', '33.0', '8.2', '18.5', '.441', '4.2', '9.3', '.446', '4.0', '9.2', '.436', '.554', '3.7', '4.3', '.846', '0.5', '4.2', '4.7', '5.0', '0.8', '0.5', '2.2', '2.7', '24.2'], ['Dalano Banton', 'PG', '23', 'TOR', '7', '0', '7.7', '1.1', '2.6', '.444', '0.3', '1.1', '.250', '0.9', '1.4', '.600', '.500', '0.3', '0.3', '1.000', '0.3', '0.7', '1.0', '0.6', '0.3', '0.3', '0.3', '0.6', '2.9'], ['Harrison Barnes', 'PF', '30', 'SAC', '6', '6', '32.7', '3.3', '8.3', '.400', '0.3', '3.0', '.111', '3.0', '5.3', '.563', '.420', '4.3', '5

In [25]:
# Build the player frame from the parsed rows, labelling the rows 1..N
# instead of pandas' default 0-based index.
nba_players = pd.DataFrame(
    data=rows_data,
    columns=headers,
    index=np.arange(1, len(rows_data) + 1),
)


In [26]:


# Persist the scraped table to disk; the row labels are not written to the file.
nba_players.to_csv(path_or_buf="nba_players_per_game.csv", index=False)

In [7]:
# Reload the saved CSV. read_csv infers column dtypes, whereas the scraped
# frame holds only the text extracted from the HTML cells.
df = pd.read_csv(filepath_or_buffer='nba_players_per_game.csv')

In [8]:
from sqlalchemy import create_engine

In [10]:
# Read the connection URL from the environment so real credentials never have
# to be committed with the notebook. The fallback is the original local
# development database, so existing setups keep working unchanged.
database_url = os.environ.get(
    'NBA_DATABASE_URL',
    'postgresql://root:root@localhost:5432/nba_rookies',
)
engine = create_engine(database_url)

In [11]:
# Connectivity check only: open a connection to confirm the database is
# reachable, then let the context manager close it. The Connection returned by
# a bare engine.connect() was never closed.
with engine.connect() as connection:
    print(connection)

<sqlalchemy.engine.base.Connection at 0x12346ea70>

In [12]:
# Preview the CREATE TABLE statement pandas would generate for this frame on
# the target database.
# NOTE(review): the output saved with this cell lists columns such as Yrs, MPG
# and PPG, which are not among the scraped per-game headers -- it looks like it
# came from an earlier run against a different file. Re-run to refresh it.
schema_ddl = pd.io.sql.get_schema(df, name='nba_rookie_data', con=engine)
print(schema_ddl)


CREATE TABLE nba_rookie_data (
	"Player" TEXT, 
	"Yrs" BIGINT, 
	"G" BIGINT, 
	"MP" BIGINT, 
	"FG" BIGINT, 
	"FGA" BIGINT, 
	"3P" BIGINT, 
	"3PA" BIGINT, 
	"FT" BIGINT, 
	"FTA" BIGINT, 
	"ORB" BIGINT, 
	"TRB" BIGINT, 
	"AST" BIGINT, 
	"STL" BIGINT, 
	"BLK" BIGINT, 
	"TOV" BIGINT, 
	"PF" BIGINT, 
	"PTS" BIGINT, 
	"FG%%" FLOAT(53), 
	"3P%%" FLOAT(53), 
	"FT%%" FLOAT(53), 
	"MPG" FLOAT(53), 
	"PPG" FLOAT(53), 
	"RPG" FLOAT(53), 
	"APG" FLOAT(53)
)




In [13]:
# Load the frame into the database, replacing the table if it already exists.
# index=False keeps pandas' positional row index out of the table; with the
# default (index=True) an extra "index" column is written that appears neither
# in the get_schema preview nor in the CSV, which was saved without an index.
df.to_sql(con=engine, name='nba_rookie_data', if_exists='replace', index=False)

47