In [1]:
import os

import lxml
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup


In [2]:
url = 'https://www.basketball-reference.com/leagues/NBA_2023_rookies.html#rookies'

# Fetch the rookies page. The timeout keeps the cell from hanging on a stalled connection,
# and raise_for_status() stops here on an HTTP error status rather than letting an error
# page be parsed as if it were the stats table.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Parse the returned HTML with the lxml parser.
soup = BeautifulSoup(page.text, "lxml")


In [3]:
table = soup.find('table', id='rookies')
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message instead of
    # an AttributeError on the next line.
    raise ValueError("No <table id='rookies'> found in the downloaded page")

# The column names are read from the second <tr> of the table.
header_row = table.find_all('tr', limit=2)[1]
# Drop the first header so the list lines up with the data rows, which are later built
# from <td> cells only (presumably the leading column is rendered as a <th> — TODO confirm).
headers = [th.getText() for th in header_row.find_all('th')][1:]
# BBall Reference lists the per-game averages under the same headers as the career totals,
# so rename the last four columns for clarity.
averages = ['MPG', 'PPG', 'RPG', 'APG']
headers[-4:] = averages
print(headers)



['Player', 'Debut', 'Age', 'Yrs', 'G', 'MP', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'ORB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'FG%', '3P%', 'FT%', 'MPG', 'PPG', 'RPG', 'APG']


In [4]:
# Skip the first two <tr> elements, then collect the text of every <td> in each remaining row.
# A row that contains no <td> cells comes through as an empty list.
rows = table.find_all('tr')[2:]
rows_data = [[cell.getText() for cell in row.find_all('td')] for row in rows]




In [5]:
# Build the frame from the scraped rows, labelling the columns with the cleaned-up headers.
nba_rookies = pd.DataFrame(data=rows_data, columns=headers)
# Label the rows 1..N rather than using pandas' default 0-based index.
nba_rookies.index = np.arange(nba_rookies.shape[0]) + 1


In [6]:
# Table rows that contained no <td> cells were scraped as empty lists, so every column is
# missing for them. Dropping all-empty rows removes them wherever they fall, instead of
# relying on hard-coded row labels (previously 21, 22, 43, 44) that shift whenever the
# number of listed rookies changes.
# NOTE(review): assumes those four labels were exactly the all-empty rows — confirm.
nba_rookies = (
    nba_rookies
    .dropna(how='all')
    .drop(columns=['Debut', 'Age'])
)

# Persist the cleaned table without the row index.
nba_rookies.to_csv("nba_rookies.csv", index=False)

In [7]:
# Reload the exported table from disk; read_csv infers numeric dtypes for the stat columns,
# which were scraped as plain text.
df = pd.read_csv(filepath_or_buffer='nba_rookies.csv')

In [8]:
from sqlalchemy import create_engine

In [10]:
# Connection URL for the target Postgres database. Set NBA_ROOKIES_DB_URL to supply a
# different host or real credentials without writing them into the notebook; when it is
# unset, the original localhost URL is used.
db_url = os.environ.get('NBA_ROOKIES_DB_URL', 'postgresql://root:root@localhost:5432/nba_rookies')
engine = create_engine(db_url)

In [11]:
# Smoke-test the connection settings. The `with` block closes the connection on exit, so
# the check does not leave a connection checked out for the rest of the session.
with engine.connect():
    print("Database connection OK")

<sqlalchemy.engine.base.Connection at 0x12346ea70>

In [12]:
# Preview the CREATE TABLE statement pandas generates for this frame against the target engine.
schema_ddl = pd.io.sql.get_schema(df, name='nba_rookie_data', con=engine)
print(schema_ddl)


CREATE TABLE nba_rookie_data (
	"Player" TEXT, 
	"Yrs" BIGINT, 
	"G" BIGINT, 
	"MP" BIGINT, 
	"FG" BIGINT, 
	"FGA" BIGINT, 
	"3P" BIGINT, 
	"3PA" BIGINT, 
	"FT" BIGINT, 
	"FTA" BIGINT, 
	"ORB" BIGINT, 
	"TRB" BIGINT, 
	"AST" BIGINT, 
	"STL" BIGINT, 
	"BLK" BIGINT, 
	"TOV" BIGINT, 
	"PF" BIGINT, 
	"PTS" BIGINT, 
	"FG%%" FLOAT(53), 
	"3P%%" FLOAT(53), 
	"FT%%" FLOAT(53), 
	"MPG" FLOAT(53), 
	"PPG" FLOAT(53), 
	"RPG" FLOAT(53), 
	"APG" FLOAT(53)
)




In [13]:
# Write the frame to Postgres, replacing any existing table of the same name.
# index=False keeps the DataFrame's row index out of the table, so the stored columns match
# the CSV export (also written without the index).
df.to_sql(con=engine, name='nba_rookie_data', if_exists='replace', index=False)

47