# Imports

In [0]:
#imports required to get selenium to work on Colabs
!pip install selenium
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')


In [0]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Selenium
Use selenium to to get page source. Cannot use requests as it is blocked for that method.

In [3]:
# a driver for per_game and advanced stats
driver_per = webdriver.Chrome('chromedriver',chrome_options=chrome_options)
driver_adv = webdriver.Chrome('chromedriver',chrome_options=chrome_options)

  """Entry point for launching an IPython kernel.
  


In [0]:
# Use selenium to to get source for page 
# cannot use requests as it is blocked for that method
# array of html pages sources from Basketball-Reference
per_game_source = []
advanced_source = []
# start and end dates for required years
start_year = 2018
end_year = 2020
years = list(range(start_year,end_year+1))
for i in years:
  driver_per.get('https://www.basketball-reference.com/leagues/NBA_%s_per_game.html' % (i)) 
  driver_adv.get('https://www.basketball-reference.com/leagues/NBA_%s_advanced.html' % (i))
  per_game_source.append(driver_per.page_source)
  advanced_source.append(driver_adv.page_source)
driver_per.close()
driver_adv.close()


# Beautiful Soup
Use bs to sort through the HTML as it is a faster interface to that of selenium 

In [0]:
from bs4 import BeautifulSoup
import pandas as pd

In [0]:
# returns a pandas DataFrame with all players data from a page source
def nba_parse_data(page_source):
  soup = BeautifulSoup(page_source, 'html.parser')
  print(soup.title)
  table = soup.find_all("table")[0]
  tbody = table.find('tbody')
  tr = tbody.find_all('tr')
  dataset = []
  for player in tr:
    data1 = []
    for col in player.find_all('td'):
      data1.append(col.text)
    dataset.append(data1)
  return pd.DataFrame(data=dataset)

In [7]:
# creates the tables for each NBA Player Per Game Stats table
data_per = [] 
for season in per_game_source:
  df = nba_parse_data(season)
  df.columns = ['PLAYER',	'POS'	,'AGE',	'TEAM',	'G'	,'GS',	'MP',	'FG'	
                ,'FGA','FG%',	'3P'	,'3PA',	'3P%',	'2P',	'2PA'	,'2P%','eFG%'
                ,	'FT',	'FTA',	'FT%',	'ORB',	'DRB',	'TRB',	'AST',	'STL'
                ,	'BLK',	'TOV'	,'PF'	,'PTS']
  df = df.drop_duplicates()
  df = df.drop('MP',axis=1)
  data_per.append(df)

# creates the tables for each NBA Player Per Game Advanced Stats table
data_adv = [] 
for season in advanced_source:
  df = nba_parse_data(season)
  df.columns = ['PLAYER',	'POS'	,'AGE',	'TEAM',	'G',	'MP',	'PER',	'TS%'
  ,'3PAr','FTr',	'ORB%',	'DRB%',	'TRB%',	'AST%',	'STL%',	'BLK%',	'TOV%',	'USG%'
  ,	'blank1','OWS',	'DWS',	'WS',	'WS/48'	,'blank2','OBPM',	'DBPM',	'BPM',	'VORP']
  df = df.drop_duplicates()
  df = df.drop(['blank1','blank2'],axis=1)
  data_adv.append(df)

<title>2017-18 NBA Player Stats: Per Game | Basketball-Reference.com</title>
<title>2018-19 NBA Player Stats: Per Game | Basketball-Reference.com</title>
<title>2019-20 NBA Player Stats: Per Game | Basketball-Reference.com</title>
<title>2017-18 NBA Player Stats: Advanced | Basketball-Reference.com</title>
<title>2018-19 NBA Player Stats: Advanced | Basketball-Reference.com</title>
<title>2019-20 NBA Player Stats: Advanced | Basketball-Reference.com</title>


In [0]:
# Merges the Per Game and Advanced stats for each year
frames = []
for i in range(len(years)):
  adv = data_adv[i]
  per = data_per[i]
  table = pd.merge(adv,per,on=['PLAYER',	'POS',	'AGE',	'TEAM','G'])
  # insert a YEAR column
  table.insert(loc=1,column='YEAR',value=years[i])
  frames.append(table)

In [0]:
data = pd.concat(frames)

In [22]:
data

Unnamed: 0,PLAYER,YEAR,POS,AGE,TEAM,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,GS,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Álex Abrines,2018,SG,24,OKC,75,1134,9.0,.567,.759,.158,2.5,8.9,5.6,3.4,1.7,0.6,7.4,12.7,1.3,1.0,2.2,.094,-1.9,0.4,-1.5,0.1,8,1.5,3.9,.395,1.1,2.9,.380,0.4,0.9,.443,.540,0.5,0.6,.848,0.3,1.2,1.5,0.4,0.5,0.1,0.3,1.7,4.7
1,Quincy Acy,2018,PF,27,BRK,70,1359,8.2,.525,.800,.164,3.1,17.1,10.0,6.0,1.2,1.6,13.3,14.4,-0.1,1.1,1.0,.036,-2.6,0.1,-2.5,-0.2,8,1.9,5.2,.356,1.5,4.2,.349,0.4,1.0,.384,.496,0.7,0.9,.817,0.6,3.1,3.7,0.8,0.5,0.4,0.9,2.1,5.9
2,Steven Adams,2018,C,24,OKC,76,2487,20.6,.630,.003,.402,16.6,13.9,15.3,5.5,1.8,2.8,13.3,16.7,6.7,3.0,9.7,.187,1.7,-0.6,1.1,2.0,76,5.9,9.4,.629,0.0,0.0,.000,5.9,9.3,.631,.629,2.1,3.8,.559,5.1,4.0,9.0,1.2,1.2,1.0,1.7,2.8,13.9
3,Bam Adebayo,2018,C,20,MIA,69,1368,15.7,.570,.021,.526,9.7,21.6,15.6,11.0,1.2,2.5,13.6,15.9,2.3,1.9,4.2,.148,-1.1,0.7,-0.4,0.6,19,2.5,4.9,.512,0.0,0.1,.000,2.5,4.8,.523,.512,1.9,2.6,.721,1.7,3.8,5.5,1.5,0.5,0.6,1.0,2.0,6.9
4,Arron Afflalo,2018,SG,32,ORL,53,682,5.8,.516,.432,.160,0.6,10.1,5.3,6.2,0.3,1.1,10.8,12.5,-0.1,0.2,0.1,.009,-3.8,-1.5,-5.4,-0.6,3,1.2,3.1,.401,0.5,1.3,.386,0.7,1.7,.413,.485,0.4,0.5,.846,0.1,1.2,1.2,0.6,0.1,0.2,0.4,1.1,3.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620,Thaddeus Young,2020,PF,31,CHI,64,1591,13.3,.521,.369,.120,6.2,16.1,10.9,11.3,2.8,1.4,13.9,19.7,-0.2,2.1,1.9,.058,-1.4,0.2,-1.1,0.4,16,4.2,9.4,.448,1.2,3.5,.356,3.0,5.9,.501,.513,0.7,1.1,.583,1.5,3.5,4.9,1.8,1.4,0.4,1.6,2.1,10.3
621,Trae Young,2020,PG,21,ATL,60,2120,23.9,.595,.455,.448,1.6,11.5,6.5,45.6,1.4,0.3,16.2,34.9,5.4,0.5,5.9,.134,6.3,-2.3,4.0,3.2,60,9.1,20.8,.437,3.4,9.5,.361,5.7,11.4,.501,.519,8.0,9.3,.860,0.5,3.7,4.3,9.3,1.1,0.1,4.8,1.7,29.6
622,Cody Zeller,2020,C,27,CHO,58,1341,18.8,.576,.157,.374,12.6,21.2,16.7,11.3,1.5,1.7,11.9,20.8,2.4,1.2,3.6,.129,0.2,-0.8,-0.6,0.5,39,4.3,8.3,.524,0.3,1.3,.240,4.0,7.0,.577,.543,2.1,3.1,.682,2.8,4.3,7.1,1.5,0.7,0.4,1.3,2.4,11.1
623,Ante Žižić,2020,C,23,CLE,22,221,16.4,.597,.000,.264,9.0,24.4,16.6,4.2,1.5,1.9,11.1,17.5,0.3,0.2,0.5,.106,-1.7,-1.5,-3.2,-0.1,0,1.9,3.3,.569,0.0,0.0,,1.9,3.3,.569,.569,0.6,0.9,.737,0.8,2.2,3.0,0.3,0.3,0.2,0.5,1.2,4.4


In [0]:
# #save DATA to csv
data.to_csv('season_stats_2018-2020.csv')
# !cp vote_mvp.csv "drive/My Drive/"