#### Collection [Selenium]

Below is the Selenium code used to retrieve our listings.
Here we are going to extract the basketball seasons from the [NBA Site](https://www.nba.com/stats/teams/boxscores-traditional)


In [None]:
# Importing all necessary packages
import re
import time
from io import StringIO

import numpy as np
import pandas as pd
from openpyxl import Workbook, load_workbook
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException as ECIE, NoSuchElementException as NSEE, StaleElementReferenceException as SERE, TimeoutException as TE, ElementNotInteractableException as ENIE
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

from buckets import dimes

##### Table Collection & Organization


In [None]:
# Launch Chrome through Selenium and open the NBA team box-score page.

url = "https://www.nba.com/stats/teams/boxscores-traditional"
driver = webdriver.Chrome()
driver.get(url)


In [None]:
# Write an empty workbook to disk; later cells open this file in append mode
# and add one sheet per season.
dfs = []  # one DataFrame per season, concatenated after the scrape
wb = Workbook()
ws = wb.active
wb.save('nba_box_scores.xlsx')

In [None]:
# Build an {abbreviation: definition} dictionary from the on-page NBA glossary.
driver.execute_script("window.scrollTo(0,300)")

# Hand the wait a locator rather than an already-found element, so the wait
# also covers the time before the button exists in the DOM.
glossary_bttn = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.CLASS_NAME, "Crom_cromGlossary__ZHrZf"))
)
glossary_bttn.click()  # View Glossary

abbrs = driver.find_elements(By.CLASS_NAME, "StatsTableGlossary_dt__GPooh")  # Abbr
defs = driver.find_elements(By.CLASS_NAME, "StatsTableGlossary_dd__zcr38")  # defs

# Pair each term with the definition at the same position; zip stops at the
# shorter list instead of raising IndexError when the two lists differ in length.
abbr_list = {term.text: meaning.text for term, meaning in zip(abbrs, defs)}

glossary_bttn.click()  # Close Glossary


In [None]:
# Collect the label of every option in the season dropdown, and fetch the
# XPath of the stats table from the `dimes` lookup.
xpath_table = dimes.get('XPATH_TABLE')

season_drpdwn = Select(driver.find_element(By.CLASS_NAME, "DropDown_select__4pIg9"))
season_options = season_drpdwn.options
seasons_arr = [option.text for option in season_options]

In [None]:
# Force the season-type dropdown to 'Regular Season' before any table is read.
ssn_type_drpdwn = driver.find_element(
    By.XPATH,
    '//*[@id="__next"]/div[2]/div[2]/div[3]/section[1]/div/div/div[2]/label/div',
)
ssn_type_element = driver.find_element(
    By.XPATH,
    '//*[@id="__next"]/div[2]/div[2]/div[3]/section[1]/div/div/div[2]/label/div/select',
)
ssn_type = Select(ssn_type_element)
ssn_type.select_by_visible_text("Regular Season")


In [None]:
# Extract, manipulate and insert each season's table into Excel

def assign_names(matches, teams, sn, df):
    """Add full team/opponent names and the season label to a box-score table.

    Every match-up string is split on its ``vs. `` / `` @`` separator: the
    part before it is the team abbreviation, the part after it the opponent
    abbreviation. Both are looked up in ``teams``.

    Args:
        matches: Iterable of match-up strings, one per row of ``df``.
        teams: Mapping of team abbreviation -> full team name.
        sn: Season label written to every row of the ``SEASON`` column.
        df: Table to extend; it is modified in place.

    Returns:
        ``df`` itself, with ``TEAM``, ``OPP ABBR``, ``OPPONENT`` and ``SEASON``
        inserted at column positions 1 to 4.

    Raises:
        ValueError: If a match-up contains no separator, or if any
            abbreviation is missing from ``teams``. The check runs before
            anything is inserted, so ``df`` is left untouched.
    """
    print(f'Made it in for Season: {sn} with shape {df.shape}')

    separator = re.compile(r'vs. | @')  # compiled once instead of twice per row
    team_names = []
    opp_names = []
    opp_abbrs = []
    unknown = set()

    # Assign names to the team and opponent lists respectively
    for match in matches:
        parts = separator.split(match)
        if len(parts) < 2:
            raise ValueError(
                f'Cannot split match-up {match!r} into team and opponent (season {sn})'
            )
        team_abbr = parts[0].rstrip()
        opp_abbr = parts[1].lstrip()

        unknown.update(abbr for abbr in (team_abbr, opp_abbr) if abbr not in teams)

        # Always append so the three lists stay aligned with the rows of df.
        team_names.append(teams.get(team_abbr))
        opp_abbrs.append(opp_abbr)
        opp_names.append(teams.get(opp_abbr))

    if unknown:
        # Fail before the first insert; otherwise df could end up with only
        # some of the new columns.
        raise ValueError(
            f'No team name known for abbreviation(s) {sorted(unknown)} in season {sn}'
        )

    df.insert(1, 'TEAM', team_names)
    df.insert(2, 'OPP ABBR', opp_abbrs)
    df.insert(3, 'OPPONENT', opp_names)
    df.insert(4, "SEASON", sn)

    return df


In [None]:
# Driver Code
#
# For every season in `seasons_arr`: select the season, show all rows, read
# the stats table, resolve any team abbreviation not seen before to its full
# name, then append the enriched table to `dfs` and to a sheet of the workbook.

xpath_pages = dimes.get('XPATH_PAGES')
teams = {}  # team abbreviation -> full team name, accumulated across seasons

# Selenium errors handled by retrying, with the message logged for each.
_RECOVERY_LOG = {
    ECIE: 'Entered ECIE: Element Click Intercepted Exception',  # overlay may be blocking the element
    NSEE: 'Entered NSSE: No Such Element Exception',
    TE: 'Entered TE: Timeout Exception',
    SERE: 'Entered SERE: Stale Element Reference Exception',
    ENIE: 'Entered ENIE: Element Not Interactable Exception',
}
_RECOVERABLE = tuple(_RECOVERY_LOG)


def _dismiss_overlays(timeout=10):
    """Close the ad pop-up and then the cookie banner, if they show up."""
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'bx-close-xsvg'))
        ).click()
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(
                (By.XPATH, '/html/body/div[3]/div[2]/div/div[1]/div/div[2]/div/button')
            )
        ).click()
    except _RECOVERABLE:
        # Best effort: when no overlay appears within the timeout there is
        # nothing to close and the scrape simply carries on.
        pass


def _show_all_rows(settle=2):
    """Set the pagination dropdown to "All"."""
    # Presence probe: raises NSEE when the pagination control is not on the page.
    driver.find_element(By.CLASS_NAME, "Pagination_pageDropdown__KgjBU")
    time.sleep(settle)
    Select(driver.find_element(By.XPATH, xpath_pages)).select_by_visible_text("All")


def _load_season(sn):
    """Select season `sn` in the season dropdown and display all of its rows."""
    season_drpdwn = Select(driver.find_element(By.CLASS_NAME, "DropDown_select__4pIg9"))
    season_drpdwn.select_by_visible_text(sn)
    print(sn)
    _show_all_rows()
    time.sleep(2)  # give the table time to re-render before it is read


def _read_team_name(team, link_timeout=None):
    """Follow the link whose text is `team`, read the team-page header, go back."""
    locator = (By.LINK_TEXT, team)
    if link_timeout is None:
        link = driver.find_element(*locator)
    else:
        link = WebDriverWait(driver, link_timeout).until(
            EC.presence_of_element_located(locator)
        )
    link.click()
    # Wait for the header so a slow page load is not mistaken for a missing element.
    header = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'TeamHeader_name__MmHlP'))
    )
    name = header.text.replace('\n', '')
    driver.execute_script("window.history.go(-1)")  # go back a page
    time.sleep(2)
    return name


def _resolve_team_name(team):
    """Return the full name for `team`, retrying once after a recoverable error."""
    if team == 'ORL':
        return 'ORLANDO MAGIC'  # hard-coded instead of being looked up on the site
    try:
        return _read_team_name(team, link_timeout=2)
    except _RECOVERABLE as err:
        print(next(msg for exc, msg in _RECOVERY_LOG.items() if isinstance(err, exc)))
        print(f'Trying to capture {team} again')
        if not isinstance(err, SERE):
            # A stale reference needs no overlay handling; skipping it avoids
            # waiting for a pop-up that is not there.
            _dismiss_overlays()
        _show_all_rows(settle=1.5)
        # Second and last attempt: a failure here propagates to the caller.
        return _read_team_name(team)


for sn in seasons_arr:

    _dismiss_overlays()
    try:
        _load_season(sn)
    except _RECOVERABLE:
        # One retry, in case the dropdowns were stale or covered on the first attempt.
        _load_season(sn)

    table = WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.XPATH, xpath_table))
    ).get_attribute("outerHTML")
    # read_html expects a file-like object; passing literal HTML is deprecated.
    sub_df = pd.read_html(StringIO(table))[0]
    sub_df.dropna(how='all', axis=1, inplace=True)
    sub_df = sub_df.rename(columns={'TEAM': 'TEAM ABBR'})
    all_teams = sub_df['TEAM ABBR'].unique()

    matches = sub_df['MATCH UP']

    print(f'Number of Teams this Season: {len(all_teams)}')

    # Resolve only the abbreviations not already known from earlier seasons.
    missing_teams = [t for t in all_teams if t not in teams]
    if missing_teams:
        print(missing_teams)
    for team in missing_teams:
        teams[team] = _resolve_team_name(team)
        print(f'{team} Cleared!')

    new_df = assign_names(matches, teams, sn, sub_df)
    dfs.append(new_df)

    # Append each Season's table into Excel file
    with pd.ExcelWriter(path='nba_box_scores.xlsx', engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
        new_df.to_excel(writer, sheet_name=sn, index=False, header=True)
        


In [None]:
# Stack every season's table into one DataFrame with a fresh running index.
df = pd.concat(dfs, ignore_index=True)

##### REGION url


In [None]:
# Selenium part: open the teams page and collect one element per division block.

url2 = 'https://www.nba.com/stats/teams'
driver.get(url2)

# Wait until the division blocks are in the DOM; reading immediately after
# navigation can return an empty list.
region_list = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, 'StatsTeamsList_division__ZUezr'))
)

In [None]:
# Assigning to REGION df
# The first text line of each element becomes a column header; the remaining
# lines become that column's values.
region_headers = []
group = []

# Region: Atlantic, Central, Southeast, Northwest, Pacific, Southwest
for region in region_list:
    header, *members = region.text.split('\n')
    region_headers.append(header)
    group.append(members)

# Transpose so each region is a column, then turn every name into CAPS
group = np.char.upper(np.array(group).T)

region_df = pd.DataFrame(data=group, columns=region_headers)

region_df.to_csv('nba_regions.csv', index=False)

##### CONFERENCE url


In [None]:
# Selenium part: open the standings page and collect the conference tables.

url3 = 'https://www.nba.com/standings'
driver.get(url3)

# Wait until the conference tables are in the DOM; reading immediately after
# navigation can return an empty list.
confs = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, 'Crom_base__f0niE'))
)
division = driver.find_elements(By.CLASS_NAME, 'Crom_caption__Yv_rH')

In [None]:
# Assigning to CONF. df
dvsn = []         # column headers: first text line of each element, in CAPS
conf_groups = []  # one list of team names per element

for conf in confs:  # East | West Confs
    lists = re.split(r'\n', conf.text)
    dvsn.append(lists[0].upper())

    # A line holding only a 1-2 digit number (presumably the standings rank --
    # TODO confirm) marks a team row: the two lines after it are concatenated
    # and upper-cased to form the team name.
    c_names = [
        ''.join([lists[idx + 1], lists[idx + 2]]).upper()
        for idx, each in enumerate(lists)
        if re.search(r'^\d{1,2}$', each) is not None
    ]
    conf_groups.append(c_names)

# Transpose so each element's names form one column
conf_groups = np.array(conf_groups).T

conf_df = pd.DataFrame(data=conf_groups, columns=dvsn)

conf_df.to_csv('nba_standing.csv', index=False)

In [None]:
# Append Region and Conference to established DataFrame

# Reverse lookups built once: team name -> conference label / region header.
conference_of = {
    team: col.split(" ")[0]  # Apply 'EASTERN' | 'WESTERN'
    for col in conf_df
    for team in conf_df[col].values
}
region_of = {
    team: col
    for col in region_df
    for team in region_df[col].values
}

# Exactly one entry per row of df. A team found in neither table gets None, so
# the lists always match the length of df and the column assignments below
# cannot fail with a length mismatch.
sides = [conference_of.get(team) for team in df['TEAM']]  # Conference list
areas = [region_of.get(team) for team in df['TEAM']]  # Region list

df['CONFERENCE'] = sides

# Special case, intentionally left disabled: marking NEW ORLEANS PELICANS rows
# with SEASON < 2004-05 as EASTERN would give NOP two separate conferences and
# make the conferences unbalanced, so they are kept as WESTERN.
# df.loc[(df['SEASON'] < '2004-05') & (df['TEAM'] == 'NEW ORLEANS PELICANS'), 'CONFERENCE'] = 'EASTERN'

df['REGION'] = areas

In [None]:
# Export df to Files

# The combined table goes into the workbook's 'Sheet' sheet, replacing that
# sheet if it already exists, and into a CSV file.
with pd.ExcelWriter(
    path='nba_box_scores.xlsx',
    engine='openpyxl',
    mode='a',
    if_sheet_exists='replace',
) as xlsx_writer:
    df.to_excel(xlsx_writer, sheet_name='Sheet', index=False, header=True)

df.to_csv('nba_box_scores.csv', index=False)

In [None]:

# Close all opened files and systems
wb.close()
# quit() ends the whole WebDriver session (all windows and the driver
# process); close() would only close the current window.
driver.quit()