# Arena Scraper

## Import Deps

In [1]:
%config IPCompleter.greedy=True

# https://docs.python.org/3/library/datetime.html
import datetime as dt

# https://docs.python.org/3/howto/regex.html
import re

# https://beautiful-soup-4.readthedocs.io/en/latest/
from bs4 import BeautifulSoup as bs

# https://splinter.readthedocs.io/en/latest/#
from splinter import Browser

## Using Splinter to Download HTML

But first you want to check when the last time the splinter code was last ran.

In [2]:
import pathlib
from os import scandir


def convert_date(timestamp):
    d = dt.datetime.utcfromtimestamp(timestamp)
    formated_date = d.strftime("%d %b %Y")
    return formated_date


file = pathlib.Path("arena.html")
if file.exists():
    info = file.stat()
    print(f"File already exists and last updated on {convert_date(info.st_mtime)}")
else:
    # Using Splinter to automate the browser actions
    # Initiate headless driver for deployment
    with Browser("chrome", executable_path="chromedriver", headless=True) as browser:
        # Visit URL
        url = "https://en.wikipedia.org/wiki/List_of_National_Basketball_Association_arenas"
        browser.visit(url)

        list_of_arena_a_tags = browser.find_by_css("a[class='image']")
        print(list_of_arena_a_tags.first.value)

        # I know the current URL I've passed to visit
        # is the site that holds all the logos I am looking for
        # so I just save the site and start working with bs4

        # Scrape page into soup
        html = browser.html

        # Save HTML to local machine so we don't continue to requests
        with open("arena.html", "w", encoding="utf8") as f:
            f.write(html)

# browser = Browser("chrome", executable_path="chromedriver", headless=True)




In [3]:
# current HTMl file
# opened_html = ""

# Read in local HTML page
with open("arena.html", "r", encoding="UTF-8", errors="strict") as f:
    opened_html = f.read()

    # Scrape page into soup
    soup = bs(opened_html, "html.parser")

    # Print out the html structure to plan out the scraping logic
    # print(soup.prettify())

    # I want to find all the table tags and filter down to a specfic table with arena data
    list_of_table_tags = soup.find_all("table")
    wanted_table = list_of_table_tags[0]
    list_of_arena_images = wanted_table.find_all("a", {"class": "image"})

    clean_url_list_bs4 = []
    for i in list_of_arena_images:
        temp = i.get("href")
        clean_url_list_bs4.append(temp)

## Using Pandas

In [1]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_html.html
import pandas as pd

areans_df = pd.read_html(
    "https://en.wikipedia.org/wiki/List_of_National_Basketball_Association_arenas",
    match="Arena",
)
print(len(areans_df))

10


In [4]:
print(areans_df)

[    Image                       Arena                    Location  \
0     NaN             Amalie Arena[a]              Tampa, Florida   
1     NaN     American Airlines Arena              Miami, Florida   
2     NaN    American Airlines Center               Dallas, Texas   
3     NaN                Amway Center            Orlando, Florida   
4     NaN                 AT&T Center          San Antonio, Texas   
5     NaN                  Ball Arena            Denver, Colorado   
6     NaN     Bankers Life Fieldhouse       Indianapolis, Indiana   
7     NaN             Barclays Center          Brooklyn, New York   
8     NaN           Capital One Arena            Washington, D.C.   
9     NaN                Chase Center   San Francisco, California   
10    NaN     Chesapeake Energy Arena     Oklahoma City, Oklahoma   
11    NaN                  FedExForum          Memphis, Tennessee   
12    NaN                Fiserv Forum        Milwaukee, Wisconsin   
13    NaN             Golden 1 Ce

In [9]:
import time

# Initiate headless driver for deployment
browser = Browser("chrome", executable_path="chromedriver", headless=True)

# Visit URL
url = "https://en.wikipedia.org/wiki/List_of_National_Basketball_Association_arenas"
browser.visit(url)

list_of_arena_a_tags = browser.find_by_css("a[class='image']")

original_url_list_splinter = []
for i in list_of_arena_a_tags:
    original_url_list_splinter.append(i["href"])

clean_url_list_splinter = []
for x in original_url_list_splinter:
    # Find the Original file link and click that
    browser.visit(x)
    browser.is_element_present_by_text("Original file", wait_time=5)
    more_info_elem = browser.find_by_css("a[class='internal']")
    clean_url_list_splinter.append(more_info_elem["href"])

# Stop webdriver and return data
browser.quit()

print(clean_url_list_splinter)

['https://upload.wikimedia.org/wikipedia/commons/5/58/Chase_Center_-_East_Side_-_San_Francisco.jpg', 'https://upload.wikimedia.org/wikipedia/commons/2/2a/St_Pete_Times_Forum_At_Sunset.jpg', 'https://upload.wikimedia.org/wikipedia/commons/4/44/American_Airlines_Arena%2C_Miami%2C_FL%2C_jjron_29.03.2012.jpg', 'https://upload.wikimedia.org/wikipedia/commons/b/bc/American_Airlines_Center_August_2015.jpg', 'https://upload.wikimedia.org/wikipedia/commons/9/9f/Amway_Center.jpg', 'https://upload.wikimedia.org/wikipedia/commons/5/50/Texasdd.JPG', 'https://upload.wikimedia.org/wikipedia/commons/d/d4/Denver_Pepsi_Center_1.jpg', 'https://upload.wikimedia.org/wikipedia/commons/6/67/Bankers_Life_Fieldhouse%2C_Indian%C3%A1polis%2C_Estados_Unidos%2C_2012-10-22%2C_DD_02.jpg', 'https://upload.wikimedia.org/wikipedia/commons/5/55/Barclays_Center_western_side.jpg', 'https://upload.wikimedia.org/wikipedia/commons/7/7f/Verizon_Center_wide.jpg', 'https://upload.wikimedia.org/wikipedia/commons/5/58/Chase_Cente

## Download Images with Request

In [14]:
## Importing Necessary Modules
# https://requests.readthedocs.io/en/master/
# https://docs.python.org/3/library/shutil.html#module-shutil
import shutil  # to save it locally

import requests  # to get image from the web

counter = 1
# Looping through the img_url_list and downloading
for i in clean_url_list_splinter:
    counter = counter + 1
    response = requests.get(i, stream=True)
    if response.status_code == 200:
        response.raw.decode_content = True

        with open(f"{str(counter)}.jpg", "wb") as f:
            shutil.copyfileobj(response.raw, f)
        
        print(f"Image sucessfully Downloaded: {str(counter)}")
    
    else:
        print("Image Couldn't be retreived")


Image sucessfully Downloaded: 2
Image sucessfully Downloaded: 3
Image sucessfully Downloaded: 4
Image sucessfully Downloaded: 5
Image sucessfully Downloaded: 6
Image sucessfully Downloaded: 7
Image sucessfully Downloaded: 8
Image sucessfully Downloaded: 9
Image sucessfully Downloaded: 10
Image sucessfully Downloaded: 11
Image sucessfully Downloaded: 12
Image sucessfully Downloaded: 13
Image sucessfully Downloaded: 14
Image sucessfully Downloaded: 15
Image Couldn't be retreived
Image Couldn't be retreived
Image Couldn't be retreived
Image Couldn't be retreived
Image sucessfully Downloaded: 20
Image Couldn't be retreived
Image Couldn't be retreived
Image Couldn't be retreived
Image sucessfully Downloaded: 24
Image sucessfully Downloaded: 25
Image sucessfully Downloaded: 26
Image Couldn't be retreived
Image Couldn't be retreived
Image sucessfully Downloaded: 29
Image Couldn't be retreived
Image Couldn't be retreived
Image sucessfully Downloaded: 32
Image sucessfully Downloaded: 33


[]
