# IBDB Webscraping
This file scrapes publicly available data from the Broadway League's IBDB (Internet Broadway Database) for the purpose of analyzing the importance of Tony Awards outcomes for Broadway productions.

This file borrows code in its first few chunks and takes general inspiration from the following [Colaboratory Jupyter Notebook](https://colab.research.google.com/drive/1IVwOhBMYay14NkO7kGkrPu0Ij9dSDdEP) by Yaakov Bressler.

In [1]:
import io
import requests
from lxml import etree
from bs4 import BeautifulSoup
import urllib.request
import re
import string
#import time
#import json   #might not need commented out ones
import datetime
import pandas as pd
import numpy as np
import urllib
import ast
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

### Create a function that grabs links from a page, using a tag to identify value of link

In [2]:
def getLinks_tagged_fast(url, tag):
    """
    Download the page at `url` and return the href of every <a> whose href
    matches `tag`.

    url: address of the page to download.
    tag: pattern the href must match. It is passed to re.compile, so regex
         metacharacters such as '.' and '?' keep their regex meaning rather
         than matching literally.

    Returns a list of the matching href values.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    # compile the pattern once; only anchors whose href matches it are kept
    href_pattern = re.compile(tag)
    # find_all is the current name of the deprecated findAll alias
    anchors = soup.find_all('a', attrs={'href': href_pattern})
    return [anchor.get('href') for anchor in anchors]

https://www.broadwayworld.com/browseshows.cfm?showtype=BR

The link above is the starting point. It lets us get the name of every Broadway production that opened from 1979 to the present.

In [3]:
def get_show_links_year(year_url):
    """
    Collect the show-page links found on the per-year browse pages for 1979
    onward.

    year_url: address of the browse page that links to the per-year pages.

    Returns one flat list of the show links found on those year pages.
    """
    tag_year = 'browseshows.cfm?'
    page_base = 'https://www.broadwayworld.com/'
    #calling previous function to get the year links; the first match is skipped
    years = getLinks_tagged_fast(year_url, tag_year)[1:]
    #focusing on 1979 or later: keep links whose last four characters are a year >= 1979
    years_loop = [
        page_base + year
        for year in years
        if year[-4:].isdigit() and int(year[-4:]) >= 1979
    ]

    # Now you have all the years; gather the show links from each year's page
    tag_show = 'https://www.broadwayworld.com/shows/backstage.php?'
    show_links = []
    for year_page in years_loop:
        # extend keeps a single flat list and avoids the repeated list copies
        # that sum(nested_lists, []) makes
        show_links.extend(getLinks_tagged_fast(year_page, tag_show))

    return show_links
    
#running function to get my list of links to the productions I want
#year_url is the browse page the function starts from; the call issues one request per year page
year_url = 'https://www.broadwayworld.com/browseshows.cfm?showtype=BR'
show_links = get_show_links_year(year_url)

In [4]:
#don't need every show ever, just 1979 onwards
#from here, you will scrape the name of each show, so you can search for it on ibdb.com using selenium
#print(show_links)
#the number of productions between 1979 and now fits the estimate I made in another assignment (roughly 1800)
len(show_links)

1877

In [5]:
def get_show_name(url):
    """
    Download a show page and return the show's name.

    url: address of the show page.

    Returns the second text result of the xpath
    //span[@itemprop = 'name']/text() on that page.

    Raises AssertionError if the response status is not 200, and IndexError
    if the xpath yields fewer than two results.
    """
    response = requests.get(url)
    #make sure url actually exists; an explicit raise (unlike `assert`) still runs under `python -O`
    if response.status_code != 200:
        raise AssertionError(f"GET {url} returned status {response.status_code}")

    htmlparser = etree.HTMLParser()
    tree = etree.parse(io.StringIO(response.text), parser=htmlparser)
    showroot = tree.getroot()
    #this xml path is the same for every url, it will give you the name of the show for that url
    #index [1] takes the second match
    show_name = showroot.xpath("//span[@itemprop = 'name']/text()")[1]

    return show_name

In [6]:
#testing the function on a random url (tried this with several urls)
#the recorded output for this page was 'Oliver!'
get_show_name('https://www.broadwayworld.com/shows/backstage.php?showid=6366')

'Oliver!'

In [7]:
#using get_show_name function to make a list of every single show I want to find data on
#took about 10 minutes to run this chunk (get_show_name makes one request per link)
show_names = [get_show_name(link) for link in show_links]

len(show_names)

1877

In [14]:
#due to the presence of revivals in the list, some show names appear more than once
#therefore, to reduce redundancy when I search for these shows on the database, I am only keeping unique show names
#dict.fromkeys drops repeats while keeping first-seen order, without rescanning a list for every name
unique_show_names = list(dict.fromkeys(show_names))

#print(unique_show_names)
len(unique_show_names)

1590

In [12]:
#show_names

In [7]:
#the xpath statement that will get me the <a> element (and from it the href) for each production I am interested in
prod_xpath = '//div[@id = "nyc-productions" and @data-id = "nyc-productions"]/div/div/div/div/a'#/@href'

#testing functions from selenium package on three shows before doing every show

#this will be a list of all of the urls that I need
urls_list = []
#NOTE(review): the driver is never quit in this cell -- confirm whether a later cell reuses or closes it
driver = webdriver.Chrome()
productions = ['The King and I', 'Dear Evan Hansen', 'Spring Awakening']
for prod in productions:
    driver.get('https://www.ibdb.com/shows/')     #website I am searching from (IBDB)
    search_box = driver.find_element('name','ShowProperName')    #locating the searchbar
    search_box.send_keys(prod)       #automating the searches (will store results in a list later)
    search_box.submit()
    #search_box.send_keys(Keys.ENTER)  #other way to do above line of code
    #NOTE(review): no explicit wait between submit and lookup -- presumably the results page has loaded by now; confirm
    web_elts = driver.find_elements('xpath', prod_xpath)
    #getting href I need from each selenium WebElement
    results = [elt.get_attribute("href") for elt in web_elts]
    #one search can return several productions (e.g. revivals), so extend rather than append
    urls_list.extend(results)


In [18]:
#show the collected production urls; url[41:] is the part after 'https://www.ibdb.com/broadway-production/'
urls_list#[0][41:]

['https://www.ibdb.com/broadway-production/the-king-and-i-1935',
 'https://www.ibdb.com/broadway-production/the-king-and-i-3999',
 'https://www.ibdb.com/broadway-production/the-king-and-i-4357',
 'https://www.ibdb.com/broadway-production/the-king-and-i-4673',
 'https://www.ibdb.com/broadway-production/the-king-and-i-497593',
 'https://www.ibdb.com/broadway-production/dear-evan-hansen-508238',
 'https://www.ibdb.com/broadway-production/spring-awakening-448811',
 'https://www.ibdb.com/broadway-production/spring-awakening-501403']

In [16]:
#Must do this for every single production I search for, just doing small handful now
#TO DO: these dicts do not have Tony nom/win data, that is somewhere else
# ^ for that, use <div class="col s12">, but that path includes Drama Desk too?

#trying this with BS instead of xpath
#could not get it to work with xpath
prod_data_list = []
for url in urls_list:
    resp = requests.get(url)
    prod_html = resp.text
    soup = BeautifulSoup(prod_html, 'html.parser')
    #finding node that has javascript text with our data
    script = soup.find_all('script', type='text/javascript')[1]   #node at [1] is one with our data
    #making script into a string so I can easily parse through it
    js: str = script.text

    #using a regex to search for the dict we are looking for (i.e. the one that has the data)
    #raw string so that \[ and \] reach the regex engine without invalid-escape warnings
    match = re.search(r'var grossdata = {0:\[.*\] };', js, flags=re.MULTILINE)
    #productions with no financial data (i.e. pre 1979) have no grossdata variable, so skip them
    if match is None:
        continue

    raw_json = match.group(0)
    try:
        #[16:-1] drops the leading 'var grossdata = ' (16 characters) and the trailing ';'
        data = ast.literal_eval(raw_json[16:-1])
    except (ValueError, TypeError, SyntaxError):
        #the matched text is not a plain Python-style literal; skip this production
        continue

    #adding key:value pair to dict to keep track of which production is which
    #url[41:] is the part of the url after 'https://www.ibdb.com/broadway-production/'
    data['production'] = url[41:]
    prod_data_list.append(data)

In [17]:
#NOTE: 2nd __% in each list is the one you want for capacity. The first one represents something else
#from this, you want date, gross (the non-string one), capacity
prod_data_list

[{0: [['Jun 9, 1985',
    '$407,169',
    '76%',
    '9,969',
    '71%',
    'Jun 9',
    407169.0,
    534619.0,
    9969.0,
    14120.0,
    '0',
    '8'],
   ['Jun 16, 1985',
    '$177,896',
    '67%',
    '4,644',
    '66%',
    'Jun 16',
    177896.0,
    267309.5,
    4644.0,
    7060.0,
    '0',
    '4'],
   ['Jun 23, 1985',
    '$318,174',
    '95%',
    '8,025',
    '91%',
    'Jun 23',
    318174.0,
    334136.87,
    8025.0,
    8825.0,
    '0',
    '5'],
   ['Jun 30, 1985',
    '$605,546',
    '113%',
    '13,833',
    '98%',
    'Jun 30',
    605546.0,
    534619.0,
    13833.0,
    14120.0,
    '0',
    '8']],
  1: [['Dec 30, 1984',
    '$482,280',
    '90%',
    '11,832',
    '84%',
    'Dec 30',
    482280.0,
    534619.0,
    11832.0,
    14120.0,
    '8',
    '0'],
   ['Jan 6, 1985',
    '$502,566',
    '94%',
    '13,255',
    '94%',
    'Jan 6',
    502566.0,
    534619.0,
    13255.0,
    14120.0,
    '8',
    '0'],
   ['Jan 13, 1985',
    '$465,216',
    '87%',
  

In [52]:
#TO DO: scrape award info
#this will eventually be in same chunk as previous scraping
#major/minor distinction might be tough to do

#tony_data_list = []
#for url in urls_list:
#trying the award scrape on a single production page before looping over urls_list
test = 'https://www.ibdb.com/broadway-production/the-king-and-i-497593'
resp = requests.get(test)
prod_html = resp.text
soup = BeautifulSoup(prod_html, 'html.parser')
#re-serialising the soup so lxml can run xpath queries on it
tree = etree.HTML(str(soup))
#text of each <h4> under a 'col s11' div -- presumably the award category names; confirm against the page
awards = tree.xpath("//div[@class = 'col s11']/div/div/h4/text()")
#awards_results = tree.xpath("//div[@class = 'col s11']/div/div[position() = 2]/text()")
#every <img> whose src is the award.png icon -- presumably one per win; confirm against the page
awards_results = tree.xpath("//img[@src = '/Images/award.png']")


#might be easier here with xpath
#tree = etree.parse(io.StringIO(prod_html), parser=htmlparser)
#prod_root = tree.getroot()
#this xpath statement...
#awards = prod_root.xpath("//div[@class = 'col s11']/div/div/h4/text()")

In [54]:
#do Tony cats always start with "Best" and Drama Desk with "Outstanding"? Could help you filter if true...
#TO DO: filter out Drama Desk Awards
#TO DO: figure out how to not get all results twice (make xpath more specific?)
print(awards)

#length will say 10, but this prod won 4 Tonys and 1 Drama Desk, must stop doubling of results and filter out DD
#awards_results is the list of award.png <img> matches; the old name awards_wins is not defined anywhere in this notebook
len(awards_results)

['Best Revival of a Musical', 'Best Performance by an Actor in a Leading Role in a Musical', 'Best Performance by an Actress in a Leading Role in a Musical', 'Best Performance by an Actress in a Featured Role in a Musical', 'Best Choreography', 'Best Direction of a Musical', 'Best Scenic Design of a Musical', 'Best Costume Design of a Musical', 'Best Lighting Design of a Musical', 'Outstanding Revival of a Musical or Revue', 'Outstanding Sound Design in a Musical', 'Best Revival of a Musical', 'Best Performance by an Actor in a Leading Role in a Musical', 'Best Performance by an Actress in a Leading Role in a Musical', 'Best Performance by an Actress in a Featured Role in a Musical', 'Best Choreography', 'Best Direction of a Musical', 'Best Scenic Design of a Musical', 'Best Costume Design of a Musical', 'Best Lighting Design of a Musical', 'Outstanding Revival of a Musical or Revue', 'Outstanding Sound Design in a Musical']


10

In [10]:
#need to get "/broadway-production/<name-of-show-number>" for every production
#href is important
#URL = "https://www.ibdb.com/shows/" 