# IBDB Webscraping
This file scrapes publicly available data from the Broadway League's IBDB (Internet Broadway Database) for the purpose of analyzing the importance of Tony Awards outcomes for Broadway productions.

This file borrows code in its first few chunks and takes general inspiration from the following [Colaboratory Jupyter Notebook](https://colab.research.google.com/drive/1IVwOhBMYay14NkO7kGkrPu0Ij9dSDdEP) by Yaakov Bressler.

In [1]:
import ast
import datetime
import io
import os
import re
import string
import urllib
import urllib.request
#import time
#import json   #might not need commented out ones

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

### Create a function that grabs links from a page, keeping only the links whose href matches a given tag

In [2]:
def getLinks_tagged_fast(url, tag):
    """
    Fetch a page and return the href of every anchor whose href matches `tag`.

    Parameters
    ----------
    url : str
        Address of the page to download.
    tag : str
        Pattern looked for inside each link's href. It is compiled with
        `re.compile`, so regex metacharacters such as `.` and `?` keep their
        regex meaning rather than being matched literally.

    Returns
    -------
    list of str
        The href values of the matching anchors.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not respond within 30 seconds.
    """
    # a timeout keeps one stalled connection from hanging the whole scrape
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'html.parser')
    href_pattern = re.compile(tag)
    # find_all is the current name of the deprecated findAll alias
    return [link.get('href') for link in soup.find_all('a', attrs={'href': href_pattern})]

https://www.broadwayworld.com/browseshows.cfm?showtype=BR

The above link is our starting point. It will allow us to get the name of every Broadway production that opened between 1979 and now.

In [3]:
def get_show_links_year(year_url):
    """
    Collect the link to every show page listed on the 1979-and-later year pages.

    Parameters
    ----------
    year_url : str
        Page whose 'browseshows.cfm?' links lead to the per-year listings.

    Returns
    -------
    list of str
        Every 'backstage.php?' show link found on the selected year pages,
        in the order the year pages were visited (duplicates are kept).
    """
    tag_year = 'browseshows.cfm?'
    #calling previous function to get the links I want (the first match is skipped)
    years = getLinks_tagged_fast(year_url, tag_year)[1:]
    page_base = 'https://www.broadwayworld.com/'
    #focusing on 1979 or later; the year is read from the last four characters of each link
    years_loop = [page_base + year for year in years
                  if year[-4:].isdigit() and int(year[-4:]) >= 1979]

    # Now you have all the years
    tag_show = 'https://www.broadwayworld.com/shows/backstage.php?'
    show_links = []
    for year in years_loop:
        #extend in place: sum(list_of_lists, []) re-copies the growing list for every year page
        show_links.extend(getLinks_tagged_fast(year, tag_show))

    return show_links
    
#running function to get my list of links to the productions I want
#(this requests the starting page and then every year page from 1979 onwards)
year_url = 'https://www.broadwayworld.com/browseshows.cfm?showtype=BR'
show_links = get_show_links_year(year_url)

In [4]:
#don't need every show ever, just 1979 onwards
#from here, I will scrape the name of each show, so I can search for it on ibdb.com using selenium
#print(show_links)
#the number of productions between 1979 and now fits the estimate I made in another assignment (roughly 1800)
len(show_links)

1877

In [5]:
def get_show_name(url):
    """
    Download a show page and return the show's name.

    Parameters
    ----------
    url : str
        Address of the show page.

    Returns
    -------
    str
        The second text node matched by //span[@itemprop = 'name']/text().

    Raises
    ------
    requests.exceptions.HTTPError
        If the response status is anything other than 200.
    IndexError
        If the page yields fewer than two matching text nodes.
    """
    response = requests.get(url)
    #make sure url actually exists
    #(explicit check rather than assert, because assert statements are removed under python -O)
    if response.status_code != 200:
        raise requests.exceptions.HTTPError(
            f"{url} returned status {response.status_code}", response=response
        )

    htmlparser = etree.HTMLParser()
    tree = etree.parse(io.StringIO(response.text), parser=htmlparser)
    showroot = tree.getroot()
    #this xml path is the same for every url, it will give you the name of the show for that url
    names = showroot.xpath("//span[@itemprop = 'name']/text()")
    if len(names) < 2:
        raise IndexError(
            f"expected at least two itemprop='name' text nodes on {url}, found {len(names)}"
        )

    return names[1]

In [6]:
#testing the function on a random url (tried this with several urls)
#note: this performs a live request to broadwayworld.com
get_show_name('https://www.broadwayworld.com/shows/backstage.php?showid=6366')

'Oliver!'

In [7]:
#build the list of every show I want to find data on, one get_show_name call per link
#took about 10 minutes to run this chunk
show_names = [get_show_name(link) for link in show_links]

len(show_names)

1877

In [8]:
#due to the presence of revivals in the list, some show names appear more than once
#therefore, to reduce redundancy when I search for these shows on the database, I am only keeping unique show names
#dict.fromkeys keeps the first occurrence of each name in its original order,
#without rescanning the result list for every name
unique_show_names = list(dict.fromkeys(show_names))

#print(unique_show_names)
len(unique_show_names)

1590

In [84]:
#database will not accept searches with & or + in them, so both are swapped for 'and'
#slice assignment updates the existing list in place
unique_show_names[:] = [
    name.replace('+', 'and').replace('&', 'and') for name in unique_show_names
]

In [89]:
#unique_show_names

In [90]:
#the xpath statement that will get me the <a> element (and from it the href) for each production I am interested in
prod_xpath = '//div[@id = "nyc-productions" and @data-id = "nyc-productions"]/div/div/div/div/a'#/@href'

#this will be a list all of the urls that I need 
urls_list = []
#list of show name searches that did not produce needed results 
failed_searches = []

driver = webdriver.Chrome()
#smaller list for a quick test run:
#test_shows = ['The King and I', 'Dear Evan Hansen', "Rodgers and Hammerstein's Cinderella",'Spring Awakening']
try:
    for show in unique_show_names:
        driver.get('https://www.ibdb.com/shows/')     #website I am searching from (IBDB)
        search_box = driver.find_element('name','ShowProperName')    #locating the searchbar
        search_box.send_keys(show)       #automating the searches
        search_box.submit()
        #search_box.send_keys(Keys.ENTER)  #other way to do above line of code
        web_elts = driver.find_elements('xpath', prod_xpath)
        if not web_elts:
            #no production links on the results page: remember the search so it can be checked later
            failed_searches.append(show)
            continue
        #getting href I need from each selenium WebElement
        urls_list.extend(elt.get_attribute("href") for elt in web_elts)
finally:
    #always close the browser, even if a search raises part-way through the loop
    driver.quit()


In [96]:
#length is longer because several pre 1979 productions urls were scraped, these will be excluded by next big chunk
#(each search contributes every matching production link, so one show name can add several urls)
len(urls_list)

2104

In [94]:
#TO DO: run above code again to get correct failed searches, messed it up last time
#failed_searches

In [97]:
#Must do this for every single production I search for

#this LoD (poorly structured because of IBDB layout) will have weekly gross/capacity info
prod_data_list = []

#this LoD will have Tony nom/win info
tony_data_LoD = []

#this xpath give me the name of each award the production was nominated for
noms_path = "//div[@class = 'collapsible-body awards-tab']/div[position() = 1]/div/div/div/h4/text()"
#this xpath will give me the number of awards the production won
wins_path = "//div[@class = 'collapsible-body awards-tab']/div[position() = 1]/div[@class='col s1 right-align']/img[@src = '/Images/award.png']"
#this xpath will give me the year that the production was eligible for awards
#this path only works if the production received noms, for productions with 0 noms, I might have to manually put in the year
year_path = "//div[@class = 'collapsible-body awards-tab']/div[position() = 1]/div[@class = 'col s11']/div/div[@class = 'col s12' and position() = 2]/text()"


for url in urls_list:
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, 'html.parser')
    #finding node that has javascript text with our data
    script = soup.find_all('script', type='text/javascript')[1]   #node at [1] is one with our data
    #making script into a string so I can easily parse through it
    js: str = script.text

    #url[41:] drops the 41-character prefix 'https://www.ibdb.com/broadway-production/'
    production = url[41:]

    #using a regex to search for the dict we are looking for (i.e. the one that has the data)
    #raw string so that \[ reaches the regex engine without an invalid-escape warning
    match = re.search(r'var grossdata = {0:\[.*\] };', js, flags=re.MULTILINE)
    if match is None:
        #productions with no financial data (i.e. pre 1979) have no grossdata variable, so skip them
        continue

    #need a 'try:' because parsing can still fail for individual productions;
    #Exception (not a bare except) so that Ctrl-C can still stop this long loop
    try:
        #[16:-1] to exclude javascript syntax stuff ('var grossdata = ' in front, ';' at the end)
        data = ast.literal_eval(match.group(0)[16:-1])
        #adding key:value pair to dict to keep track of which production is which
        data['production'] = production
        prod_data_list.append(data)

        #scraping Tony info now
        #We must do this within the try because this allows us to skip Tony info for productions with no financial data
        #REACH GOAL: split awards into major/minor categories
        # ^ Based on the layout of the website, it would be tedious/difficult to do this
        tree = etree.HTML(str(soup))
        num_noms = len(tree.xpath(noms_path))
        num_wins = len(tree.xpath(wins_path))
        #raises IndexError when there are no nominations; the financial data appended above is kept
        year = tree.xpath(year_path)[0]
        #year[26:30] is taken to be the four-digit year inside that text -- TODO confirm against the page layout
        tony_data_LoD.append(
            {'production': production, 'nominations': num_noms, 'wins': num_wins, 'year': int(year[26:30])}
        )
    except Exception:
        continue

In [98]:
#list of dicts, one per production, with keys 'production', 'nominations', 'wins' and 'year'
tony_data_LoD

[{'production': 'dancin-4051', 'nominations': 7, 'wins': 2, 'year': 1978},
 {'production': 'dancin-535808', 'nominations': 1, 'wins': 0, 'year': 2023},
 {'production': 'camelot-13313', 'nominations': 2, 'wins': 0, 'year': 1981},
 {'production': 'camelot-534339', 'nominations': 5, 'wins': 0, 'year': 2023},
 {'production': 'fat-ham-535958', 'nominations': 5, 'wins': 0, 'year': 2023},
 {'production': 'good-night-oscar-535325',
  'nominations': 3,
  'wins': 1,
  'year': 2023},
 {'production': 'life-of-pi-535445',
  'nominations': 5,
  'wins': 3,
  'year': 2023},
 {'production': 'new-york-new-york-535492',
  'nominations': 9,
  'wins': 1,
  'year': 2023},
 {'production': 'pal-joey-477922', 'nominations': 4, 'wins': 0, 'year': 2009},
 {'production': 'peter-pan-goes-wrong-536073',
  'nominations': 3,
  'wins': 1,
  'year': 2023},
 {'production': 'prima-facie-535843',
  'nominations': 4,
  'wins': 1,
  'year': 2023},
 {'production': 'shucked-535831', 'nominations': 9, 'wins': 1, 'year': 2023},

In [99]:
#first scraped production: season keys mapping to lists of weekly rows, plus the added 'production' key
prod_data_list[0]

{0: [['May 28, 2023',
   '$351,163',
   '-2147483648%',
   '7,757',
   '69%',
   'May 28',
   351163.0,
   0.0,
   7757.0,
   11184.0,
   '0',
   '8'],
  ['Jun 4, 2023',
   '$384,017',
   '-2147483648%',
   '8,562',
   '77%',
   'Jun 4',
   384017.0,
   0.0,
   8562.0,
   11184.0,
   '0',
   '8']],
 1: [['Feb 19, 2023',
   '$318,478',
   '-2147483648%',
   '2,796',
   '100%',
   'Feb 19',
   318478.0,
   0.0,
   2796.0,
   2796.0,
   '2',
   '0'],
  ['Feb 26, 2023',
   '$684,822',
   '-2147483648%',
   '9,107',
   '93%',
   'Feb 26',
   684822.0,
   0.0,
   9107.0,
   9786.0,
   '7',
   '0'],
  ['Mar 5, 2023',
   '$568,165',
   '-2147483648%',
   '8,695',
   '89%',
   'Mar 5',
   568165.0,
   0.0,
   8695.0,
   9786.0,
   '7',
   '0'],
  ['Mar 12, 2023',
   '$592,938',
   '-2147483648%',
   '8,663',
   '89%',
   'Mar 12',
   592938.0,
   0.0,
   8663.0,
   9786.0,
   '7',
   '0'],
  ['Mar 19, 2023',
   '$642,196',
   '-2147483648%',
   '8,752',
   '89%',
   'Mar 19',
   642196.0,
   0.

In [101]:
#TO DO: Make these comments (and others) into markdown chunks
#structure of prod_data_list (it is a poorly structured LoD):
#each dict is one production
#each key represents one season of data, with the exception of the key I added to represent the production
#the value for each season key is an LoL 
#each list in the LoL represents one week of data for that production
#in each list, we care about the vals at indices: 0 (date of week), 4 (weekly capacity), 6 (weekly gross) 

#end result should be LoD of the following structure:
#[{'production': 'the-king-and-i-497593', 'date': 'May 29, 2016', 'capacity': 76, 'gross': 546476.0},...]

prod_LoD = []
for prod in prod_data_list:
    for season, weeks in prod.items():
        #the 'production' key holds the name string, not a season of weekly rows
        if season == 'production':
            continue
        for week in weeks:
            if isinstance(week, list):
                #capacity arrives as a string such as '69%': drop the trailing '%' and convert to int
                relevant_data = {'production': prod['production'], 'date': week[0], 'capacity': int(week[4][:-1]), 'gross': week[6]}
                prod_LoD.append(relevant_data)

#see what first few rows of data look like
prod_LoD[:5]

[{'production': 'bad-cinderella-535361',
  'date': 'May 28, 2023',
  'capacity': 69,
  'gross': 351163.0},
 {'production': 'bad-cinderella-535361',
  'date': 'Jun 4, 2023',
  'capacity': 77,
  'gross': 384017.0},
 {'production': 'bad-cinderella-535361',
  'date': 'Feb 19, 2023',
  'capacity': 100,
  'gross': 318478.0},
 {'production': 'bad-cinderella-535361',
  'date': 'Feb 26, 2023',
  'capacity': 93,
  'gross': 684822.0},
 {'production': 'bad-cinderella-535361',
  'date': 'Mar 5, 2023',
  'capacity': 89,
  'gross': 568165.0}]

In [49]:
#TO DO: DELETE THIS CHUNK

#this xpath give me the name of each award the production was nominated for
#noms_path = "//div[@class = 'collapsible-body awards-tab']/div[position() = 1]/div/div/div/h4/text()"
#this xpath will give me the number of awards the production won
#wins_path = "//div[@class = 'collapsible-body awards-tab']/div[position() = 1]/div[@class='col s1 right-align']/img[@src = '/Images/award.png']"
#tony_data_list = []

#for url in urls_list:
#test = 'https://www.ibdb.com/broadway-production/the-king-and-i-497593'
#resp = requests.get(test)
#prod_html = resp.text
#soup = BeautifulSoup(prod_html, 'html.parser')
#tree = etree.HTML(str(soup)) 
#year = tree.xpath("//div[@class = 'collapsible-body awards-tab']/div[position() = 1]/div[@class = 'col s11']/div/div[@class = 'col s12' and position() = 2]/text()")[0]
#nominations = tree.xpath(noms_path)
#num_noms = len(nominations)
#num_wins = len(tree.xpath(wins_path))
#prod_award_dict = {"production": url[41:], "nominations": num_noms, "wins": num_wins}
#tony_data_list.append(prod_award_dict)

In [76]:
#tony_data_list

In [39]:
#print(nominations)
#num_wins

In [102]:
#one row per production, indexed by the 'production' column
tony_data = pd.DataFrame(tony_data_LoD).set_index('production')
tony_data

Unnamed: 0_level_0,nominations,wins,year
production,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dancin-4051,7,2,1978
dancin-535808,1,0,2023
camelot-13313,2,0,1981
camelot-534339,5,0,2023
fat-ham-535958,5,0,2023
...,...,...,...
the-philadelphia-story-4077,1,1,1981
the-suicide-3970,1,0,1981
tintypes-3973,3,0,1981
sugar-babies-3812,8,0,1980


In [103]:
#one row per production and week, indexed by ('production', 'date')
weekly_data = pd.DataFrame(prod_LoD).set_index(['production','date'])
weekly_data

Unnamed: 0_level_0,Unnamed: 1_level_0,capacity,gross
production,date,Unnamed: 2_level_1,Unnamed: 3_level_1
bad-cinderella-535361,"May 28, 2023",69,351163.0
bad-cinderella-535361,"Jun 4, 2023",77,384017.0
bad-cinderella-535361,"Feb 19, 2023",100,318478.0
bad-cinderella-535361,"Feb 26, 2023",93,684822.0
bad-cinderella-535361,"Mar 5, 2023",89,568165.0
...,...,...,...
theyre-playing-our-song-3919,"May 3, 1981",87,177305.0
theyre-playing-our-song-3919,"May 10, 1981",75,162593.0
theyre-playing-our-song-3919,"May 17, 1981",83,179850.0
theyre-playing-our-song-3919,"May 24, 1981",83,172125.0


In [104]:
#save to data folder (created first, because to_csv does not create missing directories)
os.makedirs('data', exist_ok=True)
tony_data.to_csv('data/tony_data.csv')
weekly_data.to_csv('data/weekly_data.csv')