# IBDB Webscraping
This file scrapes publicly available data from the Broadway League's IBDB (Internet Broadway Database) in order to analyze the importance of Tony Awards outcomes for Broadway productions.

This file borrows code and takes general inspiration from the following [Colaboratory Jupyter Notebook](https://colab.research.google.com/drive/1IVwOhBMYay14NkO7kGkrPu0Ij9dSDdEP) by Yaakov Bressler.

In [38]:
import io
import requests
from lxml import etree
from bs4 import BeautifulSoup
import urllib.request
import re
import string
#import time
import datetime
import pandas as pd
import numpy as np
import urllib
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

### Create a function that grabs links from a page, using a tag to identify value of link

In [2]:
def getLinks_tagged_fast(url, tag, timeout=30):
    """Return the href of every anchor on a page whose href matches ``tag``.

    The HTTP status code is not checked: whatever body the server returns is
    parsed, so an error page simply yields the links it happens to contain.

    Args:
        url: Address of the page to download.
        tag: Pattern the href must match. It is passed to ``re.compile`` and
            used as a regular expression, so characters such as ``.`` and
            ``?`` act as regex metacharacters rather than literal text.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host.

    Returns:
        A list of href strings for the matching ``<a>`` elements; empty when
        nothing matches.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            exceeds ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Compile once; BeautifulSoup filters the href attribute with this pattern.
    href_pattern = re.compile(tag)
    anchors = soup.find_all('a', attrs={'href': href_pattern})
    return [anchor.get('href') for anchor in anchors]

https://www.broadwayworld.com/browseshows.cfm?showtype=BR

The above link is our starting point. It allows us to get the name of every Broadway production that opened between 1979 and now.

In [7]:
def get_show_links_year(year_url):
    """Collect the show-page links for every listed year from 1979 onward.

    The browse page at ``year_url`` is scanned for per-year listing links;
    each qualifying year page is then scanned for show-page links.

    Args:
        year_url: URL of the BroadwayWorld browse page that links to the
            per-year listings.

    Returns:
        A flat list of show-page hrefs, grouped by year page in the order the
        year links were found.
    """
    page_base = 'https://www.broadwayworld.com/'
    tag_year = 'browseshows.cfm?'
    tag_show = 'https://www.broadwayworld.com/shows/backstage.php?'

    # The first matching link is dropped.
    # NOTE(review): presumably it is the browse page linking to itself -- confirm.
    year_hrefs = getLinks_tagged_fast(year_url, tag_year)[1:]

    # Keep only hrefs whose last four characters are a year of 1979 or later.
    year_pages = [
        page_base + href
        for href in year_hrefs
        if href[-4:].isdigit() and int(href[-4:]) >= 1979
    ]

    # Extend one list in place; sum(list_of_lists, []) re-copies the
    # accumulated result for every year page.
    show_links = []
    for year_page in year_pages:
        show_links.extend(getLinks_tagged_fast(year_page, tag_show))

    return show_links
    
# Run the scrape to get my list of links to the productions I want.
# year_url is the BroadwayWorld browse page that get_show_links_year starts from.
year_url = 'https://www.broadwayworld.com/browseshows.cfm?showtype=BR'
show_links = get_show_links_year(year_url)

In [52]:
# We don't need every show ever, just those from 1979 onwards.
# From here, we will scrape the name of each show so it can be searched for on ibdb.com using selenium.
#print(show_links)
# The number of productions between 1979 and now fits the estimate I made in another assignment (roughly 1800).
len(show_links)

1877

In [47]:
def get_show_name(url, timeout=30):
    """Return the show title scraped from a show page.

    Args:
        url: Address of the show page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host.

    Returns:
        The second text node found under ``<span itemprop="name">`` elements
        in the page.

    Raises:
        AssertionError: If the response status code is not 200.
        IndexError: If the page yields fewer than two matching text nodes.
        requests.exceptions.RequestException: If the download fails or
            exceeds ``timeout``.
    """
    response = requests.get(url, timeout=timeout)

    # Make sure the url actually exists. An explicit raise is used instead of
    # an ``assert`` statement because asserts are removed under ``python -O``;
    # AssertionError is kept so the exception type seen by callers is unchanged.
    if response.status_code != 200:
        raise AssertionError(
            f"GET {url} returned HTTP {response.status_code}, expected 200"
        )

    html_parser = etree.HTMLParser()
    tree = etree.parse(io.StringIO(response.text), parser=html_parser)

    # The show name is taken from the second match (index 1) of this path.
    names = tree.getroot().xpath("//span[@itemprop = 'name']/text()")
    if len(names) < 2:
        # Same exception type a bare names[1] would raise, but with context.
        raise IndexError(
            f"expected at least two itemprop='name' text nodes at {url}, "
            f"found {len(names)}"
        )

    return names[1]

In [49]:
# Spot-check get_show_name on a single show page (this was tried with several URLs).
get_show_name('https://www.broadwayworld.com/shows/backstage.php?showid=6366')

'Oliver!'

In [50]:
# Use get_show_name to build a list with the name of every show I want to find data on.
# get_show_name downloads one page per link and the links are processed one
# after another; this chunk took about 10 minutes to run.
# NOTE(review): nothing here catches exceptions, so a single failing page
# aborts the loop partway through -- consider saving progress as you go.
show_names = []
for link in show_links:
    show_name = get_show_name(link)
    show_names.append(show_name)
    
# One name is appended per link, so this should equal len(show_links).
len(show_names)

1877

In [51]:
# Display the scraped show names to eyeball the result.
show_names

["A Doll's House",
 'Alex Edelman: Just For Us',
 'Appropriate',
 'Back to the Future: The Musical',
 'Bad Cinderella',
 'Black Orpheus',
 "Bob Fosse's Dancin'",
 'Camelot',
 'El Mago Pop',
 'Fat Ham',
 'Good Night, Oscar',
 'Grey House',
 'Gutenberg! The Musical!',
 'Harmony',
 'Here Lies Love',
 'High Noon',
 'How to Dance in Ohio',
 'I Need That',
 "Jaja's African Hair Braiding",
 'Life of Pi',
 'Melissa Etheridge: My Window',
 'Merrily We Roll Along',
 'New York, New York',
 'Once Upon a One More Time',
 'Pal Joey',
 'Parade',
 'Peter Pan Goes Wrong',
 'Pictures From Home',
 'Prima Facie',
 'Purlie Victorious',
 'Room',
 'Shucked',
 'Sing Street',
 'Spamalot',
 'Summer, 1976',
 'Sweeney Todd',
 'The Cottage',
 'The Devil Wears Prada',
 'The Shark Is Broken',
 "The Sign in Sidney Brustein's Window",
 'The Thanksgiving Play',
 '& Juliet',
 '1776',
 'A Beautiful Noise',
 'A Christmas Carol',
 'A Strange Loop',
 "Ain't No Mo'",
 'Almost Famous',
 'American Buffalo',
 'Beetlejuice',
 'B

In [20]:
# Testing functions from the selenium package: type a few known titles into IBDB's show search.
# Will delete this chunk later.
# NOTE(review): driver.quit() is never called in this cell, so the Chrome
# session stays open -- confirm whether later cells are meant to reuse it.
driver = webdriver.Chrome()
productions = ['The King and I', 'Dear Evan Hansen', 'Spring Awakening']
for prod in productions:
    driver.get('https://www.ibdb.com/shows/')     # website I am searching from (IBDB); reloaded for every title
    e = driver.find_element('name','ShowProperName')    # locating the search bar by its name attribute
    e.send_keys(prod)       # automating the searches (will store results in a list later)
    e.send_keys(Keys.ENTER)     # submit the search; nothing is read back from the results page yet
    
# ^ It might make more sense to search for productions from the advanced search page, rather than search for shows.

In [None]:
#need to get "/broadway-production/<name-of-show-number>" for every production
#href is important
#URL = "https://www.ibdb.com/shows/" 