In [None]:
import requests
from bs4 import BeautifulSoup as BS
import pandas as pd
import re

## Web Scraping the Ryman Calendar

In this exercise, your objective is to use BeautifulSoup in order to obtain a dataset of upcoming events at the Ryman. This information is available at https://ryman.com/events/, but you will take the contents of this website and convert it into a pandas DataFrame.

The website splits the events across multiple pages, but start by just working on the first page. Later on in the exercise, you'll take what you've done for the first page and apply it across other pages.

In [None]:
# Landing page of the Ryman events calendar (page 1 of the paginated listing).
URL = 'https://ryman.com/events/'
# Without a timeout, requests may wait indefinitely on a stalled connection.
response = requests.get(URL, timeout=30)
# Stop here on an HTTP error status rather than parsing an error page later.
response.raise_for_status()

In [None]:
#response.content

In [None]:
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed in the environment.
soup = BS(response.text, 'html.parser')

#### Note on .text vs .content
response.text is the content of the response in Unicode, and response.content is the content of the response in bytes

#### Question 1
Start by using either the inspector or by viewing the page source. Can you identify a tag that might be helpful for finding the names of all performers? For now, just worry about the headliner and don't worry about the opener. (E.g., for Vince Gill, featuring Wendy Moten, we only care about Vince Gill.) Make use of this to create a list containing just the name of each headliner.

In [None]:
# All <a> elements carrying the "tribe-event-url" class.
titles = soup.find_all('a', class_='tribe-event-url')
titles

Deme's code

for artist in soup.find_all("a", class_="tribe-event-url"):
    print(artist.get('title'))


In [None]:
# Read the title attribute of every matched link.
headliners = [link.get('title') for link in titles]
headliners
# Chris's code: [x['title'] for x in titles]

#### Question 2
Next, try and find a tag that could be used to find the date and time for each show. Extract these into two lists, one containing the date and the other containing the time. (Eg. THURSDAY, AUGUST 4, 2022 AT 8:00 PM CDT should be split into August 4, 2022 and 8:00 PM CDT.) 

In [None]:
# Every <time> element on the page.
time_datetimes = soup.find_all(name='time')
time_datetimes

In [None]:
# Build the column from each <time> tag's text explicitly instead of handing
# Tag objects to pandas and renaming whatever column it produces; this also
# yields a well-formed (empty) one-column frame when no tags were found.
df_datetimes = pd.DataFrame(
    {'show_date_time': [tag.get_text(strip=True) for tag in time_datetimes]}
)

df_datetimes

In [None]:
# Raw string: "\s" inside a normal string literal is an invalid escape sequence.
# Captures everything after the " at " separator.
times = df_datetimes['show_date_time'].str.extract(r"\sat\s(.+)")
times

In [None]:
# Raw string: "\s" inside a normal string literal is an invalid escape sequence.
# Captures everything before the " at " separator.
dates = df_datetimes['show_date_time'].str.extract(r"(.+)\sat\s")
dates

In [None]:
# Single-column frame (default column label 0) holding the headliner names.
headliners_df = pd.DataFrame(data=list(headliners))
headliners_df

#### Chris's code:
[x.find('time') for x in time_soup if x.find('time')]

He did an extra step to pull p tags, so he initially got some "None" values when he pulled times out of time_soup. The `if x.find('time')` condition in the list comprehension keeps an item only when a time tag is actually found, which skips over the "None"s.

[x.find('time').text for x in time_soup if x.find('time')]

The version above gets him straight to the text component

date_list = []

hour_list = []

for time in time_list:

        x = time.split(' at ')
        
        date_list.append(x[0])
        
        hour_list.append(x[1])

After seeing Chris's solution, I wanted to try out doing things with lists rather than dataframes

In [None]:
# Time portion: the text between " at " and a following "<" in each tag's markup.
# Raw string avoids the invalid "\s" escape of a normal string literal.
time_list = [re.search(r"\sat\s(.+)<", str(x)).group(1) for x in time_datetimes]
time_list

In [None]:
# Date portion: the text between a ">" and " at " in each tag's markup.
# Raw string avoids the invalid "\s" escape of a normal string literal.
date_list = [re.search(r">(.+)\sat\s", str(x)).group(1) for x in time_datetimes]
date_list

#### Question 3
Take the two lists you created on parts 1 and 2 and convert it into a pandas DataFrame.

In [None]:
# Place the three single-column frames side by side (row-aligned by index).
show_info1 = pd.concat(
    [headliners_df, dates, times],
    axis='columns',
)

In [None]:
# Label the concatenated columns in the order they were passed to pd.concat.
# NOTE: this mutates show_info1 in place.
show_info1.columns = ['Headliner', 'Date', 'Time']
show_info1

Playing with getting to the final dataframe in fewer steps below:

In [None]:
# Page-1 table built in one cell: headliner, date and time per event.
# find_all is the current name of the deprecated findAll alias.
headliners_df = pd.DataFrame(
    [x.get('title') for x in soup.find_all('a', attrs={'class' : 'tribe-event-url'})]
)

# Take each <time> tag's text explicitly rather than passing Tag objects to pandas.
df_datetimes = pd.DataFrame(
    {'show_date_time': [tag.get_text(strip=True) for tag in soup.find_all('time')]}
)

# Raw strings: "\s" inside a normal string literal is an invalid escape sequence.
times = df_datetimes['show_date_time'].str.extract(r"\sat\s(.+)")

dates = df_datetimes['show_date_time'].str.extract(r"(.+)\sat\s")

show_info2 = pd.concat([headliners_df, dates, times], axis=1)
show_info2.columns = ['Headliner', 'Date', 'Time']

show_info2

#### Chris's code

page_1_df = pd.DataFrame({'artist' : artist_list, 'date' : date_list, 'time' : hour_list})

He left his as lists and combined those lists into a dataframe directly. The method above shows using a dictionary to define the column name each list will have once it's in the df.

Using the lists I created after seeing Chris's solution to make a df:

In [None]:
# Keyword names become the column labels; each list supplies that column's values.
pg1_df = pd.DataFrame(
    dict(Headliner=list(headliners), Date=date_list, Time=time_list)
)
pg1_df

#### Question 4
Now, you need to take what you created for the first page and apply it across the rest of the pages so that you can scrape all events. Notice how the url changes when you click the "More Events" button at the top of the page. Check that the code that you wrote for the first page still works for page 2. Once you have verified that your code will still work, write a for loop that will cycle through the first five pages of events.

In [None]:
# Page 2 of the list view; the page number is the tribe_paged query parameter.
URL2 = 'https://ryman.com/events/list/?tribe_event_display=list&tribe_paged=2'
# Without a timeout, requests may wait indefinitely on a stalled connection.
response2 = requests.get(URL2, timeout=30)
# Stop here on an HTTP error status rather than parsing an error page later.
response2.raise_for_status()

In [None]:
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed in the environment.
soup2 = BS(response2.text, 'html.parser')

In [None]:
# Same headliner extraction as for page 1, applied to page 2.
# find_all is the current name of the deprecated findAll alias.
headliners2 = pd.DataFrame(
    [x.get('title') for x in soup2.find_all('a', attrs={'class' : 'tribe-event-url'})]
)
headliners2

In [None]:
# Same date/time extraction as for page 1, applied to page 2. The text of each
# <time> tag is taken explicitly rather than passing Tag objects to pandas.
datetimes2 = pd.DataFrame(
    {'show_date_time': [tag.get_text(strip=True) for tag in soup2.find_all('time')]}
)
datetimes2

In [None]:
# Page 1 lives at the plain events URL and becomes the base document.
first_page = requests.get('https://ryman.com/events/', timeout=30)
first_page.raise_for_status()
all_soup = BS(first_page.text, 'html.parser')

# Pages 2-5 use the paginated list URL. Each parsed page is appended under the
# first page's <body> so later cells can search a single combined document.
for i in range(2, 6):
    page_response = requests.get(
        f'https://ryman.com/events/list/?tribe_event_display=list&tribe_paged={i}',
        timeout=30,
    )
    page_response.raise_for_status()
    all_soup.body.append(BS(page_response.text, 'html.parser'))

In [None]:
# Displays the combined document built from all fetched pages.
# NOTE(review): this renders the full HTML, so the cell output is very large.
all_soup

In [None]:
# Headliner, date and time for every event in the combined document.
# find_all is the current name of the deprecated findAll alias.
all_headliners = pd.DataFrame(
    [x.get('title') for x in all_soup.find_all('a', attrs={'class' : 'tribe-event-url'})]
)

# Take each <time> tag's text explicitly rather than passing Tag objects to pandas.
all_datetimes = pd.DataFrame(
    {'show_date_time': [tag.get_text(strip=True) for tag in all_soup.find_all('time')]}
)

# Raw strings: "\s" inside a normal string literal is an invalid escape sequence.
all_times = all_datetimes['show_date_time'].str.extract(r"\sat\s(.+)")

all_dates = all_datetimes['show_date_time'].str.extract(r"(.+)\sat\s")

all_show_info = pd.concat([all_headliners, all_dates, all_times], axis=1)
all_show_info.columns = ['Headliner', 'Date', 'Time']

all_show_info

#### Chris's code

main_artist_list = []
main_date_list = []
main_time_list = []

URL2 = 'https://ryman.com/events/list/?tribe_event_display=list&tribe_paged='

for num in range (1, 6):

        new_url = URL2 + str(num)
        response = requests.get(new_url)
        soup = BS(response.content)
        
        artist_soup = soup.find_all('a', attrs = {'class' : 'tribe-event-url'})
        artist_list = [x['title'] for x in artist_soup]
        main_artist_list.extend(artist_list) #append would make a list of lists in this case, extend just adds to the list
        
        time_soup = soup.find_all('p')
        time_list = [x.find('time').text for x in time_soup if x.find('time')]
        
        for time in time_list:
            z = time.split(' at ')
            main_date_list.append(z[0])
            main_time_list.append(z[1])
            
main_df = pd.DataFrame({'artist' : main_artist_list, 'date' : main_date_list, 'time': main_time_list})

#### Bonus 1
Add to your data frame the opening act for all shows that list an opener.

In [None]:
# Collect one container <div> per event so that, when a show uses several
# <span> elements, they are all looked up within the same block.
# find_all is the current name of the deprecated findAll alias.
div = all_soup.find_all('div', attrs={'class' : 'tribe-beside-image'})
div

In [None]:
# Spot check: number of <span> elements inside the second event block.
len(div[1].find_all("span"))

In [None]:
# Starts clean_openers as an empty list and displays it.
# NOTE(review): the following cell re-initialises clean_openers itself, so this cell is redundant.
clean_openers = []
clean_openers

In [None]:
# One entry per event block: the text of the block's last <span> when it
# contains "with", "With" or "featuring" followed by more text, otherwise
# the placeholder "No Opener".
# The pattern is matched against the span's text, not its HTML markup, so
# attribute values and tag names cannot produce a false match.
opener_pattern = re.compile(r"(?:with|With|featuring).+")

clean_openers = []
for event_block in div:
    spans = event_block.find_all("span")  # looked up once per block
    if spans and opener_pattern.search(spans[-1].text):
        clean_openers.append(spans[-1].text)
    else:
        clean_openers.append("No Opener")

clean_openers_df = pd.DataFrame(clean_openers)

clean_openers_df

In [None]:
# Rebuild the combined table with the opener column placed after the headliner.
all_show_info = pd.concat(
    [all_headliners, clean_openers_df, all_dates, all_times],
    axis='columns',
)
all_show_info.columns = ['Headliner', 'Opener', 'Date', 'Time']
all_show_info

#### Bonus 2
If you click the "MORE INFO" button for an event, it will take you to a page which shows ticket prices. Write code that can be used to retrieve the ticket prices for each show that you have scraped. Make sure that your code can handle cases where the show has been canceled (eg. https://ryman.com/event/nhabit-worship-experience/).

In [None]:
# All <a> elements with the "smallblackbutton" class in the combined document.
# find_all is the current name of the deprecated findAll alias.
buttons = all_soup.find_all('a', attrs={'class' : 'smallblackbutton'})
buttons

In [None]:
# href of each button; .get yields None for a button without an href attribute.
links = [button.get('href') for button in buttons]
links

In [None]:
# One pricing entry per event link:
#   "Show Cancelled"          - the status label's markup contains "canceled"
#   text of <p class="theprices"> - when that element exists on the page
#   "See website for details" - otherwise
prices = []
for link in links:
    # Download and parse each event page once and reuse it for both checks.
    event_page = BS(requests.get(link, timeout=30).text, 'html.parser')
    status_label = event_page.find('strong', attrs={'class' : 'show-status-label'})
    price_tag = event_page.find('p', attrs={'class' : 'theprices'})

    # str(None) is "None", so a missing status label never matches here.
    if 'canceled' in str(status_label):
        prices.append("Show Cancelled")
    elif price_tag is not None:
        prices.append(price_tag.text)
    else:
        prices.append("See website for details")

prices

In [None]:
#Doing this so that the $ show up properly in the dataframe
# Prefix every "$" with a backslash so the dollar signs show up properly in the dataframe.
# "\\$" is the explicit spelling of backslash + "$"; a bare "\$" is an invalid escape sequence.
prices_escape = [i.replace("$", "\\$") for i in prices]

In [None]:
# Name the column at construction time; unlike assigning .columns afterwards,
# this also works when prices_escape is empty.
price_df = pd.DataFrame({'Pricing Information': prices_escape})
price_df

In [None]:
# Attach the pricing column to the right of the show table (row-aligned by index).
final_show_info = pd.concat(
    [all_show_info, price_df],
    axis='columns',
)
final_show_info