In [None]:
import requests
from bs4 import BeautifulSoup as BS 
import pandas as pd

In [None]:
# the page we're pulling from
URL = 'https://ryman.com/events/'

# grabs all the code that makes the website
# the timeout keeps this cell from hanging forever if the site never answers
response = requests.get(URL, timeout=30)

# stop right here on an error page (404, 500, ...) instead of quietly parsing it as if it were the events page
response.raise_for_status()

# turns it into soup so we can use BS functions to find what we are looking for
# naming the parser explicitly avoids the "no parser was explicitly specified" warning and
# keeps the result the same on every machine, no matter which optional parsers are installed
soup = BS(response.text, 'html.parser')

### 1. Start by using either the inspector or by viewing the page source. Can you identify a tag that might be helpful for finding the names of all performers? For now, just worry about the headliner and don't worry about the opener. (E.g., for Vince Gill, featuring Wendy Moten, we only care about Vince Gill.) Make use of this to create a list containing just the name of each headliner.

In [None]:
# a sample of the markup that carries a headliner's name:
# <a class="tribe-event-url" href="https://ryman.com/event/lynyrd-skynyrd/" title="Lynyrd Skynyrd" rel="bookmark">Lynyrd Skynyrd</a>

# the inspector shows every name sitting inside an 'a' tag,
# but plenty of unrelated links use 'a' tags too, so the tag alone is not specific enough
# on inspection, class="tribe-event-url" appears only on the main act names
# find_all takes the tag name first,
# then attrs: a dictionary of the attribute / value pairs an element must have to match
name_soup = soup.find_all('a', attrs={'class': 'tribe-event-url'})
name_soup

In [None]:
# before writing a loop / list comprehension over every element,
# try the text extraction on just the first one to see exactly what comes back
name_test = name_soup[0].get_text()
name_test

In [None]:
# the string method .strip(), called with no arguments, removes whitespace from both ends of a string
# that covers spaces as well as escape characters such as '\n' (newline) and '\t' (tab),
# if the raw text happens to start or end with any
name_test_clean = name_soup[0].get_text().strip()
name_test_clean

In [None]:
# apply the same extract-then-strip step to every element in the list
names = [anchor.get_text().strip() for anchor in name_soup]
names

### 2. Next, try and find a tag that could be used to find the date and time for each show. Extract these into two lists, one containing the date and the other containing the time. (Eg. THURSDAY, AUGUST 4, 2022 AT 8:00 PM CDT should be split into August 4, 2022 and 8:00 PM CDT.) 

In [None]:
# a sample of the markup that carries a show's date and time:
# <time datetime="2022-11-13 07:30:00 CST">Sunday, November 13, 2022 at 7:30 PM CST</time>
# this time the tag itself is 'time'
# on inspection nothing else on the page uses a 'time' tag, so no attribute filter is needed
time_soup = soup.find_all('time')
time_soup

In [None]:
# check the text of a single element first
time_soup[0].get_text()

In [None]:
# pull the text out of every 'time' element with a list comprehension
times = [stamp.get_text() for stamp in time_soup]
times

In [None]:
# try the split on the first item of 'times' before writing a loop, to make sure it works
# times[0] grabs that first item; splitting it on " at " should leave the date in position [0]
# and the clock time in position [1]
# (the final print below shows the raw split result before it is broken into two variables)
first_split = times[0].split(" at ")
test_split_date = first_split[0]
test_split_oclock = first_split[1]
print(test_split_date)
print(test_split_oclock)

# shortcut: unpack both halves in a single assignment
test_split_date_2, test_split_oclock_2 = first_split
print(test_split_date_2)
print(test_split_oclock_2)

print(first_split)

In [None]:
# loop to split the dates and times into two separate lists

# start with empty lists for the loop to fill
dates = []
oclocks = []

# walk through the 'times' list created in an earlier cell
for entry in times:

    # for a single value we would write 'date = entry.split(" at ")[0]' and keep it in one variable
    # inside a loop we want to collect every value instead, so each piece is handed to .append()
    # on one of the empty lists created before the loop started
    pieces = entry.split(" at ")
    dates.append(pieces[0])
    oclocks.append(pieces[1])

print(dates)
print(oclocks)

In [None]:
# the same split-and-collect steps again, without the teaching comments, so it is easier to read

dates = []
oclocks = []

for halves in (stamp.split(" at ") for stamp in times):
    dates.append(halves[0])
    oclocks.append(halves[1])

print(dates)
print(oclocks)

### 3. Take the lists you created in parts 1 and 2 and convert them into a pandas DataFrame.

In [None]:
# pd.DataFrame accepts a dictionary: each key becomes a column name and each list becomes that column's values
event_columns = {
    'act': names,
    'date': dates,
    'times': oclocks
}
events = pd.DataFrame(data=event_columns)

events

### 4. Now, you need to take what you created for the first page and apply it across the rest of the pages so that you can scrape all of the events. Notice how the url changes when you click the "More Events" button at the top of the page. Check that the code that you wrote for the first page still works for page 2. Once you have verified that your code will still work, write a for loop that will cycle through the first five pages of events.

In [None]:
# initiate empty lists to append to
names = []
dates = []
oclocks = []

# the range of pages we want to loop over
# range() stops one short of its second argument, so range(1, 6) gives pages 1, 2, 3, 4 and 5
for page in range(1, 6):

    # the base URL is the same over all pages, with the only difference being the page number going up by one,
    # so we can use an f string to write out the URL with the variable 'page' filled in on each iteration
    url = f'https://ryman.com/events/list/?tribe_event_display=list&tribe_paged={page}'

    # the same steps as always to grab the website code and turn it into soup
    # putting them in the loop means we get the soup for the page the loop is on, and then it is replaced
    # on the next loop through
    # careful: this must be the lowercase 'url' built just above -- the uppercase 'URL' from the first cell
    # always points at page one, so using it would scrape the same page over and over
    response = requests.get(url, timeout=30)
    # stop on an error page (404, 500, ...) instead of quietly parsing it
    response.raise_for_status()
    # naming the parser keeps the results identical no matter which optional parsers are installed
    soup = BS(response.text, 'html.parser')

    # this is the same search we used to get our soup of just names in question one
    name_soup = soup.find_all('a', {'class': 'tribe-event-url'})
    # .extend() adds every item of the new list to 'names' one by one;
    # .append() would add the whole list as a single item, leaving us with a list of lists
    # we are able to use the same exact list comprehension we used in question one inside the .extend()
    names.extend([name.text.strip() for name in name_soup])

    # again, same idea as question two
    time_soup = soup.find_all('time')
    times = [stamp.text for stamp in time_soup]

    # same loop we used in question two, splitting each string only once
    for entry in times:
        pieces = entry.split(" at ")
        dates.append(pieces[0])
        oclocks.append(pieces[1])

# creation of our final dataframe using the lists we just got finished creating in the nested loop above
# it is important to make sure this step starts all the way to the left of the cell block instead of being indented
# this is how python knows this is not to be run in the loop, but instead after the loop is finished running
five_pages_df = pd.DataFrame({
    'act': names,
    'dates': dates,
    'oclocks': oclocks
})

In [None]:
# condensed version of the five-page scrape, without the step-by-step comments, so it is easier to read

names = []
dates = []
oclocks = []

# range(1, 6) -> pages 1 through 5
for page in range(1, 6):

    url = f'https://ryman.com/events/list/?tribe_event_display=list&tribe_paged={page}'
    # request the per-page 'url' (not the first-page constant 'URL') and fail loudly on HTTP errors
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BS(response.text, 'html.parser')

    name_soup = soup.find_all('a', {'class': 'tribe-event-url'})
    names.extend([name.text.strip() for name in name_soup])

    time_soup = soup.find_all('time')
    times = [stamp.text for stamp in time_soup]

    for entry in times:
        pieces = entry.split(" at ")
        dates.append(pieces[0])
        oclocks.append(pieces[1])

five_pages_df = pd.DataFrame({
    'act': names,
    'dates': dates,
    'oclocks': oclocks
})

In [None]:
# display the DataFrame assembled from the multi-page scrape
five_pages_df