In [1]:
import pickle
import unicodedata
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup


In [2]:
class Scraper():
    """Builds request URLs for a site rooted at ``base_url``.

    The class only assembles URL strings; it performs no network I/O itself.
    """

    def __init__(self, base_url):
        """Remember the site root that every generated URL starts with."""
        self.base_url = base_url

    def create_query(self, query):
        """Return ``base_url`` with ``query`` appended verbatim.

        No separator is inserted, so ``query`` must bring its own leading
        ``/`` (as a site-relative ``href`` does).
        """
        return self.base_url + query

    def create_author_query(self, author_name, page=1):
        """Return the poem-search URL for ``author_name`` on result page ``page``.

        The name is lower-cased and fully percent-encoded. Encoding every
        reserved character (not just spaces) keeps names containing ``&``,
        ``#``, ``+`` or ``?`` from being interpreted as query-string syntax.
        """
        # safe='' so that even '/' is escaped: the name is a query *value*.
        author = quote(author_name.lower(), safe='')

        return f'{self.base_url}/search?query={author}&refinement=poems&page={page}'
    

In [3]:

def get_poems_of_author(url, author_name, timeout=30, max_pages=None):
    """Collect every poem by ``author_name`` from the paginated search results.

    Parameters:
        url: Scraper-like object providing ``create_author_query`` (and
            ``create_query``, used when the individual poems are fetched).
        author_name: author to search for; matching is case-insensitive.
        timeout: seconds to wait for each search request before
            ``requests`` raises a timeout error.
        max_pages: optional upper bound on the number of result pages
            requested; ``None`` means "until the results run out".

    Returns:
        A list of ``[title, lines]`` pairs, one per poem found.
    """
    poems = []

    pg = 1
    while max_pages is None or pg <= max_pages:
        authors_poems = url.create_author_query(author_name, page=pg)
        response = requests.get(authors_poems, timeout=timeout)

        if response.status_code != 200:  # author or page does not exist
            break

        soup = BeautifulSoup(response.content, 'html.parser')
        if not _find_poem_blocks(soup):
            # A 200 response without any result block means we are past the
            # last page. Without this check the loop never ends on a site
            # that answers 200 for out-of-range page numbers.
            break

        # Pass the scraper along explicitly instead of relying on a global.
        poems.extend(parse_poems_of_author_page(soup, author_name, url))
        pg += 1

    return poems


def parse_poems_of_author_page(soup, author_name, scraper=None):
    """Extract the poems credited to ``author_name`` from one results page.

    Parameters:
        soup: parsed results page (anything offering ``find_all``).
        author_name: author whose poems are kept; compared case-insensitively
            after Unicode (NFKC) and whitespace normalization.
        scraper: Scraper used to fetch each poem page. When omitted, the
            notebook-level ``url`` instance is used, which is what this
            function always did before the parameter existed.

    Returns:
        A list of ``[title, lines]`` pairs. Result blocks without an
        attribution, by another author, or without a title link are skipped.
    """
    if scraper is None:
        # Backward compatibility for callers that pass only two arguments.
        scraper = url

    wanted = _normalize_text(author_name).lower()
    poems = []

    for block in _find_poem_blocks(soup):
        attribution = block.find('span', class_='c-txt c-txt_attribution')
        if attribution is None:
            continue

        # Normalize first so non-breaking spaces or line breaks inside the
        # attribution cannot defeat the comparison below.
        author = _normalize_text(attribution.text)
        if author.lower().startswith('by '):
            author = author[3:]  # drop the leading "By "
        if author.lower() != wanted:
            continue

        heading = block.find('h2', class_='c-hdgSans c-hdgSans_2')
        link = heading.find('a') if heading is not None else None
        href = link.get('href') if link is not None else None
        if not href:
            continue  # malformed result block: nothing to follow

        title = unicodedata.normalize('NFKC', heading.text).strip()
        poems.append([title, parse_poem_page(scraper, href)])

    return poems


def _find_poem_blocks(soup):
    """Return the search-result blocks contained in a parsed results page."""
    return soup.find_all('div', class_='c-feature c-mix-feature_shrinkwrap')


def _normalize_text(text):
    """NFKC-normalize ``text`` and collapse each whitespace run to one space."""
    return ' '.join(unicodedata.normalize('NFKC', text).split())

def parse_poem_page(url, href, timeout=30):
    """Download one poem page and return its text as a list of lines.

    Parameters:
        url: Scraper-like object; its ``create_query`` turns ``href`` into
            the full page URL.
        href: site-relative link to the poem page.
        timeout: seconds to wait for the response before ``requests``
            raises a timeout error.

    Returns:
        The stripped text of every line element found, in page order. The
        list is empty when the page does not answer with status 200 or
        contains no matching elements.
    """
    poem_url = url.create_query(href)  # create poem url query
    response = requests.get(poem_url, timeout=timeout)
    if response.status_code != 200:
        # Do not try to read a poem out of an error page.
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Poem lines are identified by this exact inline style string.
    lines = soup.find_all('div', attrs={'style': 'text-indent: -1em; padding-left: 1em;'})
    return [line.text.strip() for line in lines]


        # print(author)
    # print(author.span.text.strip()[3:])
    
    # print(poem)
#     title = poem.find('a', class_='c-hdgSans c-hdgSans--three').text
#     author = poem.find('div', class_='c-txt-grayLight').text
#     link = poem.find('a', class_='c-card__img-link')['href']
    
#     # Do something with the extracted data, such as printing it to the console
#     print(title, author, link)

In [4]:
# Site root that every generated URL is built on.
website = 'https://www.poetryfoundation.org'
# Scraper instance handed to the helper functions. NOTE: the name `url` is
# load-bearing -- helper code in this notebook also looks it up as a global,
# so this cell must run before any scraping cell.
url = Scraper(base_url=website)


In [8]:
# Scrape every poem attributed to Shakespeare (performs network requests).
# The result is a list of [title, lines] pairs.
poems = get_poems_of_author(url, 'william shakespeare')
# Single-page check kept for debugging:
# parse_poem_page(url, '/poems/90067/sonnet-12-when-i-do-count-the-clock-that-tells-the-time-578cfa272532b')


In [9]:
# Index the scraped [title, lines] pairs by title; when a title occurs more
# than once, the last poem seen wins.
shakespeare_poems = dict((entry[0], entry[1]) for entry in poems)

In [10]:
# Display the title -> lines mapping built from the scraped poems.
shakespeare_poems

{}

In [42]:
# Persist the scraped poems so later sessions can skip the scrape. The context
# manager guarantees the file is flushed and closed even if pickling fails.
with open("shakespeare_poems.p", "wb") as poems_file:
    pickle.dump(shakespeare_poems, poems_file)

In [44]:
# Reload the saved poems to check the round trip. The context manager closes
# the file handle once loading is done.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that this notebook produced itself.
with open("shakespeare_poems.p", "rb") as poems_file:
    test = pickle.load(poems_file)