# Web scraping 

In [53]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

### Extracting Text

In [55]:
# Download the poetry index page.
# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops an HTTP error page (404/500) from being parsed
# downstream as if it were the real poem listing.
response = requests.get("https://poestories.com/poetry.php", timeout=30)
response.raise_for_status()
html_string = response.text

In [70]:
# Parse the downloaded page and collect every anchor tag.
document = BeautifulSoup(html_string, "html.parser")
poem_title_tags = document.find_all('a')

# Parallel lists: poem_titles[i] was published in publishing_dates[i].
poem_titles = []
publishing_dates = []
for tag in poem_title_tags:
    href = tag.get('href')
    title = tag.text
    # Only links that point at a poem page ('/read/...') and have visible
    # text are poems; skip everything else (navigation, ads, ...).
    if not (href and title and href.startswith('/read/')):
        continue
    # The publishing year is the text node that directly follows the link.
    # next_sibling can be None (link is the last child) or another Tag, and
    # neither can be cleaned as text later on, so fall back to an empty
    # string to keep both lists the same length.
    sibling = tag.next_sibling
    date = sibling if isinstance(sibling, str) else ''
    poem_titles.append(title)
    publishing_dates.append(date)

print(poem_titles)
print(publishing_dates)

['"Alone"', '"Annabel Lee"', '"The Bells"', '"The City in the Sea"', '"The Conqueror Worm"', '"Dream-Land"', '"A Dream Within A Dream"', '"Eldorado"', '"For Annie"', '"The Haunted Palace"', '"Lenore"', '"The Raven"', '"The Sleeper"', '"Sonnet - To Science"', '"Spirits of the Dead"', '"To The River"', '"A Valentine"', '"The Valley of Unrest"']
[' (1875) ', ' (1849) ', ' (1849) ', ' (1831) ', ' (1843) ', ' (1844) ', ' (1850) ', ' (1849) ', ' (1849) ', ' (1839) ', ' (1845) ', ' (1845) ', ' (1831) ', ' (1845) ', ' (1829) ', ' (1829) ', ' (1850) ', ' (1845) ']


### Cleaning Text

In [72]:
def remove_punc(text):
    """Return *text* with all listed punctuation characters removed.

    Characters that are not in the punctuation list (letters, digits,
    whitespace, ...) are kept in their original order.
    """
    punctuation = '!@#$%^&*()_-+={}[]:;"\'|<>,.?/~`'
    kept = []
    for symbol in text:
        if symbol in punctuation:
            continue
        kept.append(symbol)
    return ''.join(kept)

# Strip punctuation (including the surrounding quotes) from every title.
clean_poem_titles = list(map(remove_punc, poem_titles))
# Dates look like ' (1849) ': drop the parentheses, then the padding spaces.
clean_pub_dates = [remove_punc(raw_date).strip() for raw_date in publishing_dates]

print(clean_poem_titles)
print(clean_pub_dates)

['Alone', 'Annabel Lee', 'The Bells', 'The City in the Sea', 'The Conqueror Worm', 'DreamLand', 'A Dream Within A Dream', 'Eldorado', 'For Annie', 'The Haunted Palace', 'Lenore', 'The Raven', 'The Sleeper', 'Sonnet  To Science', 'Spirits of the Dead', 'To The River', 'A Valentine', 'The Valley of Unrest']
['1875', '1849', '1849', '1831', '1843', '1844', '1850', '1849', '1849', '1839', '1845', '1845', '1831', '1845', '1829', '1829', '1850', '1845']


### Creating the Corpus

In [60]:
# Column name -> column values for the corpus table.
d = {
    'Poem Title': clean_poem_titles,
    'Publishing Year': clean_pub_dates,
}
poems_df = pd.DataFrame(data=d)

# Persist the corpus next to the notebook (the default index column is kept).
poems_df.to_csv(path_or_buf='EAP_poems.csv')