# Scraping a Website

In this notebook we scrape all poems by Nizar Qabbani from nizariat.com and save each one as a text file, in order to build a large text corpus for training our neural network.

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import os

In [2]:
def hyperlink_scraper(url, poem_list):
    """
    Collect the absolute url of every poem linked from a listing page.

    Nothing is returned: each matching url is appended to ``poem_list``
    in the order the links appear on the page, in the form of:
    - https://www.nizariat.com/poetry.php?id=...
    - https://www.nizariat.com/poetry.php?id=...
    ...

    Input:
    - url(str): url of the listing page to scan
    - poem_list(list): list of poem urls that is extended in place

    """
    #read the page; the context manager closes the connection afterwards
    with urlopen(url) as html:
        #create a soup object (the response is consumed while it is still open)
        soup = BeautifulSoup(html, 'lxml')

    #loop over all hyperlinks that actually carry an href attribute
    for hyper in soup.find_all('a', href=True):
        #read the attribute directly rather than regex-matching the tag's
        #text: this is independent of attribute order and yields the
        #unescaped value (no '&amp;' left in the url)
        href = hyper['href']
        #if filed under poetry
        if href.startswith('poetry.php?id='):
            #get in proper url
            poem_list.append('https://www.nizariat.com/' + href)


In [3]:
#accumulator for every poem url found across the listing pages
poem_list = []
#all listing pages share this prefix; the page number is appended to it
base_url = 'https://www.nizariat.com/poertylist.php?page='
#visit listing pages 1 through 11 and gather their poem links
for page in range(1, 12):
    url = f'{base_url}{page}'
    hyperlink_scraper(url, poem_list)

In [4]:
def write_poem(url, folder='data'):
    """
    Download one poem page and save the poem's text as a UTF-8 text file.

    The text is taken from the first ``div`` with class ``poettxt`` on the
    page. The file is named ``<id>.txt`` after the ``id`` query parameter
    of ``url``; when the url has no such parameter, the last three
    characters of the url are used instead.

    Input:
    - url(str): url of the poem page
    - folder(str): directory to write the file into (created if missing)

    Raises:
    - IndexError: if the page contains no ``div`` with class ``poettxt``
    """
    #open the poem page; the context manager closes the connection afterwards
    with urlopen(url) as poem_html:
        #make bs4 object (the response is consumed while it is still open)
        poem_soup = BeautifulSoup(poem_html, 'lxml')

    #get the element holding the poem
    poem_par = poem_soup.find_all("div", {"class": "poettxt"})[0]

    #get text from page
    poem_text = poem_par.get_text()

    #make textfile name from the poem id; slicing the last three characters
    #would give names like 'd=5.txt' for short ids and make ids above 999
    #overwrite each other
    id_match = re.search(r'[?&]id=(\w+)', url)
    stem = id_match.group(1) if id_match else url[-3:]
    write_loc = os.path.join(folder, stem + '.txt')

    #make sure the target directory exists before writing
    if folder:
        os.makedirs(folder, exist_ok=True)

    #explicit utf-8 so the text is written the same way on every platform
    #instead of depending on the locale's default encoding
    with open(write_loc, 'w', encoding='utf-8') as f:
        f.write(poem_text)

In [5]:
#download every collected poem page and save its text to disk
for poem_url in poem_list:
    write_poem(poem_url)