In [12]:
import requests, re
from bs4 import BeautifulSoup



In [13]:
BASE_URL = 'http://quotes.toscrape.com'



In [14]:
def get_links(url):
    authors = []
    # Get page located at url:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Get all links corresponding to authors:
    author_links = soup.find_all('a', href=re.compile(r'/author/'))
    
    # Loop over these:
    for link in author_links:
        # Create full URL from relative link
        author_url = BASE_URL + link['href']
        # if a link is not in authors, add it:
        if author_url not in authors:
            authors.append(author_url)
            
    # Return results
    return authors

# Test:
authors = get_links(BASE_URL)
print(authors)


['http://quotes.toscrape.com/author/Albert-Einstein', 'http://quotes.toscrape.com/author/J-K-Rowling', 'http://quotes.toscrape.com/author/Jane-Austen', 'http://quotes.toscrape.com/author/Marilyn-Monroe', 'http://quotes.toscrape.com/author/Andre-Gide', 'http://quotes.toscrape.com/author/Thomas-A-Edison', 'http://quotes.toscrape.com/author/Eleanor-Roosevelt', 'http://quotes.toscrape.com/author/Steve-Martin']


In [15]:
def collect(url, authors, limit=None):
    # Add links contained in page located at url to the authors being computed
    authors_on_page = get_links(url)
    authors.extend([x for x in authors_on_page if x not in authors])
    
    if limit is not None and limit <= 1:
        return

    # Get page located at url:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Get url of next page
    next_button = soup.find('li', class_='next')
    if next_button:
        next_url = BASE_URL + next_button.find('a')['href']
        
        # Recursively collect links (if any)
        collect(next_url, authors, None if limit is None else limit-1)

# Test
authors = []
collect(BASE_URL, authors, limit=3)
print(authors)


['http://quotes.toscrape.com/author/Albert-Einstein', 'http://quotes.toscrape.com/author/J-K-Rowling', 'http://quotes.toscrape.com/author/Jane-Austen', 'http://quotes.toscrape.com/author/Marilyn-Monroe', 'http://quotes.toscrape.com/author/Andre-Gide', 'http://quotes.toscrape.com/author/Thomas-A-Edison', 'http://quotes.toscrape.com/author/Eleanor-Roosevelt', 'http://quotes.toscrape.com/author/Steve-Martin', 'http://quotes.toscrape.com/author/Bob-Marley', 'http://quotes.toscrape.com/author/Dr-Seuss', 'http://quotes.toscrape.com/author/Douglas-Adams', 'http://quotes.toscrape.com/author/Elie-Wiesel', 'http://quotes.toscrape.com/author/Friedrich-Nietzsche', 'http://quotes.toscrape.com/author/Mark-Twain', 'http://quotes.toscrape.com/author/Allen-Saunders', 'http://quotes.toscrape.com/author/Pablo-Neruda', 'http://quotes.toscrape.com/author/Ralph-Waldo-Emerson', 'http://quotes.toscrape.com/author/Mother-Teresa', 'http://quotes.toscrape.com/author/Garrison-Keillor', 'http://quotes.toscrape.com

Test

In [16]:
for author in authors:
    response = requests.get(author)
    soup = BeautifulSoup(response.text,"html.parser")
    names = soup.find('h3', class_='author-title')
    for name in names:
        print(name)

Albert Einstein
J.K. Rowling
Jane Austen
Marilyn Monroe
André Gide
Thomas A. Edison
Eleanor Roosevelt
Steve Martin
Bob Marley
Dr. Seuss
Douglas Adams
Elie Wiesel
Friedrich Nietzsche
Mark Twain
Allen Saunders
Pablo Neruda
Ralph Waldo Emerson
Mother Teresa
Garrison Keillor
Jim Henson


Question 1.3 : get actual biographies
For each of the links computed in the previous question, retrieve the corresponding webpage and extract the biography it contains. To do so, fill the get_biography function below. It will feed a list of dictionaries of the following form:

bios = [{name: '...', birth_date: '...', birth_place: '...', bio: '...'}, ...]

In [17]:
def get_biography(url):
    # Get page located at URL and parse it
    response = requests.get(url)
    # Get name with BeautifulSoup
    
    soup = BeautifulSoup(response.text,'html.parser')
    name = soup.find('h3', class_='author-title').get_text()
    # Get birth date
    birth_date = soup.find('span',class_='author-born-date').get_text()
    # Get birth place
    birth_place= soup.find('span',class_='author-born-location').get_text()
    # Get bio
    bio = soup.find('div',class_='author-description').get_text()
    return {'name':name, 'birth_date': birth_date, 'birth_place': birth_place, 'bio': bio}

def get_bios(urls):
    bios = []
    for u in urls:
        bios.append(get_biography(u))
    return bios

#Test
bios=get_bios(authors)
print(bios)

[{'name': 'Albert Einstein', 'birth_date': 'March 14, 1879', 'birth_place': 'in Ulm, Germany', 'bio': '\n        In 1879, Albert Einstein was born in Ulm, Germany. He completed his Ph.D. at the University of Zurich by 1909. His 1905 paper explaining the photoelectric effect, the basis of electronics, earned him the Nobel Prize in 1921. His first paper on Special Relativity Theory, also published in 1905, changed the world. After the rise of the Nazi party, Einstein made Princeton his permanent home, becoming a U.S. citizen in 1940. Einstein, a pacifist during World War I, stayed a firm proponent of social justice and responsibility. He chaired the Emergency Committee of Atomic Scientists, which organized to alert the public to the dangers of atomic warfare.At a symposium, he advised: "In their struggle for the ethical good, teachers of religion must have the stature to give up the doctrine of a personal God, that is, give up that source of fear and hope which in the past placed such va

Question 1.4: save your dataset
Finally, write a save function which takes as an input a list of biographies as computed above and save them in JSON on disk (the filename being an input parameter).

In [18]:
import json

def save(filename, dataset):
    with open(filename,'w') as file:
        json.dump(dataset,file)
    

save('bios.json', bios)

**Scrapy**

In [19]:
import scrapy
import requests, re


In [20]:
name = "wikipedia"

start_urls = ["https://en.wikipedia.org/wiki/List_of_French_artists"]


In [21]:
class WikipediaSpider(scrapy.Spider): #Something is missing here. What exactly?
    name = "wikipedia"

    start_urls = ["https://en.wikipedia.org/wiki/List_of_French_artists"]
    def parse(self, response):
        #get all links which are in lists on the page
        links = response.css('table.sidebar.nomobile.nowraplinks.plainlist ul li a::attr(href)').getall()

        


In [22]:
import scrapy
import json

class WikipediaSpider(scrapy.Spider): #Something is missing here. What exactly?
    name = "wikipedia"

    start_urls = ["https://en.wikipedia.org/wiki/List_of_French_artists"]

    def parse(self, response):
        links = response.css('table.sidebar.nomobile.nowraplinks.plainlist ul li a::attr(href)').getall()

        list_see_also = response.css('span#See_also + span a::attr(href)').getall()

        res_list = list(set(list_els) - set(list_see_also))
        for link in res_list:
            #check that the link actually exists and is not red
            if 'class="new"' not in link:
                yield response.follow(link, callback=self.parse_artist)
        
    def parse_artist(self, response):
        url = #get url of the page
        name = # get name of the artist
        paragraph = # get the first paragraph
        yield {'url': url,
               'name': name,
               'paragraph': paragraph}
        
        
if __name__=='__main__':
    import scrapy.crawler
    
    process = scrapy.crawler.CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        'FEEDS': {
            "artists.json": {"format": "json"},
        },
    })
    process.crawl(WikipediaSpider)
    process.start()
    process.stop()

SyntaxError: invalid syntax (3210075805.py, line 21)