In [337]:
# Our jupyter/datascience-notebook Docker container comes with 
# BeautifulSoup4 and requests, both popular libraries!

from bs4 import BeautifulSoup
import requests

In [338]:
# Entry point for the scrape: Brickset's listing of sets for the year 2016.
START_URL = 'https://brickset.com/sets/year-2016'


In [339]:
# Fetch the start page. The timeout keeps a stalled connection from hanging
# the kernel, and raise_for_status() surfaces an HTTP error response before
# any parsing is attempted on it.
r = requests.get(START_URL, timeout=30)
r.raise_for_status()

In [340]:
# Echo the constant to confirm which URL was requested.
START_URL

'https://brickset.com/sets/year-2016'

In [341]:
# Parse the response text with the 'html.parser' backend, then display the
# type of the resulting object as a quick sanity check.
soup = BeautifulSoup(r.text, 'html.parser')
type(soup)

bs4.BeautifulSoup

In [342]:
# Preview only the beginning of the document: printing the whole page floods
# the cell output. 2000 characters is enough to see the <head> structure.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en">
 <!--<![endif]-->
 <head>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta content="LEGO set database: 2016 " name="description"/>
  <meta content="width=device-width, minimum-scale=1.0, maximum-scale=1.0" id="viewport" name="viewport"/>
  <link href="//brickset.com/feed/" rel="alternate" title="Brickset news and activity feed" type="application/rss+xml"/>
  <link href="https://brickset.com/sets/year-2016" rel="canonical"/>
  <meta content="New server" name="temp"/>
  <link href="/assets/images/icons/apple-icon-57x57.png" rel="apple-touch-icon" sizes="57x57"/>
  <link href="/assets/images/icons/apple-icon-60x60.png" rel="a

In [343]:
# Text content of the page's <title> element.
soup.title.string

'2016  | Brickset: LEGO set guide and database'

In [344]:
# Every <div class="meta"> on the page; each one wraps a set's heading and tags.
metas = soup.find_all("div", class_="meta")
print(metas)

[<div class="meta"><h1><a href="/sets/10251-1/Brick-Bank"><span>10251: </span> Brick Bank</a></h1><div class="tags"><a href="/sets/10251-1/Brick-Bank">10251-1</a> <a href="/sets/theme-Advanced-Models">Advanced Models</a> <a class="subtheme" href="/sets/theme-Advanced-Models/subtheme-Modular-Buildings">Modular Buildings</a> <a class="year" href="/sets/theme-Advanced-Models/year-2016">2016</a> </div><div class="tags"><span id="tags24994"><a href="/sets/tag-Bank">Bank</a> <a href="/sets/tag-Baseplate"> Baseplate</a> <a href="/sets/tag-Brick-Built-Tree"> Brick Built Tree</a> <a href="/sets/tag-Camera"> Camera</a> <a href="/sets/tag-Chandelier"> Chandelier</a> <a href="/sets/tag-Coffee-Machine"> Coffee Machine</a> <a href="/sets/tag-Creator-Expert"> Creator Expert</a> <a href="/sets/tag-D2c"> D2c</a> <a href="/sets/tag-Lamppost"> Lamppost</a> <a href="/sets/tag-Laundromat"> Laundromat</a> <a href="/sets/tag-Modular-Building"> Modular Building</a> <a href="/sets/tag-Safe"> Safe</a> <a href="

In [347]:
def get_titles(soup):
    """Return the heading text of every set listed on a parsed page.

    Each set on a listing page sits in a ``<div class="meta">`` whose
    ``<h1>`` holds the set number and name.

    Args:
        soup: A parsed page (a ``BeautifulSoup`` object, or anything with a
            compatible ``find_all`` method).

    Returns:
        list: The ``<h1>`` text of each ``div.meta``, in document order.
        Empty when the page contains no such elements.
    """
    # The "next page" link used to be looked up here and then thrown away.
    # That lookup raised IndexError on any page without such a link (e.g. the
    # last page), so it has been removed.
    return [meta.h1.text for meta in soup.find_all("div", {"class": "meta"})]

In [348]:
def parse_bricks(url, timeout=30):
    """Download one listing page and return the set titles found on it.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the kernel indefinitely.

    Returns:
        The list produced by ``get_titles`` for the parsed page.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of scraping an error page.
    response.raise_for_status()
    page = BeautifulSoup(response.content, 'html.parser')
    return get_titles(page)

In [349]:
# Scrape the set titles reachable from START_URL.
bricks = parse_bricks(START_URL)

In [350]:
# Show the full list of scraped titles.
print(bricks)

['10251:  Brick Bank', '10252:  Volkswagen Beetle', '10253:  Big Ben', '10254:  Winter Holiday Train', '10654:  XL Creative Brick Box', '10702:  Creative Building Set', '10705:  Creative Building Basket', '10720:  Police Helicopter Chase', '10721:  Iron Man vs. Loki', '10722:  Snake Showdown', "10723:  Ariel's Dolphin Carriage", '10724:  Batman & Superman vs. Lex Luthor', '10725:  Lost Temple', "10726:  Stephanie's Horse Carriage", "10727:  Emma's Ice Cream Truck", "10728:  Mia's Vet Clinic", "10729:  Cinderella's Carriage", '10801:  Baby Animals', '10802:  Savanna', '10803:  Arctic', '10804:  Jungle', '10805:  Around the World', '10806:  Horses', '10807:  Horse Trailer', '10808:  Little Plane']


In [351]:
# Spot-check two known entries of the scraped list.
assert bricks[0] == '10251:  Brick Bank'
assert bricks[9] == '10722:  Snake Showdown'

In [352]:
# First scraped title.
print(bricks[0])

10251:  Brick Bank


In [353]:
# Tenth scraped title (index 9).
print(bricks[9])

10722:  Snake Showdown


In [354]:
# One URL per results page, numbered 1 through 32.
urls = [
    "https://brickset.com/sets/year-2016/page-" + str(page_no)
    for page_no in range(1, 33)
]

In [355]:
# Inspect the generated page URLs.
print(urls)

['https://brickset.com/sets/year-2016/page-1', 'https://brickset.com/sets/year-2016/page-2', 'https://brickset.com/sets/year-2016/page-3', 'https://brickset.com/sets/year-2016/page-4', 'https://brickset.com/sets/year-2016/page-5', 'https://brickset.com/sets/year-2016/page-6', 'https://brickset.com/sets/year-2016/page-7', 'https://brickset.com/sets/year-2016/page-8', 'https://brickset.com/sets/year-2016/page-9', 'https://brickset.com/sets/year-2016/page-10', 'https://brickset.com/sets/year-2016/page-11', 'https://brickset.com/sets/year-2016/page-12', 'https://brickset.com/sets/year-2016/page-13', 'https://brickset.com/sets/year-2016/page-14', 'https://brickset.com/sets/year-2016/page-15', 'https://brickset.com/sets/year-2016/page-16', 'https://brickset.com/sets/year-2016/page-17', 'https://brickset.com/sets/year-2016/page-18', 'https://brickset.com/sets/year-2016/page-19', 'https://brickset.com/sets/year-2016/page-20', 'https://brickset.com/sets/year-2016/page-21', 'https://brickset.com

In [383]:
## First approach: build the page URLs from a known page count

# Module-level list that receives the scraped titles.
titles_list = []

# One URL per results page, numbered 1 through 32.
urls = [
    "https://brickset.com/sets/year-2016/page-" + str(page_no)
    for page_no in range(1, 33)
]
    
def parse_bricks(url, timeout=30):
    """Scrape set titles from one listing page or from several.

    Args:
        url: A single page URL (``str``) or an iterable of page URLs. The
            parameter keeps its singular name so existing calls still work.
        timeout: Seconds to wait for each response. Without a timeout a
            stalled connection would block the kernel indefinitely.

    Returns:
        list: A new list holding the titles of every page, in the order the
        pages were given.

    Raises:
        requests.HTTPError: If any page responds with a 4xx/5xx status.
        requests.Timeout: If a page does not answer within ``timeout``.
    """
    # Previously the loop variable shadowed the parameter and iterated over
    # the global ``urls``, so the argument was ignored. Results were also
    # appended to a module-level list, so each re-run returned duplicates.
    page_urls = [url] if isinstance(url, str) else url

    collected = []
    for page_url in page_urls:
        response = requests.get(page_url, timeout=timeout)
        # Fail loudly instead of scraping an error page.
        response.raise_for_status()
        page = BeautifulSoup(response.content, 'html.parser')
        collected.extend(get_titles(page))
    return collected

In [384]:
# Run the multi-page scrape; as the cell's last expression, the returned list
# is displayed below.
parse_bricks(urls)

['10251:  Brick Bank',
 '10252:  Volkswagen Beetle',
 '10253:  Big Ben',
 '10254:  Winter Holiday Train',
 '10654:  XL Creative Brick Box',
 '10702:  Creative Building Set',
 '10705:  Creative Building Basket',
 '10720:  Police Helicopter Chase',
 '10721:  Iron Man vs. Loki',
 '10722:  Snake Showdown',
 "10723:  Ariel's Dolphin Carriage",
 '10724:  Batman & Superman vs. Lex Luthor',
 '10725:  Lost Temple',
 "10726:  Stephanie's Horse Carriage",
 "10727:  Emma's Ice Cream Truck",
 "10728:  Mia's Vet Clinic",
 "10729:  Cinderella's Carriage",
 '10801:  Baby Animals',
 '10802:  Savanna',
 '10803:  Arctic',
 '10804:  Jungle',
 '10805:  Around the World',
 '10806:  Horses',
 '10807:  Horse Trailer',
 '10808:  Little Plane',
 '10809:  Police Patrol',
 '10810:  Push Train',
 '10811:  Backhoe Loader',
 '10812:  Truck & Tracked Excavator',
 '10813:  Big Construction Site',
 '10814:  Tow Truck',
 '10815:  My First Rocket',
 '10816:  My First Cars and Trucks',
 '10817:  Creative Chest',
 '10818: 

In [390]:
## Second approach: read the page URLs from the start page itself

START_URL = 'https://brickset.com/sets/year-2016'
urls = [START_URL]

# Every <li class="page"> on the already-parsed start page contributes the
# href of the anchor it contains.
links = soup.find_all('li', class_='page')
for page_item in links:
    anchor = page_item.find("a")
    urls.append(anchor["href"])

def parse_bricks(url, timeout=30):
    """Collect set titles from a single listing page or a sequence of them.

    Args:
        url: A single page URL (``str``) or an iterable of page URLs. The
            parameter keeps its singular name so existing calls still work.
        timeout: Seconds to wait for each response. Without a timeout a
            stalled connection would block the kernel indefinitely.

    Returns:
        list: A new list holding the titles of every page, in the order the
        pages were given.

    Raises:
        requests.HTTPError: If any page responds with a 4xx/5xx status.
        requests.Timeout: If a page does not answer within ``timeout``.
    """
    # Previously the loop variable shadowed the parameter and iterated over
    # the global ``urls``, so the argument was ignored. Results were appended
    # to a module-level list, so titles gathered by earlier runs were
    # returned again on every call.
    page_urls = [url] if isinstance(url, str) else url

    collected = []
    for page_url in page_urls:
        response = requests.get(page_url, timeout=timeout)
        # Fail loudly instead of scraping an error page.
        response.raise_for_status()
        page = BeautifulSoup(response.content, 'html.parser')
        collected.extend(get_titles(page))
    return collected

In [391]:
# Run the scrape over the URLs gathered from the start page; the returned
# list is displayed below.
parse_bricks(urls)

['10251:  Brick Bank',
 '10252:  Volkswagen Beetle',
 '10253:  Big Ben',
 '10254:  Winter Holiday Train',
 '10654:  XL Creative Brick Box',
 '10702:  Creative Building Set',
 '10705:  Creative Building Basket',
 '10720:  Police Helicopter Chase',
 '10721:  Iron Man vs. Loki',
 '10722:  Snake Showdown',
 "10723:  Ariel's Dolphin Carriage",
 '10724:  Batman & Superman vs. Lex Luthor',
 '10725:  Lost Temple',
 "10726:  Stephanie's Horse Carriage",
 "10727:  Emma's Ice Cream Truck",
 "10728:  Mia's Vet Clinic",
 "10729:  Cinderella's Carriage",
 '10801:  Baby Animals',
 '10802:  Savanna',
 '10803:  Arctic',
 '10804:  Jungle',
 '10805:  Around the World',
 '10806:  Horses',
 '10807:  Horse Trailer',
 '10808:  Little Plane',
 '10809:  Police Patrol',
 '10810:  Push Train',
 '10811:  Backhoe Loader',
 '10812:  Truck & Tracked Excavator',
 '10813:  Big Construction Site',
 '10814:  Tow Truck',
 '10815:  My First Rocket',
 '10816:  My First Cars and Trucks',
 '10817:  Creative Chest',
 '10818: 