# Brickset Sets Data

Workflow: 3   

Goal: Loop over the sets' url data and get all the data fields.    

Result: The set urls are split into 4 jobs, the result of each job is output as a csv.

In [1]:
import os
import pandas as pd

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')
import lego_helper as lh

## Scrape One Set

In [4]:
# get data from a new set

TEST_URL = '/sets/10255-1/Assembly-Square'
set1_dx = lh.get_set_data(TEST_URL)
set1_dx

{'set_no': '10255-1',
 'name': 'Assembly Square',
 'url': '/sets/10255-1/Assembly-Square',
 'set_type': 'Normal',
 'theme_group': 'Model making',
 'theme': 'Creator Expert',
 'subtheme': 'Modular Buildings',
 'year': '2017',
 'tags': 'Anniversary Set, Apartment, Baby, Baked Goods, Ballet, Baseplate, Bathroom, Bbq, Camera, Carriage, Coffee Machine, Creator Expert, D2c, Dentist, Dog, Fine Arts, Fountain, Kitchen, Lamppost, Lego Tower, Microscale, Modular Building, Musical, Parrot, Restaurant, Shop, Wedding',
 'piece_cnt': '4002',
 'minifig_cnt': '9',
 'inventory_url': '/inventories/10255-1',
 'minifig_url': '/minifigs/inset-10255-1',
 'store_price': '£179.99, $279.99, 239.99€',
 'current_price': 'New: $215, Used: $185',
 'packaging': 'Box',
 'notes': 'Celebrates ten years of the Modular Buildings line.Connects with 10182-1 10185-1 10190-1 10197-1 10232-1 ',
 'rating_value': '5.0',
 'rating_votes': '5'}

In [5]:
# get data from a test set

TEST_URL = '/sets/722-1/Universal-Building-Set-7'  #'/sets/KC028-1/C-3PO'
set2_dx = lh.get_set_data(TEST_URL)
set2_dx

{'set_no': '722-1',
 'name': 'Universal Building Set, 7+',
 'url': '/sets/722-1/Universal-Building-Set-7',
 'set_type': 'Normal',
 'theme_group': 'Basic',
 'theme': 'Basic',
 'subtheme': None,
 'year': '1980',
 'tags': '4X4, Aircraft, Articulated Lorry, Boat, Car, Crane, Helicopter, Steam Roller, Trailer, Truck',
 'piece_cnt': '301',
 'minifig_cnt': 0,
 'inventory_url': None,
 'minifig_url': None,
 'store_price': '$20.00',
 'current_price': 'New: $200, Used: $78',
 'packaging': None,
 'notes': None,
 'rating_value': '4.4',
 'rating_votes': '5'}

In [6]:
set_ls = [set1_dx, set2_dx]
set_df = pd.DataFrame(set_ls)
set_df = set_df[['set_no', 'name', 'url', 
                 'theme_group', 'theme', 'subtheme',  
                 'year', 'piece_cnt', 'minifig_cnt', 'inventory_url', 'minifig_url', 
                 'store_price', 'current_price', 'rating_value', 'rating_votes', 
                 'tags', 'set_type', 'packaging', 'notes',]]
set_df.head()
set_df.info()

Unnamed: 0,set_no,name,url,theme_group,theme,subtheme,year,piece_cnt,minifig_cnt,inventory_url,minifig_url,store_price,current_price,rating_value,rating_votes,tags,set_type,packaging,notes
0,10255-1,Assembly Square,/sets/10255-1/Assembly-Square,Model making,Creator Expert,Modular Buildings,2017,4002,9,/inventories/10255-1,/minifigs/inset-10255-1,"£179.99, $279.99, 239.99€","New: $215, Used: $185",5.0,5,"Anniversary Set, Apartment, Baby, Baked Goods,...",Normal,Box,Celebrates ten years of the Modular Buildings ...
1,722-1,"Universal Building Set, 7+",/sets/722-1/Universal-Building-Set-7,Basic,Basic,,1980,301,0,,,$20.00,"New: $200, Used: $78",4.4,5,"4X4, Aircraft, Articulated Lorry, Boat, Car, C...",Normal,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 19 columns):
set_no           2 non-null object
name             2 non-null object
url              2 non-null object
theme_group      2 non-null object
theme            2 non-null object
subtheme         1 non-null object
year             2 non-null object
piece_cnt        2 non-null object
minifig_cnt      2 non-null object
inventory_url    1 non-null object
minifig_url      1 non-null object
store_price      2 non-null object
current_price    2 non-null object
rating_value     2 non-null object
rating_votes     2 non-null object
tags             2 non-null object
set_type         2 non-null object
packaging        1 non-null object
notes            1 non-null object
dtypes: object(19)
memory usage: 384.0+ bytes


## Scrape All Sets

In [7]:
URL_FILE = '../data/brickset_set_url.csv'
url_df = pd.read_csv(URL_FILE)
url_df.head()
url_df.info()

Unnamed: 0,set_no,name,url
0,722,"Universal Building Set, 7+",/sets/722-1/Universal-Building-Set-7
1,733,"Universal Building Set, 7+",/sets/733-1/Universal-Building-Set-7
2,744,"Universal Building Set with Motor, 7+",/sets/744-1/Universal-Building-Set-with-Motor-7
3,820,Red Plates Parts Pack,/sets/820-1/Red-Plates-Parts-Pack
4,822,Blue Plates Parts Pack,/sets/822-1/Blue-Plates-Parts-Pack


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13899 entries, 0 to 13898
Data columns (total 3 columns):
set_no    13899 non-null object
name      13897 non-null object
url       13899 non-null object
dtypes: object(3)
memory usage: 325.8+ KB


In [8]:
# divide it into jobs for better error handling

url_0k_df = url_df[0:3499]
url_3k_df = url_df[3500:6999]
url_7k_df = url_df[7000:10499]
url_10k_df = url_df[10500:]

url_0k_df.shape
url_3k_df.shape
url_7k_df.shape
url_10k_df.shape

(3499, 3)

(3499, 3)

(3499, 3)

(3399, 3)

In [9]:
set_ls = []

for idx, row in url_7k_df.iterrows():
    set_url = row['url']
    set_name = row['name']
    if idx % 200 == 0:
        print(f'{idx}: {set_url}')

    try:
        cset_dx = lh.get_set_data(set_url)
        set_ls.append(cset_dx)
    except Exception as ex:
        print(f'Row: {idx} {set_url}')
        print(f'Error: {ex}') 

7000: /sets/6243-1/Brickbeard-s-Bounty
7200: /sets/30008-1/Snowman
7400: /sets/MMMB006-1/House
7600: /sets/8195-1/Turbo-Tow
7800: /sets/852940-1/Princess-Tamina-Key-Chain
8000: /sets/3658-1/Police-Helicopter
8200: /sets/8805-3/Royal-Guard
8400: /sets/2852724-1/Accelerometer-Sensor
Endpoint: https://brickset.com/sets/2853101-1/Gift-Card
Error: 524 Server Error: Origin Time-out for url: https://brickset.com/sets/2853101-1/Gift-Card
Row: 8403 /sets/2853101-1/Gift-Card
Error: 524 Server Error: Origin Time-out for url: https://brickset.com/sets/2853101-1/Gift-Card
Endpoint: https://brickset.com/sets/2853124-1/Gift-Card-Reload
Error: 524 Server Error: Origin Time-out for url: https://brickset.com/sets/2853124-1/Gift-Card-Reload
Row: 8404 /sets/2853124-1/Gift-Card-Reload
Error: 524 Server Error: Origin Time-out for url: https://brickset.com/sets/2853124-1/Gift-Card-Reload
Endpoint: https://brickset.com/sets/2853216-1/Infrared-Link-Sensor
Error: 524 Server Error: Origin Time-out for url: https

In [10]:
set_df = pd.DataFrame(set_ls)
set_df = set_df[['set_no', 'name', 'url', 
                 'theme_group', 'theme', 'subtheme',  
                 'year', 'piece_cnt', 'minifig_cnt', 'inventory_url', 'minifig_url',  
                 'store_price', 'current_price', 'rating_value', 'rating_votes', 
                 'tags', 'set_type', 'packaging', 'notes',]]
set_df.head()
set_df.info()

Unnamed: 0,set_no,name,url,theme_group,theme,subtheme,year,piece_cnt,minifig_cnt,inventory_url,minifig_url,store_price,current_price,rating_value,rating_votes,tags,set_type,packaging,notes
0,6243-1,Brickbeard's Bounty,/sets/6243-1/Brickbeard-s-Bounty,Historical,Pirates,,2009,592,8,/inventories/6243-1,/minifigs/inset-6243-1,"£59.99, $99.99","New: $215, Used: $133",4.5,21,"Captain Brickbeard, Galleon, Monkey, Parrot, R...",Normal,Box,
1,6253-1,Shipwreck Hideout,/sets/6253-1/Shipwreck-Hideout,Historical,Pirates,,2009,310,6,/inventories/6253-1,/minifigs/inset-6253-1,"£26.45, $39.99","New: $98, Used: $63",4.8,19,"Captain Brickbeard, Crab, Criminal Hideout, Fi...",Normal,Box,
2,6299-1,Pirates Advent Calendar,/sets/6299-1/Pirates-Advent-Calendar,Historical,Pirates,Seasonal,2009,148,8,/inventories/6299-1,/minifigs/inset-6299-1,£15.79,"New: $49, Used: $35",5.0,5,"Captain Brickbeard, Advent Calendar, Christmas...",Normal,Box,
3,6741-1,Mini Jet,/sets/6741-1/Mini-Jet,Model making,Creator,,2009,63,0,/inventories/6741-1,,"£4.99, $5.99","New: $8, Used: $3",4.4,8,"Aircraft, Jet Aircraft, Microscale, Multibuild...",Normal,Canister,
4,6742-1,Mini Off-Roader,/sets/6742-1/Mini-Off-Roader,Model making,Creator,,2009,64,0,/inventories/6742-1,,"£4.99, $5.99","New: $9, Used: $4",4.3,6,"Microscale, Multibuild, Off Roader, Telehandle...",Normal,Canister,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3491 entries, 0 to 3490
Data columns (total 19 columns):
set_no           3491 non-null object
name             3491 non-null object
url              3491 non-null object
theme_group      3491 non-null object
theme            3491 non-null object
subtheme         2758 non-null object
year             3491 non-null object
piece_cnt        2570 non-null object
minifig_cnt      3491 non-null object
inventory_url    1922 non-null object
minifig_url      1576 non-null object
store_price      2745 non-null object
current_price    3491 non-null object
rating_value     1975 non-null object
rating_votes     1975 non-null object
tags             3491 non-null object
set_type         3491 non-null object
packaging        2464 non-null object
notes            1572 non-null object
dtypes: object(19)
memory usage: 518.3+ KB


In [11]:
save_path = r'../data/brickset_set_data_7k.csv'
set_df.to_csv(save_path, index=False)
f'{round(os.path.getsize(save_path) /1e6, 2)} mb'

'0.92 mb'

In [12]:
set_ls = []

for idx, row in url_10k_df.iterrows():
    set_url = row['url']
    set_name = row['name']
    if idx % 200 == 0:
        print(f'{idx}: {set_url}')

    try:
        cset_dx = lh.get_set_data(set_url)
        set_ls.append(cset_dx)
    except Exception as ex:
        print(f'Error: {idx} {set_url}') 

10600: /sets/5004274-1/LEGO-Friends-Storage-Brick-1-Medium-Lilac
10800: /sets/40137-1/Submarine
11000: /sets/70781-1/Protector-of-Earth
11200: /sets/561507-1/Garden-set
11400: /sets/ISBN0241187567-1/LEGO-Ninjago-Build-Your-Own-Adventure
11600: /sets/41065-1/Rapunzel-s-Best-Day-Ever
11800: /sets/71011-7/Faun
12000: /sets/271601-1/Lance
12200: /sets/5005136-1/The-Force-Awakens-PS-4-Video-Game-–-Deluxe-Edition
12400: /sets/31062-1/Robo-Explorer
12600: /sets/70359-1/Lance-vs-Lightning
12800: /sets/76088-1/Thor-vs-Hulk-Arena-Clash
13000: /sets/5005355-1/Red-Blue-Brick-Print-Lunch-Bag
13200: /sets/30360-1/Arctic-Ice-Saw
13400: /sets/42082-1/Rough-Terrain-Crane
13600: /sets/75932-1/Jurassic-Park-Velociraptor-Chase
13800: /sets/5005533-1/Brick-Lunch-Bag-Black


In [13]:
set_df = pd.DataFrame(set_ls)
set_df = set_df[['set_no', 'name', 'url', 
                 'theme_group', 'theme', 'subtheme',  
                 'year', 'piece_cnt', 'minifig_cnt', 'inventory_url', 'minifig_url',  
                 'store_price', 'current_price', 'rating_value', 'rating_votes', 
                 'tags', 'set_type', 'packaging', 'notes',]]
set_df.head()
set_df.info()

Unnamed: 0,set_no,name,url,theme_group,theme,subtheme,year,piece_cnt,minifig_cnt,inventory_url,minifig_url,store_price,current_price,rating_value,rating_votes,tags,set_type,packaging,notes
0,5002912-1,R2 D2 Key Light,/sets/5002912-1/R2-D2-Key-Light,Miscellaneous,Gear,Lights,2014,,0,,,"£9.99, 6.49€","New: Not known, Used: Not known",5.0,1review,,Gear,,
1,5002913-1,Superman Key Light,/sets/5002913-1/Superman-Key-Light,Miscellaneous,Gear,Lights,2014,,0,,,"£9.99, $11.99, 12.99€","New: Not known, Used: Not known",,,,Gear,,
2,5002914-1,THE LEGO MOVIE Emmet Key Light,/sets/5002914-1/THE-LEGO-MOVIE-Emmet-Key-Light,Miscellaneous,Gear,Lights,2014,,0,,,"£9.99, $11.99, 9.09€","New: Not known, Used: Not known",,,,Gear,,
3,5002915-1,Batman Key Light,/sets/5002915-1/Batman-Key-Light,Miscellaneous,Gear,Lights,2014,,0,,,"£9.99, 12.99€","New: Not known, Used: Not known",,,,Gear,,
4,5002916-1,The LEGO Movie Unikitty Key Light,/sets/5002916-1/The-LEGO-Movie-Unikitty-Key-Light,Miscellaneous,Gear,Lights,2014,,0,,,"£9.99, $13.99, 9.09€","New: Not known, Used: Not known",,,,Gear,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3399 entries, 0 to 3398
Data columns (total 19 columns):
set_no           3399 non-null object
name             3399 non-null object
url              3399 non-null object
theme_group      3399 non-null object
theme            3399 non-null object
subtheme         2866 non-null object
year             3399 non-null object
piece_cnt        2355 non-null object
minifig_cnt      3399 non-null object
inventory_url    1744 non-null object
minifig_url      1647 non-null object
store_price      2402 non-null object
current_price    3399 non-null object
rating_value     1509 non-null object
rating_votes     1509 non-null object
tags             3399 non-null object
set_type         3399 non-null object
packaging        2595 non-null object
notes            1033 non-null object
dtypes: object(19)
memory usage: 504.6+ KB


In [14]:
save_path = r'../data/brickset_set_data_10k.csv'
set_df.to_csv(save_path, index=False)
f'{round(os.path.getsize(save_path) /1e6, 2)} mb'

'0.92 mb'