# Brickset Sets Data

Workflow: 3   

Goal: Loop over the sets' url data and get all the data fields.    

Result: The set urls are split into 4 jobs, the result of each job is output as a csv.

In [1]:
import os
import pandas as pd

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')
import lego_helper as lh

## Scrape One Set

In [4]:
# get data from a new set

TEST_URL = '/sets/4758-1/Hogwarts-Express' 
set1_dx = lh.get_set_data(TEST_URL)
set1_dx

{'set_no': '4758-1',
 'name': 'Hogwarts Express',
 'url': '/sets/4758-1/Hogwarts-Express',
 'set_type': 'Normal',
 'theme_group': 'Licensed',
 'theme': 'Harry Potter',
 'subtheme': 'Prisoner of Azkaban',
 'year': '2004',
 'tags': 'Dementor, Harry Potter, Professor Remus Lupin, Ron Weasley, Lamppost, Magic, Owl',
 'piece_cnt': '389',
 'minifig_cnt': '4',
 'inventory_url': '/inventories/4758-1',
 'minifig_url': '/minifigs/inset-4758-1',
 'store_price': '£34.99, $40.00',
 'current_price': 'New: $124, Used: $96',
 'packaging': 'Box',
 'dimensions': '33.6 x 28.8 x 7.2 cm (13.2 x 11.3 x 2.8 in)',
 'weight': '0.79Kg (1.74 lb)',
 'notes': None,
 'rating_value': '4.3',
 'rating_votes': '4'}

In [5]:
# get data from a test set

TEST_URL = '/sets/722-1/Universal-Building-Set-7'  #'/sets/KC028-1/C-3PO'
set2_dx = lh.get_set_data(TEST_URL)
set2_dx

{'set_no': '722-1',
 'name': 'Universal Building Set, 7+',
 'url': '/sets/722-1/Universal-Building-Set-7',
 'set_type': 'Normal',
 'theme_group': 'Basic',
 'theme': 'Basic',
 'subtheme': None,
 'year': '1980',
 'tags': 'Aircraft, Helicopter, Truck, Car, Boat, Crane, Trailer, 4X4',
 'piece_cnt': '301',
 'minifig_cnt': 0,
 'inventory_url': None,
 'minifig_url': None,
 'store_price': '$20.00',
 'current_price': 'New: $200, Used: $78',
 'packaging': None,
 'dimensions': None,
 'weight': None,
 'notes': None,
 'rating_value': '4.4',
 'rating_votes': '5'}

In [6]:
set_ls = [set1_dx, set2_dx]
set_df = pd.DataFrame(set_ls)
set_df = set_df[['set_no', 'name',  
                 'theme_group', 'theme', 'subtheme',  
                 'year', 'dimensions', 'weight',  
                 'piece_cnt', 'minifig_cnt', 'inventory_url', 'minifig_url',  
                 'store_price', 'current_price', 'rating_value', 'rating_votes', 
                 'tags', 'set_type', 'packaging', 'notes',]]
set_df.head()
set_df.info()

Unnamed: 0,set_no,name,theme_group,theme,subtheme,year,dimensions,weight,piece_cnt,minifig_cnt,inventory_url,minifig_url,store_price,current_price,rating_value,rating_votes,tags,set_type,packaging,notes
0,4758-1,Hogwarts Express,Licensed,Harry Potter,Prisoner of Azkaban,2004,33.6 x 28.8 x 7.2 cm (13.2 x 11.3 x 2.8 in),0.79Kg (1.74 lb),389,4,/inventories/4758-1,/minifigs/inset-4758-1,"£34.99, $40.00","New: $124, Used: $96",4.3,4,"Dementor, Harry Potter, Professor Remus Lupin,...",Normal,Box,
1,722-1,"Universal Building Set, 7+",Basic,Basic,,1980,,,301,0,,,$20.00,"New: $200, Used: $78",4.4,5,"Aircraft, Helicopter, Truck, Car, Boat, Crane,...",Normal,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 20 columns):
set_no           2 non-null object
name             2 non-null object
theme_group      2 non-null object
theme            2 non-null object
subtheme         1 non-null object
year             2 non-null object
dimensions       1 non-null object
weight           1 non-null object
piece_cnt        2 non-null object
minifig_cnt      2 non-null object
inventory_url    1 non-null object
minifig_url      1 non-null object
store_price      2 non-null object
current_price    2 non-null object
rating_value     2 non-null object
rating_votes     2 non-null object
tags             2 non-null object
set_type         2 non-null object
packaging        1 non-null object
notes            0 non-null object
dtypes: object(20)
memory usage: 400.0+ bytes


## Scrape All Sets

In [7]:
URL_FILE = '../data/brickset_set_url.csv'
url_df = pd.read_csv(URL_FILE)
url_df.head()
url_df.info()

Unnamed: 0,set_no,name,url
0,722,"Universal Building Set, 7+",/sets/722-1/Universal-Building-Set-7
1,733,"Universal Building Set, 7+",/sets/733-1/Universal-Building-Set-7
2,744,"Universal Building Set with Motor, 7+",/sets/744-1/Universal-Building-Set-with-Motor-7
3,820,Red Plates Parts Pack,/sets/820-1/Red-Plates-Parts-Pack
4,822,Blue Plates Parts Pack,/sets/822-1/Blue-Plates-Parts-Pack


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13899 entries, 0 to 13898
Data columns (total 3 columns):
set_no    13899 non-null object
name      13897 non-null object
url       13899 non-null object
dtypes: object(3)
memory usage: 325.8+ KB


In [8]:
# divide it into jobs for better error handling

url_0k_df = url_df[0:3499]
url_3k_df = url_df[3500:6999]
url_7k_df = url_df[7000:10499]
url_10k_df = url_df[10500:]

url_0k_df.shape
url_3k_df.shape
url_7k_df.shape
url_10k_df.shape

(3499, 3)

(3499, 3)

(3499, 3)

(3399, 3)

In [9]:
# setup function to handle each job

def handle_job(url_df):
    
    set_ls = []
    
    for idx, row in url_df.iterrows():
        set_url = row['url']
        set_name = row['name']
        if idx % 500 == 0:
            print(f'{idx}: {set_url}')

        try:
            cset_dx = lh.get_set_data(set_url)
            set_ls.append(cset_dx)
        except Exception as ex:
            print(f'Row: {idx} {set_url}')
            print(f'Error: {ex}') 
            
    return set_ls

def make_job_df(set_ls):
    set_df = pd.DataFrame(set_ls)
    set_df = set_df[['set_no', 'name',  
                 'theme_group', 'theme', 'subtheme',  
                 'year', 'dimensions', 'weight',  
                 'piece_cnt', 'minifig_cnt', 'inventory_url', 'minifig_url',  
                 'store_price', 'current_price', 'rating_value', 'rating_votes', 
                 'tags', 'set_type', 'packaging', 'notes',]]
    return set_df

In [10]:
# run 1st job 

set_ls = handle_job(url_0k_df)

0: /sets/722-1/Universal-Building-Set-7
500: /sets/2647-1/Farm-Animals
1000: /sets/1649-1/Sea-Skimmer
1500: /sets/1597-1/Castle-Value-Pack
2000: /sets/1856-1/Water-Park-Tub
2500: /sets/2964-1/Space-Spider
3000: /sets/5313-1/Town-Space-Accessories


In [11]:
set_df = make_job_df(set_ls)
set_df.head()
set_df.info()

Unnamed: 0,set_no,name,theme_group,theme,subtheme,year,dimensions,weight,piece_cnt,minifig_cnt,inventory_url,minifig_url,store_price,current_price,rating_value,rating_votes,tags,set_type,packaging,notes
0,722-1,"Universal Building Set, 7+",Basic,Basic,,1980,,,301,0,,,$20.00,"New: $200, Used: $78",4.4,5.0,"Aircraft, Helicopter, Truck, Car, Boat, Crane,...",Normal,,
1,733-1,"Universal Building Set, 7+",Basic,Basic,,1980,,,533,0,,,$39.00,"New: $566, Used: $109",4.5,2.0,"Helicopter, Truck, Baseplate, Crane, Racing, Base",Normal,,
2,744-1,"Universal Building Set with Motor, 7+",Basic,Basic,,1980,,,537,0,,,$59.00,"New: $252, Used: $97",5.0,2.0,"Truck, Crane, Windmill",Normal,,
3,820-1,Red Plates Parts Pack,Basic,Basic,Supplementaries,1980,,,34,0,,,$4.75,"New: Not known, Used: Not known",,,,Normal,,
4,822-1,Blue Plates Parts Pack,Basic,Basic,Supplementaries,1980,,,34,0,,,$4.75,"New: $29, Used: $5",,,,Normal,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3499 entries, 0 to 3498
Data columns (total 20 columns):
set_no           3499 non-null object
name             3499 non-null object
theme_group      3499 non-null object
theme            3499 non-null object
subtheme         1881 non-null object
year             3499 non-null object
dimensions       14 non-null object
weight           14 non-null object
piece_cnt        3334 non-null object
minifig_cnt      3499 non-null object
inventory_url    1272 non-null object
minifig_url      1911 non-null object
store_price      1663 non-null object
current_price    3499 non-null object
rating_value     1663 non-null object
rating_votes     1663 non-null object
tags             3499 non-null object
set_type         3499 non-null object
packaging        2328 non-null object
notes            1216 non-null object
dtypes: object(20)
memory usage: 546.8+ KB


In [12]:
save_path = r'../data/brickset_job_0k.csv'
set_df.to_csv(save_path, index=False)
f'{round(os.path.getsize(save_path) /1e6, 2)} mb'

'0.57 mb'

In [13]:
# get 2nd job

set_ls = handle_job(url_3k_df)

3500: /sets/9787-1/FIRST-LEGO-League-(FLL)-Challenge-2000-Expansion-Set
4000: /sets/3806-1/Gigamesh-G60
4500: /sets/4484-1/X-Wing-Fighter-TIE-Advanced
5000: /sets/8603-1/Whenua
5500: /sets/65783-1/Bonus-Value-Pack
6000: /sets/EL895-1/Sword-of-Lord-Vladek
6500: /sets/XB3076-1/LEGO-Star-Wars-The-Complete-Saga


In [14]:
set_df = make_job_df(set_ls)
set_df.head()
set_df.info()

Unnamed: 0,set_no,name,theme_group,theme,subtheme,year,dimensions,weight,piece_cnt,minifig_cnt,inventory_url,minifig_url,store_price,current_price,rating_value,rating_votes,tags,set_type,packaging,notes
0,9787-1,FIRST LEGO League (FLL) Challenge 2000 - Expan...,Educational,Education,FIRST LEGO League,2000,,,,0,,,,"New: Not known, Used: Not known",,,,Normal,Box,
1,9872-1,FIRST LEGO League (FLL) Challenge 2000 - Volca...,Educational,Education,FIRST LEGO League,2000,,,174.0,0,,,,"New: Not known, Used: Not known",,,,Normal,Box,
2,9917-1,DCP Sensor Connector Cable,Educational,Education,Mindstorms,2000,,,1.0,0,,,$59.00,"New: $3, Used: Not known",,,Electric,Normal,,Connector cable to link DCP sensors to the RCX...
3,926097-1,Lunchbox Blue,Miscellaneous,Gear,Housewares,2000,,,,0,,,£4.99,"New: Not known, Used: Not known",,,,Gear,,
4,B001-1,"1x4x5 Black Window Frames, Transparent Blue Panes",Basic,Bulk Bricks,,2000,,,20.0,0,,,,"New: $20, Used: Not known",,,Polybag,Other,Polybag,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3499 entries, 0 to 3498
Data columns (total 20 columns):
set_no           3498 non-null object
name             3498 non-null object
theme_group      3498 non-null object
theme            3498 non-null object
subtheme         2656 non-null object
year             3498 non-null object
dimensions       530 non-null object
weight           550 non-null object
piece_cnt        2689 non-null object
minifig_cnt      3498 non-null object
inventory_url    2080 non-null object
minifig_url      1161 non-null object
store_price      2470 non-null object
current_price    3498 non-null object
rating_value     1723 non-null object
rating_votes     1723 non-null object
tags             3498 non-null object
set_type         3498 non-null object
packaging        1630 non-null object
notes            1471 non-null object
dtypes: object(20)
memory usage: 546.8+ KB


In [15]:
save_path = r'../data/brickset_job_3k.csv'
set_df.to_csv(save_path, index=False)
f'{round(os.path.getsize(save_path) /1e6, 2)} mb'

'0.68 mb'

In [16]:
# get 3rd job

set_ls = handle_job(url_7k_df)

7000: /sets/6243-1/Brickbeard-s-Bounty
7500: /sets/7157-1/Thunder
8000: /sets/3658-1/Police-Helicopter
8500: /sets/M8785506-1/Paper
9000: /sets/850457-1/LEGOLAND-Magnet
9500: /sets/66475-1/Super-Pack
10000: /sets/30113-1/Stephanie-s-Bakery-Stand


In [17]:
set_df = make_job_df(set_ls)
set_df.head()
set_df.info()

Unnamed: 0,set_no,name,theme_group,theme,subtheme,year,dimensions,weight,piece_cnt,minifig_cnt,inventory_url,minifig_url,store_price,current_price,rating_value,rating_votes,tags,set_type,packaging,notes
0,6243-1,Brickbeard's Bounty,Historical,Pirates,,2009,58.2 x 37.8 x 8.7 cm (22.9 x 14.9 x 3.4 in),1.6Kg (3.52 lb),592,8,/inventories/6243-1,/minifigs/inset-6243-1,"£59.99, $99.99","New: $217, Used: $132",4.5,21,"Captain Brickbeard, Ship, Shark, Galleon, Monk...",Normal,Box,
1,6253-1,Shipwreck Hideout,Historical,Pirates,,2009,,,310,6,/inventories/6253-1,/minifigs/inset-6253-1,"£26.45, $39.99","New: $69, Used: $63",4.8,19,"Captain Brickbeard, Snake, Fish, Skeletons, Pa...",Normal,Box,
2,6299-1,Pirates Advent Calendar,Historical,Pirates,Seasonal,2009,,,148,8,/inventories/6299-1,/minifigs/inset-6299-1,£15.79,"New: $52, Used: $35",5.0,5,"Captain Brickbeard, Christmas, Fish, Skeletons...",Normal,Box,
3,6741-1,Mini Jet,Model making,Creator,,2009,10.2 x 14.4 x 4.8 cm (4 x 5.6 x 1.9 in),0.11Kg (0.24 lb),63,0,/inventories/6741-1,,"£4.99, $5.99","New: $9, Used: $3",4.4,8,"Aircraft, Microscale, Space, Multibuild, Speed...",Normal,Canister,
4,6742-1,Mini Off-Roader,Model making,Creator,,2009,10.2 x 14.4 x 4.8 cm (4 x 5.6 x 1.9 in),0.11Kg (0.24 lb),64,0,/inventories/6742-1,,"£4.99, $5.99","New: $10, Used: $3",4.3,6,"Microscale, Multibuild, Tractor, Telehandler",Normal,Canister,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3499 entries, 0 to 3498
Data columns (total 20 columns):
set_no           3499 non-null object
name             3499 non-null object
theme_group      3499 non-null object
theme            3499 non-null object
subtheme         2766 non-null object
year             3499 non-null object
dimensions       1195 non-null object
weight           1362 non-null object
piece_cnt        2574 non-null object
minifig_cnt      3499 non-null object
inventory_url    1925 non-null object
minifig_url      1578 non-null object
store_price      2751 non-null object
current_price    3499 non-null object
rating_value     1979 non-null object
rating_votes     1979 non-null object
tags             3499 non-null object
set_type         3499 non-null object
packaging        2468 non-null object
notes            1574 non-null object
dtypes: object(20)
memory usage: 546.8+ KB


In [18]:
save_path = r'../data/brickset_job_7k.csv'
set_df.to_csv(save_path, index=False)
f'{round(os.path.getsize(save_path) /1e6, 2)} mb'

'0.81 mb'

In [19]:
# get 4th job

set_ls = handle_job(url_10k_df)

10500: /sets/5002912-1/R2-D2-Key-Light
11000: /sets/70781-1/Protector-of-Earth
11500: /sets/21123-1/The-Iron-Golem
12000: /sets/271601-1/Lance
12500: /sets/41322-1/Snow-Resort-Ice-Rink
13000: /sets/5005355-1/Red-Blue-Brick-Print-Lunch-Bag
13500: /sets/71021-5/Firework-Guy


In [20]:
set_df = make_job_df(set_ls)
set_df.head()
set_df.info()

Unnamed: 0,set_no,name,theme_group,theme,subtheme,year,dimensions,weight,piece_cnt,minifig_cnt,inventory_url,minifig_url,store_price,current_price,rating_value,rating_votes,tags,set_type,packaging,notes
0,5002912-1,R2 D2 Key Light,Miscellaneous,Gear,Lights,2014,,,,0,,,"£9.99, 6.49€","New: Not known, Used: Not known",5.0,1review,,Gear,,
1,5002913-1,Superman Key Light,Miscellaneous,Gear,Lights,2014,,,,0,,,"£9.99, $11.99, 12.99€","New: Not known, Used: Not known",,,,Gear,,
2,5002914-1,THE LEGO MOVIE Emmet Key Light,Miscellaneous,Gear,Lights,2014,,,,0,,,"£9.99, $11.99, 9.09€","New: Not known, Used: Not known",,,,Gear,,
3,5002915-1,Batman Key Light,Miscellaneous,Gear,Lights,2014,,,,0,,,"£9.99, 12.99€","New: Not known, Used: Not known",,,,Gear,,
4,5002916-1,The LEGO Movie Unikitty Key Light,Miscellaneous,Gear,Lights,2014,,,,0,,,"£9.99, $13.99, 9.09€","New: Not known, Used: Not known",,,,Gear,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3399 entries, 0 to 3398
Data columns (total 20 columns):
set_no           3399 non-null object
name             3399 non-null object
theme_group      3399 non-null object
theme            3399 non-null object
subtheme         2866 non-null object
year             3399 non-null object
dimensions       1209 non-null object
weight           1179 non-null object
piece_cnt        2355 non-null object
minifig_cnt      3399 non-null object
inventory_url    1744 non-null object
minifig_url      1647 non-null object
store_price      2402 non-null object
current_price    3399 non-null object
rating_value     1517 non-null object
rating_votes     1517 non-null object
tags             3399 non-null object
set_type         3399 non-null object
packaging        2595 non-null object
notes            1033 non-null object
dtypes: object(20)
memory usage: 531.2+ KB


In [21]:
save_path = r'../data/brickset_job_10k.csv'
set_df.to_csv(save_path, index=False)
f'{round(os.path.getsize(save_path) /1e6, 2)} mb'

'0.78 mb'