# Brickset Merge Job Files 

Workflow: 4   

Goal: To concatenate the job files with raw data.   

Result: The raw data for all sets, de-duplicated by `set_no` and without the `url` column, is available in one file, `brickset_set_full.csv`.

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Make every bare expression in a cell produce output, not just the last one.
# The cells below rely on this to echo several .shape / .head() / .info()
# results from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

## Merge Job Files

In [3]:
# Directory holding the per-job raw extracts; change it here to relocate
# every input at once.
DATA_DIR = '../data'

# One CSV per job; the 0k / 3k / 7k / 10k suffix is part of each job's file name.
SET_0_FILE = f'{DATA_DIR}/brickset_set_data_0k.csv'
SET_3_FILE = f'{DATA_DIR}/brickset_set_data_3k.csv'
SET_7_FILE = f'{DATA_DIR}/brickset_set_data_7k.csv'
SET_10_FILE = f'{DATA_DIR}/brickset_set_data_10k.csv'

In [4]:
# Read each job's CSV into its own frame, in job order.
set_0_df, set_3_df, set_7_df, set_10_df = map(
    pd.read_csv,
    (SET_0_FILE, SET_3_FILE, SET_7_FILE, SET_10_FILE),
)

# Echo (rows, columns) for each job file as a quick sanity check.
set_0_df.shape
set_3_df.shape
set_7_df.shape
set_10_df.shape

(3499, 19)

(3499, 19)

(3491, 19)

(3399, 19)

In [5]:
# Stack the four job frames vertically; ignore_index=True renumbers the rows
# so the combined frame has a single continuous index.
job_frames = [set_0_df, set_3_df, set_7_df, set_10_df]
set_df = pd.concat(job_frames, ignore_index=True)

set_df.head()
set_df.info()

Unnamed: 0,set_no,name,url,theme_group,theme,subtheme,year,piece_cnt,minifig_cnt,inventory_url,minifig_url,store_price,current_price,rating_value,rating_votes,tags,set_type,packaging,notes
0,722-1,"Universal Building Set, 7+",/sets/722-1/Universal-Building-Set-7,Basic,Basic,,1980.0,301.0,0.0,,,$20.00,"New: $200, Used: $78",4.4,5.0,"4X4, Aircraft, Articulated Lorry, Boat, Car, C...",Normal,,
1,733-1,"Universal Building Set, 7+",/sets/733-1/Universal-Building-Set-7,Basic,Basic,,1980.0,533.0,0.0,,,$39.00,"New: $566, Used: $109",4.5,2.0,"Articulated Lorry, Base, Baseplate, Crane, For...",Normal,,
2,744-1,"Universal Building Set with Motor, 7+",/sets/744-1/Universal-Building-Set-with-Motor-7,Basic,Basic,,1980.0,537.0,0.0,,,$59.00,"New: $252, Used: $97",5.0,2.0,"8 Wheel Vehicle, Crane, Electric Motor, Mobile...",Normal,,
3,820-1,Red Plates Parts Pack,/sets/820-1/Red-Plates-Parts-Pack,Basic,Basic,Supplementaries,1980.0,34.0,0.0,,,$4.75,"New: Not known, Used: Not known",,,,Normal,,
4,822-1,Blue Plates Parts Pack,/sets/822-1/Blue-Plates-Parts-Pack,Basic,Basic,Supplementaries,1980.0,34.0,0.0,,,$4.75,"New: $29, Used: $5",,,,Normal,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13888 entries, 0 to 13887
Data columns (total 19 columns):
set_no           13887 non-null object
name             13887 non-null object
url              13887 non-null object
theme_group      13887 non-null object
theme            13887 non-null object
subtheme         10161 non-null object
year             13887 non-null float64
piece_cnt        10948 non-null float64
minifig_cnt      13887 non-null float64
inventory_url    7018 non-null object
minifig_url      6295 non-null object
store_price      9280 non-null object
current_price    13887 non-null object
rating_value     6870 non-null float64
rating_votes     6870 non-null object
tags             10358 non-null object
set_type         13886 non-null object
packaging        9017 non-null object
notes            5288 non-null object
dtypes: float64(4), object(15)
memory usage: 2.0+ MB


In [6]:
# Drop the sets that errored out from the url search: those rows have no
# set_no. reset_index(drop=True) renumbers the rows without first turning the
# old index into a column that then has to be dropped again.
set_df = set_df.dropna(subset=['set_no']).reset_index(drop=True)
set_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13887 entries, 0 to 13886
Data columns (total 19 columns):
set_no           13887 non-null object
name             13887 non-null object
url              13887 non-null object
theme_group      13887 non-null object
theme            13887 non-null object
subtheme         10161 non-null object
year             13887 non-null float64
piece_cnt        10948 non-null float64
minifig_cnt      13887 non-null float64
inventory_url    7018 non-null object
minifig_url      6295 non-null object
store_price      9280 non-null object
current_price    13887 non-null object
rating_value     6870 non-null float64
rating_votes     6870 non-null object
tags             10358 non-null object
set_type         13886 non-null object
packaging        9017 non-null object
notes            5288 non-null object
dtypes: float64(4), object(15)
memory usage: 2.0+ MB


In [7]:
# Inspect duplicated set numbers before removing them.
# keep=False flags every occurrence of a repeated set_no, so both copies of
# each duplicate appear in the listing.
doop_flag = set_df.duplicated('set_no', keep=False)
doop_df = set_df[doop_flag].sort_values(by='set_no')

doop_df.head(4)
doop_df.shape

Unnamed: 0,set_no,name,url,theme_group,theme,subtheme,year,piece_cnt,minifig_cnt,inventory_url,minifig_url,store_price,current_price,rating_value,rating_votes,tags,set_type,packaging,notes
7179,10287-1,Intelligent NXT Brick (Black),/sets/10287-1/Intelligent-NXT-Brick-(Black),Technical,Mindstorms,NXT,2009.0,1.0,0.0,,,"£107.65, $169.99","New: $99, Used: $230",,,,Normal,,
7084,10287-1,Intelligent NXT Brick (Black),/sets/10287-1/Intelligent-NXT-Brick-(Black),Technical,Mindstorms,NXT,2009.0,1.0,0.0,,,"£107.65, $169.99","New: $99, Used: $230",,,,Normal,,
7180,20007-1,Republic Attack Cruiser,/sets/20007-1/Republic-Attack-Cruiser,Licensed,Star Wars,The Clone Wars,2009.0,84.0,0.0,/inventories/20007-1,,,"New: $30, Used: $13",4.5,6.0,"Brickmaster, Cartoon, Cartoon Network, Galacti...",Normal,Polybag,[US] BrickMaster exclusive.
7085,20007-1,Republic Attack Cruiser,/sets/20007-1/Republic-Attack-Cruiser,Licensed,Star Wars,The Clone Wars,2009.0,84.0,0.0,/inventories/20007-1,,,"New: $30, Used: $13",4.5,6.0,"Brickmaster, Cartoon, Cartoon Network, Galacti...",Normal,Polybag,[US] BrickMaster exclusive.


(266, 19)

In [8]:
# Keep the first occurrence of each set_no and renumber the rows.
set_df = set_df.drop_duplicates(subset=['set_no']).reset_index(drop=True)

# The url column is not carried into the merged file. errors='ignore' keeps
# this cell safe to re-run after the column has already been removed.
set_df = set_df.drop(columns=['url'], errors='ignore')

# Invariant for the export that follows: exactly one row per set number.
assert set_df['set_no'].is_unique, 'duplicate set_no values remain after de-duplication'
set_df.shape

(13754, 18)

## Export to csv

In [9]:
# Write the merged frame without the row index, then report the file size
# on disk (bytes / 1e6, rounded to two decimals).
save_path = '../data/brickset_set_full.csv'
set_df.to_csv(save_path, index=False)

size_mb = round(os.path.getsize(save_path) / 1e6, 2)
f'{size_mb} mb'

'2.94 mb'