# Brickset Filter Sets

Workflow: 2

Goal: Start with the full cleaned Brickset sets file and remove sets that are not appropriate for analysis (including them would be comparing apples to oranges). This includes filtering out baby sets (whose pieces are larger than regular LEGO bricks), keychains, games, esoteric promotional sets, etc.

Result: Creates the file ```brickset_set_filter.csv```.

In [1]:
import os
import pandas as pd

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not only the last one
# (e.g. the data-loading cell shows the head() preview even though info() follows it).
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
%load_ext autoreload
%autoreload 2         # reload custom py file eveytime a new cell is run

import sys
sys.path.append('../')
import lego_helper as lh

## Import Data

In [4]:
# Read the cleaned Brickset sets file, then preview the first rows
# and print the per-column summary.
SET_FILE = '../data/brickset_set_clean.csv'
set_df = pd.read_csv(filepath_or_buffer=SET_FILE)
set_df.head(n=5)
set_df.info()

Unnamed: 0,set_no,name,theme_group,theme,subtheme,year,volume,weight,piece_cnt,minifig_cnt,inventory_url,minifig_url,price_store,price_new,price_used,rating_value,rating_votes,main_tag,set_type,packaging
0,722-1,"Universal Building Set, 7+",Basic,Basic,,1980.0,,,301.0,0.0,,,20.0,200.0,78.0,4.4,5.0,,Normal,
1,733-1,"Universal Building Set, 7+",Basic,Basic,,1980.0,,,533.0,0.0,,,39.0,566.0,109.0,4.5,2.0,,Normal,
2,744-1,"Universal Building Set with Motor, 7+",Basic,Basic,,1980.0,,,537.0,0.0,,,59.0,252.0,97.0,5.0,2.0,,Normal,
3,820-1,Red Plates Parts Pack,Basic,Basic,Supplementaries,1980.0,,,34.0,0.0,,,4.75,,,,,,Normal,
4,822-1,Blue Plates Parts Pack,Basic,Basic,Supplementaries,1980.0,,,34.0,0.0,,,4.75,29.0,5.0,,,,Normal,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13762 entries, 0 to 13761
Data columns (total 20 columns):
set_no           13762 non-null object
name             13762 non-null object
theme_group      13762 non-null object
theme            13762 non-null object
subtheme         10048 non-null object
year             13762 non-null float64
volume           2925 non-null float64
weight           3086 non-null float64
piece_cnt        10862 non-null float64
minifig_cnt      13762 non-null float64
inventory_url    6961 non-null object
minifig_url      6266 non-null object
price_store      8384 non-null float64
price_new        8532 non-null float64
price_used       7048 non-null float64
rating_value     6825 non-null float64
rating_votes     6825 non-null float64
main_tag         0 non-null float64
set_type         13762 non-null object
packaging        8923 non-null object
dtypes: float64(11), object(9)
memory usage: 2.1+ MB


In [5]:
# Keep only the columns that later steps might use;
# any column not listed here is dropped from the frame.
downstream_cols = [
    'set_no', 'name', 'theme_group', 'theme', 'subtheme',
    'year', 'volume', 'weight', 'piece_cnt', 'minifig_cnt',
    'price_store', 'price_new', 'price_used', 'rating_value', 'rating_votes',
    'main_tag', 'set_type', 'packaging',
]
set_df = set_df.loc[:, downstream_cols]
set_df.head()

Unnamed: 0,set_no,name,theme_group,theme,subtheme,year,volume,weight,piece_cnt,minifig_cnt,price_store,price_new,price_used,rating_value,rating_votes,main_tag,set_type,packaging
0,722-1,"Universal Building Set, 7+",Basic,Basic,,1980.0,,,301.0,0.0,20.0,200.0,78.0,4.4,5.0,,Normal,
1,733-1,"Universal Building Set, 7+",Basic,Basic,,1980.0,,,533.0,0.0,39.0,566.0,109.0,4.5,2.0,,Normal,
2,744-1,"Universal Building Set with Motor, 7+",Basic,Basic,,1980.0,,,537.0,0.0,59.0,252.0,97.0,5.0,2.0,,Normal,
3,820-1,Red Plates Parts Pack,Basic,Basic,Supplementaries,1980.0,,,34.0,0.0,4.75,,,,,,Normal,
4,822-1,Blue Plates Parts Pack,Basic,Basic,Supplementaries,1980.0,,,34.0,0.0,4.75,29.0,5.0,,,,Normal,


## Filter based on different columns

In [6]:
# Filter by theme_group to get rid of baby sets, etc.:
# every row whose theme_group appears in remove_group_ls is dropped.
remove_group_ls = ['Basic', 'Educational', 'Miscellaneous', 'Other', 'Pre-school', 'Vintage themes']

# Negate the membership mask with `~` instead of comparing it to False.
set_df = set_df.loc[~set_df['theme_group'].isin(remove_group_ls)]
set_df.shape

(7118, 18)

In [7]:
# optionally also remove the groups that have a different piece-size ratio (not minifig scale)
# NOTE: this filter is currently commented out, so these groups stay in the data

# remove_group_ls = ['Constraction', 'Girls', 'Junior', 'Technical']
# set_df = set_df.loc[set_df['theme_group'].isin(remove_group_ls)==False]
# set_df.shape

In [8]:
# remove sets that aren't minifig scale

#set_df = set_df.loc[set_df['minifig_cnt']>=1]
#set_df.shape

# this cuts out 1700 sets
# it's actually ok to keep them, the piece-size ratio is comparable with minifig-scale (unlike technic, basic, etc)

In [9]:
# Count how many sets are left in each theme group after the filter above.
theme_group_counts = set_df['theme_group'].value_counts()
theme_group_counts

Licensed            1605
Modern day          1498
Action/Adventure    1068
Girls                539
Model making         513
Constraction         501
Technical            468
Historical           466
Racing               247
Junior               213
Name: theme_group, dtype: int64

In [10]:
# Restrict the frame to rows whose set_type is 'Normal':
# collections are redundant with the sets they include, and this
# also gets rid of gear (keychains, games, etc).
is_normal_set = set_df['set_type'].eq('Normal')
set_df = set_df.loc[is_normal_set]
set_df.shape

(6181, 18)

In [11]:
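# keep only the sets that come in a box (or have no packaging recorded)
# NOTE: this filter is currently commented out, so no sets are removed here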

# set_df = set_df.loc[ (set_df['packaging']=='Box') | (set_df['packaging'].isna()) ]
# set_df.shape

In [12]:
# Final column selection and ordering for feature engineering;
# columns not listed here (set_type, packaging) are dropped.
feature_cols = [
    'set_no', 'name',
    'price_store', 'price_new', 'price_used', 'rating_value', 'rating_votes',
    'theme_group', 'theme', 'subtheme', 'main_tag',
    'year', 'volume', 'weight', 'piece_cnt', 'minifig_cnt',
]
set_df = set_df.loc[:, feature_cols]
set_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6181 entries, 18 to 13613
Data columns (total 16 columns):
set_no          6181 non-null object
name            6181 non-null object
price_store     4767 non-null float64
price_new       5780 non-null float64
price_used      5489 non-null float64
rating_value    5005 non-null float64
rating_votes    5005 non-null float64
theme_group     6181 non-null object
theme           6181 non-null object
subtheme        4494 non-null object
main_tag        0 non-null float64
year            6181 non-null float64
volume          2418 non-null float64
weight          2453 non-null float64
piece_cnt       6137 non-null float64
minifig_cnt     6181 non-null float64
dtypes: float64(11), object(5)
memory usage: 820.9+ KB


## Export to csv

In [13]:
# Write the filtered sets to CSV (index column omitted), then report the
# size of the written file.
save_path = r'../data/brickset_set_filter.csv'
set_df.to_csv(save_path, index=False)
size_in_mb = os.path.getsize(save_path) / 1e6  # bytes -> millions of bytes
f'{round(size_in_mb, 2)} mb'

'0.61 mb'