In [4]:
#!/usr/bin/python
import os
import sys
import random
from PIL import Image # used for image resizing
import pickle # used for retrieving saved data
import shutil # used for rapid image copying
import collections # used for checking category sizes

In [5]:
# resize the images in a specified path and write them back to a different path

# this is slightly larger than the 224x224 dimensions needed for VGG16 which allows for data augmentation later if required
width = 256
height = 256

in_path = "C:/Users/Justin/Pictures/Lego/thumbnails/"
out_path = "C:/Users/Justin/Pictures/Lego/preprocessed/"
os.chdir(in_path)
contents = os.listdir(in_path)

def resize():
    counter = 0
    for item in contents:
        # check that the file in question is a file not a folder
        if os.path.isfile(in_path+item):
            counter +=1
            im = Image.open(in_path+item)
            imResize = im.resize((width,height), Image.ANTIALIAS)
            imResize.save(out_path + item, 'JPEG', quality=90)
    # tell us how many images we managed to resize
    print (counter)
        
resize()

14


In [6]:
# Getting back the metadata for the images
path = "C:/Users/Justin/Pictures/Lego/"
os.chdir(path)
with open('LegoDataClean.pkl', 'rb') as f:  # Python 3: open(..., 'rb')
    imported_dataset = pickle.load(f)

# check data is what we expect
print (len(imported_dataset))
print (imported_dataset[30:35])

11677
[('4793-1', 'Ogel Shark Sub', 'Alpha', 'https://images.brickset.com/sets/small/4793-1.jpg?200110291200', '2002', ['Ogel Minion Commander', 'Ogel Minion', 'Submarine', 'Underwater']), ('4794-1', 'Alpha Team Command Sub', 'Alpha', 'https://images.brickset.com/sets/small/4794-1.jpg?200112121200', '2002', ['Crunch', 'Radia', 'Mobile Base', 'Submarine', 'Underwater']), ('4795-1', 'Ogel Underwater Base and AT Sub', 'Alpha', 'https://images.brickset.com/sets/small/4795-1.jpg?200110301200', '2002', ['Dash Justice', 'Ogel Minion', 'Ogel', 'Base', 'Criminal Hideout', 'Submarine', 'Underwater']), ('4796-1', 'Ogel Mutant Squid', 'Alpha', 'https://images.brickset.com/sets/small/4796-1.jpg?200110301200', '2002', ['Ogel', 'Squid', 'Underwater']), ('4797-1', 'Ogel Mutant Killer Whale', 'Alpha', 'https://images.brickset.com/sets/small/4797-1.jpg?200110301200', '2002', ['Ogel Minion', 'Underwater'])]


In [7]:
#some of the classes may prove to be very small, we need to  find the large sets with > 100 images

# create a dictionary of categories and number of members
categories_extracted = [x[2] for x in imported_dataset]
counted_categories =collections.Counter(categories_extracted)
print (counted_categories)

Counter({'Gear': 1904, 'Duplo': 777, 'Star': 611, 'Collectable': 520, 'City': 517, 'Creator': 368, 'Promotional': 359, 'Bionicle': 336, 'Friends': 295, 'Books': 264, 'Ninjago': 253, 'Town': 252, 'Racers': 242, 'Technic': 217, 'Samsonite': 186, 'Education': 183, 'The': 163, 'Castle': 159, 'System': 158, 'LEGOLAND': 154, 'Bulk': 141, 'Legends': 138, 'Seasonal': 133, 'Miscellaneous': 126, 'Space': 125, 'Sports': 123, 'Explore': 119, 'Trains': 113, 'Fabuland': 107, 'HERO': 104, 'Marvel': 102, 'Nexo': 100, 'DC': 100, 'Mixels': 90, 'Dacta': 82, 'Belville': 77, 'Clikits': 76, 'Dimensions': 68, 'Juniors': 67, 'Bricks': 64, 'Mindstorms': 63, 'Scala': 61, 'BrickHeadz': 60, 'Advanced': 59, 'Harry': 56, 'Studios': 55, 'Classic': 50, 'Freestyle': 49, 'World': 47, 'Games': 46, 'Architecture': 44, 'Make': 43, 'Universal': 42, 'Adventurers': 42, 'Disney': 42, 'Elves': 41, 'Minecraft': 40, 'Baby': 40, 'Exo-Force': 39, 'Pirates': 39, 'Power': 37, 'Alpha': 32, 'Homemaker': 32, 'Cars': 28, 'Speed': 28, 'B

In [8]:
# shuffle up our examples to ensure that train, validation and test sets have similar distributions
random.shuffle(imported_dataset)

# then sort by set type
set_type_dataset = sorted(imported_dataset, key=lambda x: x[2])

print (set_type_dataset[30:35])

[('2947-1', 'Speedbike', 'Action', 'https://images.brickset.com/sets/small/2947-1.jpg?200012231200', '2001', ['Pullback Motor']), ('2913-1', 'Construction', 'Action', 'https://images.brickset.com/sets/small/2913-1.jpg?200012011200', '2000', ['Construction', 'Excavator', 'Front Loader', 'Tracked Vehicle']), ('2904-1', 'Motorbike', 'Action', 'https://images.brickset.com/sets/small/2904-1.jpg?200012011200', '2000', ['Motorcycle', 'Sidecar']), ('2916-1', 'MyBot', 'Action', 'https://images.brickset.com/sets/small/2916-1.jpg?200012011200', '2000', ['']), ('10184-1', 'Town Plan', 'Advanced', 'https://images.brickset.com/sets/small/10184-1.jpg?200712220504', '2008', ['Anniversary Set', 'Art Deco', 'Baseplate', 'Brick Built Tree', 'Car Wash', 'Cinema', 'Lamppost', 'Octan', 'Service Station', 'Tanker Vehicle', 'Wedding'])]


In [9]:
# create a new training set list which only contains items in a subsetted list
limited_list = ['Duplo', 'Star', 'City', 'Creator', 'Bionicle', 'Ninjago', 'Town',
                'Racers', 'Technic', 'Castle', 'System', 'LEGOLAND', 'Space', 'Sports', 
                'Explore', 'Trains', 'Fabuland', 'HERO', 'Marvel', 'DC']

In [10]:
#create a new list which only contains entries from our chosen categories
limited_dataset = []

for row in set_type_dataset:
    if row[2] in limited_list:
        limited_dataset += [row]

# check our new dataset looks plausible
print (limited_dataset[30:35])
print (len(limited_dataset))

[('1419-1', 'Nokama', 'Bionicle', 'https://images.brickset.com/sets/small/1419-1.jpg?200111171200', '2001', ['Female', 'Kabaya', 'Mata Nui (Location)', 'Polybag', 'Water']), ('7217-1', 'Duracell Bad Guy', 'Bionicle', 'https://images.brickset.com/sets/small/7217-1.jpg?201806180838', '2006', ['']), ('8926-1', 'Toa Undersea Attack', 'Bionicle', 'https://images.brickset.com/sets/small/8926-1.jpg?200707010609', '2007', ['Mahri Nui', 'The Pit']), ('70793-1', 'Skull Basher', 'Bionicle', 'https://images.brickset.com/sets/small/70793-1.jpg?201506271117', '2015', ['Bionicle Villains', 'Bull Skull Mask', 'Ccbs', 'Earth', 'Golden Mask Of Power', 'Mask Of Earth', 'Okoto', 'Skull Army']), ('8694-1', 'Krika', 'Bionicle', 'https://images.brickset.com/sets/small/8694-1.jpg?200805251204', '2008', ['Bionicle Villains', 'Brotherhood Of Makuta', 'Karda Nui', 'Makuta'])]
4937


In [11]:
# set up our keras folders

# specify base directories
source_directory = "C:/Users/Justin/Pictures/Lego/preprocessed/"
train_directory = "C:/Users/Justin/Pictures/Lego/data/train/"
validation_directory = "C:/Users/Justin/Pictures/Lego/data/validation/"
test_directory = "C:/Users/Justin/Pictures/Lego/data/test/"

# need to extract the categories to set up appropriate folders in train, test and validate
categories = []
for row in limited_dataset:
    categories += [row[2]]

categories = list(set(categories))
# check our list is correct
print (categories)

for category in categories:
    os.makedirs(os.path.dirname(train_directory+category+"/"), exist_ok=True)
    os.makedirs(os.path.dirname(test_directory+category+"/"), exist_ok=True)
    os.makedirs(os.path.dirname(validation_directory+category+"/"), exist_ok=True)

['Racers', 'Star', 'Castle', 'Trains', 'Marvel', 'Creator', 'Technic', 'Fabuland', 'Duplo', 'Space', 'Sports', 'Town', 'City', 'System', 'LEGOLAND', 'DC', 'Bionicle', 'Explore', 'HERO', 'Ninjago']


In [12]:
# load the various classes of images and place them in train validation and test folders in an 8:1:1 ratio
import shutil 
row_number = 0

def assign_image(image_name, source, target):
    shutil.copyfile(source + image_name, target + image_name)
    #image = Image.open(source_directory+image_name)
    #image.save(target_directory + image_name, 'JPEG', quality=90)

# cycle over dataset rows
for row in limited_dataset:
    # assign 1/10 to validations, 1/10 to test and the rest to train
    if row_number % 10 == 0:
        # note generators for filename AND also for destination directory
        assign_image(row[0] + "_" + row[2] + "_" + row[4] + '.jpg', source_directory, validation_directory+row[2]+"/")
    elif (row_number - 1) % 10 == 0:
        assign_image(row[0] + "_" + row[2] + "_" + row[4] + '.jpg', source_directory, test_directory+row[2]+"/")
    else:
        assign_image(row[0] + "_" + row[2] + "_" + row[4] + '.jpg', source_directory, train_directory+row[2]+"/")
    row_number += 1

In [13]:
# finally lets pickle the subset we intend to use in case it comes in handy
import pickle

path = "C:/Users/Justin/Pictures/Lego/"
os.chdir(path)

with open('LegoDataTop20Categories.pkl', 'wb') as file:  # Python 3: open(..., 'wb')
    pickle.dump(limited_dataset, file)