In [None]:
import os
import pickle

from PIL import Image
from PIL import ImageOps

from urllib.request import urlretrieve
import zipfile

In [None]:
# Set target path
# The dataset lives in an "omniglot/" folder under the current working
# directory. The trailing slash is deliberate: later cells build file names
# by plain string concatenation (tpath + '<name>.pickle').
dataset_dirname = 'omniglot/'
tpath = os.path.join(os.getcwd(), dataset_dirname)

In [None]:
# Download and extract omniglot
origin_folder = "https://github.com/brendenlake/omniglot/raw/master/python/"

fnames = ["images_evaluation.zip", "images_background.zip"]

# Create the target directory once, at the exact location the later cells
# read from (tpath), instead of re-checking a relative path on every iteration.
os.makedirs(tpath, exist_ok=True)

for fname in fnames:
    # URLs always use forward slashes; os.path.join would insert a backslash
    # on Windows and produce an invalid URL.
    origin = origin_folder.rstrip('/') + '/' + fname
    fpath = os.path.join(tpath, fname)

    # Skip the download when a complete archive is already on disk so that
    # re-running the notebook is cheap. A truncated file left behind by an
    # interrupted download fails the is_zipfile check and is fetched again.
    if not zipfile.is_zipfile(fpath):
        urlretrieve(origin, fpath)

    # The context manager closes the archive handle once extraction is done.
    with zipfile.ZipFile(fpath) as archive:
        archive.extractall(tpath)

In [None]:
# Open all images and collect them in a nested list
def _sorted_entries(folder, want_dirs):
    """Return the sorted full paths of the visible entries in ``folder``.

    Hidden entries (names starting with a dot) are dropped. When ``want_dirs``
    is true only directories are kept, otherwise only regular files are kept.
    """
    keep = os.path.isdir if want_dirs else os.path.isfile
    entries = []
    for entry_name in sorted(os.listdir(folder)):
        entry_path = os.path.join(folder, entry_name)
        if not entry_name.startswith('.') and keep(entry_path):
            entries.append(entry_path)
    return entries


def load_glots(path):
    """Load every glyph image found below ``path`` into nested lists.

    The directory is walked as ``path/<alphabet>/<character>/<image file>``,
    visiting entries in sorted name order at every level. Each image is
    converted to 8-bit grayscale ('L'), inverted, and then converted to
    1-bit mode ('1').

    Entries that do not fit this layout -- plain files where a directory is
    expected, sub-directories where an image file is expected, and hidden
    files such as ``.DS_Store`` -- are skipped rather than raising.

    Args:
        path: Root directory containing one sub-directory per alphabet.

    Returns:
        A tuple ``(glots, glot_locs)`` of two nested lists with identical
        structure, indexed as ``[alphabet][character][instance]``. ``glots``
        holds the converted PIL images and ``glot_locs`` holds the file path
        each image was read from.
    """
    glots = []
    glot_locs = []

    for alph in _sorted_entries(path, want_dirs=True):
        alph_chars = []
        alph_char_locs = []

        for char in _sorted_entries(alph, want_dirs=True):
            glot_insts = _sorted_entries(char, want_dirs=False)
            glot_instances = []

            for glot_inst in glot_insts:
                # convert() returns a new, fully loaded image, so the source
                # file can be closed as soon as the with-block exits.
                with Image.open(glot_inst) as raw_im:
                    tmp_im = raw_im.convert('L')
                    tmp_im = ImageOps.invert(tmp_im)
                    tmp_im = tmp_im.convert('1')

                glot_instances.append(tmp_im)

            # Images and their locations stay index-aligned.
            alph_chars.append(glot_instances)
            alph_char_locs.append(glot_insts)

        glots.append(alph_chars)
        glot_locs.append(alph_char_locs)

    return glots, glot_locs

In [None]:
# Run image opening and collection function
# images_background and images_evaluation are the two folders extracted from
# the downloaded archives; each is loaded into its own pair of nested lists.
background_dir = os.path.join(tpath, 'images_background/')
evaluation_dir = os.path.join(tpath, 'images_evaluation/')

glots_train, glot_locs_train = load_glots(path=background_dir)
glots_eval, glot_locs_eval = load_glots(path=evaluation_dir)

In [None]:
# Write dataset to pickle files
if not os.path.exists(tpath):
    os.makedirs(tpath)

# File name -> payload, in the order the files are written:
#   train: every alphabet from images_background
#   eval:  the first 10 alphabets from images_evaluation
#   test:  all alphabets after the first 10 from images_evaluation
pickle_outputs = (
    ('glots_train.pickle', glots_train),
    ('glot_locs_train.pickle', glot_locs_train),
    ('glots_eval.pickle', glots_eval[:10]),
    ('glot_locs_eval.pickle', glot_locs_eval[:10]),
    ('glots_test.pickle', glots_eval[10:]),
    ('glot_locs_test.pickle', glot_locs_eval[10:]),
)

for pickle_name, payload in pickle_outputs:
    # tpath ends with a slash, so plain concatenation yields the full path.
    with open(tpath + pickle_name, 'wb') as fp:
        pickle.dump(payload, fp)