In [1]:
import numpy as np
import pandas as pd
import io
import bson                       # this is installed with the pymongo package
import matplotlib.pyplot as plt
from scipy.misc import imread, imsave
import multiprocessing as mp      # will come in handy due to the size of the data
import os.path

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [None]:
# Directory that contains train.bson; extracted images are written to
# <DATASET_PATH>/Train. Set the KAGGLE_DATASET_PATH environment variable to
# point somewhere else without editing the notebook.
# os.path.join(..., '') guarantees a trailing separator, which the code below
# relies on when it builds paths with plain string concatenation.
DATASET_PATH = os.path.join(
    os.environ.get('KAGGLE_DATASET_PATH', '/media/rs/0E06CD1706CD0127/Kapok/kaggle/'), '')

# Directory where every worker process writes its '<pid>.txt' list of saved
# image file names. Override with the KAGGLE_FILESTR_PATH environment variable.
FILESTR_PATH = os.path.join(
    os.environ.get('KAGGLE_FILESTR_PATH', '/home/kapok/kaggle/'), '')

In [None]:
# Number of worker processes in the pool; also used as the capacity of the
# work queue and as the number of stop sentinels sent at the end.
NCORE = 8
# Shared integer (typecode 'i') starting at 0. Workers read and increment it
# while holding its lock, so each saved image gets its own sequence number.
IMAGE_COUNT = mp.Value('i', 0)

# A Manager-backed dict proxy rather than a plain dict: the worker processes
# write product_id -> category_id into it, and the parent converts it back to
# a normal dict once the workers are done.
prod_to_category = mp.Manager().dict() # proxy object, not a plain dict

def process(q, iolock, count, dataFold='Train'):
    """Worker loop: pull product documents off a queue and save their images.

    Runs until a ``None`` sentinel is received. For every document it records
    ``_id -> category_id`` in the module-level ``prod_to_category`` mapping,
    decodes each entry of ``imgs`` and saves it as a JPEG under
    ``DATASET_PATH/<dataFold>``. The name of every saved file is also appended
    to a per-process text file ``FILESTR_PATH + '<pid>.txt'``.

    Args:
        q: queue yielding product documents (mappings with ``_id``,
            ``category_id`` and ``imgs`` keys) or ``None`` to stop.
        iolock: lock held while an image is written to disk.
        count: shared counter exposing ``get_lock()`` and ``value``; it supplies
            the running image number used in file names.
        dataFold: name of the sub-folder of ``DATASET_PATH`` to save into.
    """
    worker_pid = mp.current_process().pid
    target_dir = os.path.join(DATASET_PATH, dataFold)

    with open(FILESTR_PATH + str(worker_pid) + '.txt', 'w') as name_log:
        while True:
            doc = q.get()
            if doc is None:
                # Sentinel from the producer: no more work for this worker.
                break

            product_id = doc['_id']
            category_id = doc['category_id']
            prod_to_category[product_id] = category_id

            for pic in doc['imgs']:
                picture = imread(io.BytesIO(pic['picture']))
                # do something with the picture, etc

                # Reserve the next global image number atomically.
                with count.get_lock():
                    image_no = count.value
                    count.value += 1

                file_name = 'id_p{:0>8d}_n{:0>9d}.jpg'.format(product_id, image_no)
                name_log.write(file_name + '\n')
                with iolock:
                    imsave(os.path.join(target_dir, file_name), picture)

# Bounded queue: the producer below blocks once NCORE documents are pending,
# which keeps memory use flat while streaming the BSON file.
q = mp.Queue(maxsize=NCORE)
iolock = mp.Lock()

# Create the output directories BEFORE starting the pool: every worker opens
# its '<pid>.txt' file in FILESTR_PATH as soon as it starts, and saves images
# into DATASET_PATH/Train. exist_ok=True makes re-running the cell safe.
os.makedirs(os.path.join(DATASET_PATH, 'Train'), exist_ok=True)
os.makedirs(FILESTR_PATH, exist_ok=True)

# The worker loop runs inside the pool initializer, so each of the NCORE
# processes consumes from q until it receives a None sentinel.
pool = mp.Pool(NCORE, initializer=process, initargs=(q, iolock, IMAGE_COUNT, 'Train'))

try:
    # process the file; the context manager closes train.bson when done
    with open(DATASET_PATH + 'train.bson', 'rb') as bson_file:
        data = bson.decode_file_iter(bson_file)
        for c, d in enumerate(data):
            q.put(d)  # blocks until q below its max size
finally:
    # tell workers we're done -- also on error/interrupt, so they do not stay
    # blocked on q.get() forever
    for _ in range(NCORE):
        q.put(None)
    pool.close()
    pool.join()

# convert back to normal dictionary
prod_to_category = dict(prod_to_category)

# One row per product, indexed by '_id', with a single 'category_id' column.
prod_to_category = pd.DataFrame.from_dict(prod_to_category, orient='index')
prod_to_category = prod_to_category.rename(columns={0: 'category_id'})
prod_to_category.index.name = '_id'

In [None]:
#prod_to_category.head()
#plt.imshow(picture);