[https://www.kaggle.com/humananalog/keras-generator-for-reading-directly-from-bson](https://www.kaggle.com/humananalog/keras-generator-for-reading-directly-from-bson)

In [1]:
import os, sys, math, io
import numpy as np
import pandas as pd
import multiprocessing as mp
import bson
import struct
import keras

%matplotlib inline
import matplotlib.pyplot as plt

import keras
from keras.preprocessing.image import load_img, img_to_array
import tensorflow as tf

from collections import defaultdict
from tqdm import *

from subprocess import check_output


Using TensorFlow backend.


In [2]:
# List the input files with os.listdir rather than shelling out to `ls`, so the
# cell also runs on systems that have no `ls` executable.
print("\n".join(sorted(os.listdir("data"))))

category_names.7z
category_names.csv
sample_submission.7z
sample_submission.csv
train_example.bson



In [3]:
keras.__version__, tf.__version__

('2.0.8', '1.2.1')

In [4]:
data_dir = "data"

In [5]:
# Path of the BSON file that is indexed below (the small example file).
train_bson_path = os.path.join(data_dir, "train_example.bson")
# Number of products expected in that file; it is passed to read_bson as
# num_records, where it only sets the progress-bar total.
num_train_products = 82

In [6]:
# Load the category names, using category_id as the index so that a category
# can be looked up directly by its id.
categories_path = os.path.join(data_dir, "category_names.csv")
# NOTE(review): the CSV is decoded as mac_latin2 — confirm this matches the file's real encoding.
categories_df = pd.read_csv(categories_path, index_col="category_id", encoding="mac_latin2")

In [7]:
categories_df.head(5)

Unnamed: 0_level_0,category_level1,category_level2,category_level3
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1000021794,ABONNEMENT / SERVICES,CARTE PREPAYEE,CARTE PREPAYEE MULTIMEDIA
1000012764,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI FUMEUR
1000012776,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI VELO - ABRI MOTO
1000012768,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,FONTAINE A EAU
1000012755,AMENAGEMENT URBAIN - VOIRIE,SIGNALETIQUE,PANNEAU D'INFORMATION EXTERIEUR


In [8]:
# Give every category_id a dense integer position (0 .. N-1, in file order).
# These positions are what the labels get one-hot encoded with.
categories_df["category_idx"] = pd.Series(
    data=range(categories_df.shape[0]),
    index=categories_df.index,
)
categories_df.head(5)

Unnamed: 0_level_0,category_level1,category_level2,category_level3,category_idx
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000021794,ABONNEMENT / SERVICES,CARTE PREPAYEE,CARTE PREPAYEE MULTIMEDIA,0
1000012764,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI FUMEUR,1
1000012776,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,ABRI VELO - ABRI MOTO,2
1000012768,AMENAGEMENT URBAIN - VOIRIE,AMENAGEMENT URBAIN,FONTAINE A EAU,3
1000012755,AMENAGEMENT URBAIN - VOIRIE,SIGNALETIQUE,PANNEAU D'INFORMATION EXTERIEUR,4


In [9]:
def make_category_tables():
    """Build the lookup tables between category ids and integer indices.

    Reads the module-level ``categories_df``, whose index holds the
    ``category_id`` values and whose ``category_idx`` column holds the integer
    index assigned to each category.

    Returns:
        tuple: ``(cat2idx, idx2cat)`` where ``cat2idx`` maps
        category_id -> category_idx and ``idx2cat`` maps the other way.
    """
    # Read the column by name instead of by tuple position: a positional
    # index silently picks the wrong column as soon as categories_df gains,
    # loses or reorders a column.
    pairs = list(categories_df["category_idx"].items())
    cat2idx = {category_id: category_idx for category_id, category_idx in pairs}
    idx2cat = {category_idx: category_id for category_id, category_idx in pairs}
    return cat2idx, idx2cat

In [10]:
cat2idx, idx2cat = make_category_tables()

In [11]:
# Test if it works:
cat2idx[1000012755], idx2cat[4]

(4, 1000012755)

In [13]:
# Read the BSON files.
# We store the offset and length of every item, which allows random access to the items later.

num_dicts = 7069896 # according to data page (presumably the item count of the full training set — TODO confirm); not used in the cells shown here

length_size = 4 # number of bytes at the start of each item that encode the item's length

def read_bson(bson_path, num_records, with_categories):
    """Index a BSON file by recording where every document lives.

    The file is walked document by document. Each document begins with a
    little-endian int32 holding the document's total length in bytes (the
    prefix itself included), so the start of the next document is always
    ``offset + length``.

    Args:
        bson_path: Path of the BSON file to scan.
        num_records: Expected number of documents; only used as the total of
            the progress bar.
        with_categories: When true, each document's ``category_id`` is stored
            in an extra ``category_id`` column.

    Returns:
        pandas.DataFrame indexed by ``product_id`` (the document's ``_id``),
        sorted by that index, with the columns ``num_imgs``, ``offset`` and
        ``length`` (plus ``category_id`` when requested). An empty file gives
        an empty frame that still has these columns.

    Raises:
        struct.error: If the file ends in the middle of a length prefix.
        ValueError: If a length prefix is smaller than the prefix itself.
        AssertionError: If the file ends in the middle of a document.
    """
    # Size of the length prefix, derived from the format string so that the
    # function does not depend on a module-level constant.
    prefix_size = struct.calcsize("<i")

    columns = ["num_imgs", "offset", "length"]
    if with_categories:
        columns.append("category_id")

    rows = {}
    with open(bson_path, "rb") as f, tqdm(total=num_records) as pbar:
        offset = 0
        while True:
            prefix = f.read(prefix_size)
            if len(prefix) == 0:
                break  # clean end of file

            # Decode item length:
            length = struct.unpack("<i", prefix)[0]
            if length < prefix_size:
                raise ValueError(
                    "invalid item length %d at offset %d" % (length, offset))

            # The prefix is part of the document, so only the remainder still
            # has to be read; no seeking back and forth is needed.
            item_data = prefix + f.read(length - prefix_size)
            assert len(item_data) == length, \
                "truncated item at offset %d" % offset

            item = bson.BSON.decode(item_data)
            product_id = item["_id"]

            row = [len(item["imgs"]), offset, length]
            if with_categories:
                row.append(item["category_id"])
            rows[product_id] = row

            offset += length
            pbar.update()

    # Passing the columns to the constructor (instead of assigning them
    # afterwards) keeps an empty file from failing with a length mismatch.
    df = pd.DataFrame(
        list(rows.values()),
        index=pd.Index(list(rows.keys()), name="product_id"),
        columns=columns,
    )
    return df.sort_index()

In [14]:
%time train_offsets_df = read_bson(train_bson_path, num_records=num_train_products, with_categories=True)

100%|██████████| 82/82 [00:00<00:00, 9313.10it/s]

CPU times: user 4.99 ms, sys: 3.3 ms, total: 8.29 ms
Wall time: 13.9 ms





In [15]:
train_offsets_df.head()

Unnamed: 0_level_0,num_imgs,offset,length,category_id
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,0,6979,1000010653
1,1,6979,7318,1000010653
2,1,14297,5455,1000004079
3,1,19752,4580,1000004141
4,1,24332,6346,1000015539


In [16]:
train_offsets_df.to_csv("train_offsets.csv")

In [17]:
# How many products?
len(train_offsets_df)

82

In [18]:
# How many categories?
len(train_offsets_df["category_id"].unique())

36

In [19]:
# How many images in total?
train_offsets_df["num_imgs"].sum()

110

In [21]:
#%time test_offsets_df = read_bson(test_bson_path, num_records=num_test_products, with_categories=False)

In [None]:
#test_offsets_df.head()

In [None]:
#test_offsets_df.to_csv("test_offsets.csv")

In [None]:
def make_val_set(df, split_percentage=0.2, drop_percentage=0., seed=None):
    """Split the products into per-image training and validation tables.

    For every category the products are first thinned out at random
    (``drop_percentage``), then a random share of the remaining products
    (``split_percentage``) is assigned to the validation set. All images of a
    product end up on the same side of the split.

    Args:
        df: Frame indexed by product id with at least the columns
            ``category_id`` and ``num_imgs``.
        split_percentage: Fraction of each category's kept products that goes
            to the validation set.
        drop_percentage: Fraction of each category's products that is
            discarded before splitting. Sizes are rounded down, so a small
            category can lose all of its products.
        seed: Optional seed for a private random generator, making the split
            repeatable. With the default ``None`` the global ``np.random``
            state is used.

    Returns:
        tuple: ``(train_df, val_df)``, each with one row per image and the
        columns ``product_id``, ``category_idx`` and ``img_idx``.

    Relies on the module-level ``cat2idx`` mapping to translate category ids
    into category indices.
    """
    def fraction_count(total, fraction):
        # The tiny epsilon absorbs float error before truncating; without it
        # 10 * (1. - 0.9) evaluates to 0.9999999999999998 and becomes 0.
        return int(total * fraction + 1e-9)

    rng = np.random if seed is None else np.random.RandomState(seed)

    # Find the product_ids for each category. The column is read by name so
    # that the function does not depend on the column order of df.
    category_dict = defaultdict(list)
    for product_id, category_id in tqdm(df["category_id"].items(), total=len(df)):
        category_dict[category_id].append(product_id)

    train_list = []
    val_list = []
    with tqdm(total=len(df)) as pbar:
        for category_id, product_ids in category_dict.items():
            category_idx = cat2idx[category_id]

            # Randomly remove products to make the dataset smaller.
            keep_size = fraction_count(len(product_ids), 1. - drop_percentage)
            if keep_size < len(product_ids):
                product_ids = rng.choice(product_ids, keep_size, replace=False)

            # Randomly choose the products that become part of the validation
            # set. A set gives constant-time membership checks below.
            val_size = fraction_count(len(product_ids), split_percentage)
            if val_size > 0:
                val_ids = set(rng.choice(product_ids, val_size, replace=False))
            else:
                val_ids = set()

            # Create a new row for each image. The side of the split is
            # decided once per product, not once per image.
            for product_id in product_ids:
                target = val_list if product_id in val_ids else train_list
                for img_idx in range(df.loc[product_id, "num_imgs"]):
                    target.append([product_id, category_idx, img_idx])
                pbar.update()

    columns = ["product_id", "category_idx", "img_idx"]
    train_df = pd.DataFrame(train_list, columns=columns)
    val_df = pd.DataFrame(val_list, columns=columns)
    return train_df, val_df

In [None]:
# Build the per-image tables: 90% of every category's products are dropped
# first, then 20% of what is left is held out for validation. Sizes are rounded
# down, so a category with fewer than 10 products disappears completely.
# The sampling is random and no seed is set in the cells shown here.
train_images_df, val_images_df = make_val_set(
    train_offsets_df,
    split_percentage=0.2,
    drop_percentage=0.9,
)