This notebook runs in Google Colab (to use its free GPU) and detects and counts the products visible in an image of grocery shelves.

The dataset was published by Gul Varol and Ridvan S. Kuzu alongside their 2014 paper "Toward Retail Product Recognition on Grocery Shelves". It contains 354 images of tobacco shelves collected from ~40 locations with 4 cameras.

The dataset can be downloaded from [here](https://github.com/gulvarol/grocerydataset).

Notebook contains:


1.   Installation of Tensorflow object detection API in Google Colab
2.   Data Preparation
3.   Training
4.   Inference





In [0]:
%tensorflow_version 1.x  ### we will be using Tensorflow 1.x because Tensorflow object detection API does not support training in Tensorflow 2.x

In [0]:
!git clone --quiet https://github.com/tensorflow/models.git ## Clone the TensorFlow models repository, which contains the Object Detection API

## Prerequisites for the TensorFlow Object Detection API
!apt-get install -qq protobuf-compiler python-tk  
!pip install -q Cython contextlib2 pillow lxml matplotlib PyDrive
!pip install -q pycocotools

In [0]:
%%bash
cd models/research/
protoc object_detection/protos/*.proto --python_out=.  ## Compile the .proto definitions into .py modules in the protos folder.

In [0]:
## Testing the Tensorflow Object Detection model, if it occurs any error there is something wrong.
import os
os.environ['PYTHONPATH'] += ':/content/models/research/:/content/models/research/slim/'

!python /content/models/research/object_detection/builders/model_builder_test.py

In [0]:
%%bash 
cd models/research
pip install .  ## Install the object_detection package from the cloned repository

In [0]:
##importing libraries
from object_detection.utils import ops as utils_ops
from object_detection.utils import label_map_util
from object_detection.utils import visualization_utils as vis_util
from object_detection.utils import dataset_util

In [0]:
### Download the two dataset archives into /content
%cd /content
! wget https://github.com/gulvarol/grocerydataset/releases/download/1.0/GroceryDataset_part1.tar.gz
! wget https://github.com/gulvarol/grocerydataset/releases/download/1.0/GroceryDataset_part2.tar.gz

In [0]:
##Creating folder to store downloaded datasets
! mkdir 'data'
! mkdir 'data/images'

In [0]:
## Extract the downloaded dataset archives into the "data/images" folder.
!tar -xvf  'GroceryDataset_part1.tar.gz'  -C 'data/images'
!tar -xvf  'GroceryDataset_part2.tar.gz'  -C 'data/images'

In [0]:
## importing libraries
import cv2
import pandas as pd
from matplotlib import pyplot as plt
import os
import io
import numpy as np
from sklearn.model_selection import train_test_split
from PIL import Image
import tensorflow as tf
from collections import namedtuple, OrderedDict
%matplotlib inline

Data Preparation


In [0]:
## Paths used by the data-preparation cells
data_path = 'data/'  # root folder; the pickled dataframes and TFRecord files are written here
shelf_images = 'data/images/ShelfImages/'  # full shelf photos (*.JPG)
shelf_images_train = 'data/images/ShelfImages1/ShelfImages/train'  # not referenced by the cells shown in this notebook
shelf_images_test = 'data/images/ShelfImages1/ShelfImages/test'  # not referenced by the cells shown in this notebook
product_images = 'data/images/ProductImagesFromShelves/'  # product crops (*.png), one sub-folder per category 0..10
cropped_path_train = '/content/data/images/ShelfImages1/train'  # not referenced by the cells shown in this notebook
cropped_path_test = '/content/data/images/ShelfImages1/eval'  # not referenced by the cells shown in this notebook
cropped_path = 'data/cropped/'  # random crops are written to its train/ and eval/ sub-folders

In [0]:
## Build a dataframe with one row per shelf photo.
# The ids are taken from fixed positions in the file name, e.g. for
# 'C3_P02_N1_S2_2.JPG': shelf_id = 'C3_P02', planogram_id = 'N1_S2_2'.
jpg_files = [name for name in os.listdir(shelf_images) if name.endswith('JPG')]
photos_df = pd.DataFrame(
    data=[(name, name[:6], name[7:14]) for name in jpg_files],
    columns=['file', 'shelf_id', 'planogram_id'],
)
photos_df.head()

In [0]:
## Build a dataframe with one row per annotated product.
# Each product png sits in a sub-folder named after its category (0..10). Its
# file name starts with the 18-character name of the shelf photo it came from,
# followed by '_' and the box as xmin_ymin_w_h.
products_df = pd.DataFrame(
    [
        [png[:18], png[:6], png[7:14], category]
        + [int(value) for value in png[19:-4].split('_')]
        for category in range(11)
        for png in os.listdir(product_images + str(category))
        if png.endswith('png')
    ],
    columns=['file', 'shelf_id', 'planogram_id',
             'category', 'xmin', 'ymin', 'w', 'h'],
)
# derive the bottom-right corner from the top-left corner plus width / height
products_df = products_df.assign(
    xmax=products_df['xmin'] + products_df['w'],
    ymax=products_df['ymin'] + products_df['h'],
)
products_df.head()

In [0]:
# Distinct shelves, in sorted order. Iterating a set of strings yields a
# hash-dependent order that can change between kernel restarts, which would make
# the seeded split below non-reproducible; sorting pins the order.
shelves = sorted(set(photos_df['shelf_id'].values))
# Split by shelf (not by photo), so every photo of a shelf lands on the same side.
shelves_train, shelves_validation = train_test_split(
    shelves, test_size=0.3, random_state=6)
# set for O(1) membership tests inside is_train
_train_shelf_ids = set(shelves_train)


def is_train(shelf_id):
    """Return True when `shelf_id` belongs to the training shelves."""
    return shelf_id in _train_shelf_ids


# mark all records in both data frames with the is_train flag
photos_df['is_train'] = photos_df.shelf_id.apply(is_train)
products_df['is_train'] = products_df.shelf_id.apply(is_train)

In [0]:
# Number of products per category, split into train / validation counts.
# Category 0 is left out of the chart.
df = (
    products_df.loc[products_df.category != 0]
    .groupby(['category', 'is_train'])['category']
    .count()
    .unstack('is_train')
    .fillna(0)
)
df.plot(kind='barh', stacked=True)

In [0]:
# Persist both dataframes under data_path so later cells can reload them
photos_df.to_pickle(f'{data_path}photos.pkl')
products_df.to_pickle(f'{data_path}products.pkl')

In [0]:
# Reload the dataframes written by the previous cell (pickles produced by this notebook)
photos = pd.read_pickle(f'{data_path}photos.pkl')
products = pd.read_pickle(f'{data_path}products.pkl')

Crop up to 6 random squares (`N_CROP_TRIALS`) from each image and resize each crop, keeping its aspect ratio, so that its longer side is 1000 px (`CROP_SIZE`).


In [0]:
N_CROP_TRIALS = 6  # number of random crops attempted per shelf photo
CROP_SIZE = 1000  # target size in pixels used to scale each crop

In [0]:
# returns a random integer in [s, f] (both ends included)
def rand_between(s, f):
    """Return a random integer drawn from the closed interval [s, f].

    np.random.randint excludes its upper bound, so `f + 1` is passed to make
    `f` itself reachable, as the documented interval promises.

    Raises:
        ValueError: (from NumPy) when s > f.
    """
    if s == f:
        return s
    return np.random.randint(s, f + 1)

In [0]:
# Build the cropped train / eval images and the matching bounding-box records.
train_products, eval_products = [], []

# cv2.imwrite does not create missing folders (it just returns False), so make
# sure both output folders exist before any crop is written.
for crop_subdir in ('train/', 'eval/'):
    os.makedirs(f'{cropped_path}{crop_subdir}', exist_ok=True)

written_crops = set()
# The loop variable is deliberately not called `is_train`, so it does not
# overwrite the is_train() helper defined in an earlier cell.
for img_file, in_train_set in photos[['file', 'is_train']].values:
    img = cv2.imread(f'{shelf_images}{img_file}')
    if img is None:
        # cv2.imread reports failure by returning None instead of raising
        raise FileNotFoundError(f'Could not read shelf image: {shelf_images}{img_file}')
    img_h, img_w = img.shape[:2]
    # annotations of this photo only; filtered once instead of once per trial
    img_products = products[products.file == img_file]
    for n in range(N_CROP_TRIALS):
        # randomly pick a square whose side is between 300 px and the longer image side
        c_size = rand_between(300, max(img_h, img_w))
        x0 = rand_between(0, max(0, img_w - c_size))
        y0 = rand_between(0, max(0, img_h - c_size))
        # clamp to the image; the crop is not square when c_size exceeds a side
        x1 = min(img_w, x0 + c_size)
        y1 = min(img_h, y0 + c_size)
        # products totally inside crop rectangle
        crop_products = img_products[
            (img_products.xmin > x0) & (img_products.xmax < x1) &
            (img_products.ymin > y0) & (img_products.ymax < y1)]
        # no products inside crop rectangle? cropping trial failed...
        if len(crop_products) == 0:
            continue
        # name the crop after its source photo and rectangle
        crop_img_file = f'{img_file[:-4]}{x0}_{y0}_{x1}_{y1}.JPG'
        if crop_img_file in written_crops:
            # the same rectangle was drawn twice: skip it, otherwise its boxes
            # would be recorded twice for one image file
            continue
        written_crops.add(crop_img_file)
        # crop, then scale by one factor for both axes so the aspect ratio is
        # kept and the longer side becomes CROP_SIZE
        crop = img[y0:y1, x0:x1]
        h, w = crop.shape[:2]
        ratio = min(CROP_SIZE/h, CROP_SIZE/w)
        crop = cv2.resize(crop, (0,0), fx=ratio, fy=ratio)
        # guard against a rounding overshoot of the resized side
        crop = crop[0:CROP_SIZE, 0:CROP_SIZE]
        h, w = crop.shape[:2]
        # add crop inner products to train_products or eval_products list
        target_products = train_products if in_train_set else eval_products
        for xmin, ymin, xmax, ymax in \
                crop_products[['xmin', 'ymin', 'xmax', 'ymax']].values:
            # shift into crop coordinates, then apply the resize factor
            xmin, xmax, ymin, ymax = [
                int(np.round(e * ratio))
                for e in [xmin - x0, xmax - x0, ymin - y0, ymax - y0]]
            target_products.append(
                {'filename': crop_img_file, 'class':'pack',
                 'width':w, 'height':h,
                 'xmin':xmin, 'ymin':ymin, 'xmax':xmax, 'ymax':ymax})
        # save crop to eval or train folder
        subpath = 'train/' if in_train_set else 'eval/'
        crop_path = f'{cropped_path}{subpath}{crop_img_file}'
        if not cv2.imwrite(crop_path, crop):
            raise IOError(f'Could not write crop image: {crop_path}')

In [0]:
# One row per bounding box, indexed by the file name of the crop it belongs to
train_df = pd.DataFrame(train_products).set_index('filename')
eval_df = pd.DataFrame(eval_products).set_index('filename')

In [0]:
# Number of annotated boxes (rows, not images) in each split
print(len(train_df))
print(len(eval_df))

In [0]:
## Creating tf.record for training and testing

def class_text_to_int(row_label):
    """Map a class name to its integer id in the label map.

    Only 'pack' (id 1) is known; any other label yields None.
    """
    return 1 if row_label == 'pack' else None


def split(df, group):
    """Group `df` by `group` and return one record per distinct key.

    Each record is a namedtuple `data(filename, object)`: `filename` is the
    group key and `object` is the sub-frame holding that key's rows.
    """
    data = namedtuple('data', ['filename', 'object'])
    grouped = df.groupby(group)
    records = []
    for key in grouped.groups:
        records.append(data(key, grouped.get_group(key)))
    return records


def create_tf_example(group, path):
    """Build a `tf.train.Example` for one image and all of its boxes.

    Args:
        group: record with `.filename` (image file name) and `.object`
            (DataFrame with `xmin`, `ymin`, `xmax`, `ymax` and `class`
            columns), as produced by `split`.
        path: folder that contains the image file.

    Returns:
        A `tf.train.Example` holding the encoded image bytes, the image size,
        the box corners divided by the image width / height, and the class
        names and ids.
    """
    image_path = os.path.join(path, f'{group.filename}')
    with tf.gfile.GFile(image_path, 'rb') as image_file:
        encoded_jpg = image_file.read()
    # the image is opened only to read its real pixel size
    width, height = Image.open(io.BytesIO(encoded_jpg)).size

    objects = group.object
    labels = list(objects['class'])
    name_bytes = group.filename.encode('utf8')

    features = {
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(name_bytes),
        'image/source_id': dataset_util.bytes_feature(name_bytes),
        'image/encoded': dataset_util.bytes_feature(encoded_jpg),
        'image/format': dataset_util.bytes_feature(b'jpg'),
        # box corners as fractions of the image size
        'image/object/bbox/xmin': dataset_util.float_list_feature(
            (objects['xmin'] / width).tolist()),
        'image/object/bbox/xmax': dataset_util.float_list_feature(
            (objects['xmax'] / width).tolist()),
        'image/object/bbox/ymin': dataset_util.float_list_feature(
            (objects['ymin'] / height).tolist()),
        'image/object/bbox/ymax': dataset_util.float_list_feature(
            (objects['ymax'] / height).tolist()),
        'image/object/class/text': dataset_util.bytes_list_feature(
            [label.encode('utf8') for label in labels]),
        'image/object/class/label': dataset_util.int64_list_feature(
            [class_text_to_int(label) for label in labels]),
    }
    return tf.train.Example(features=tf.train.Features(feature=features))

In [0]:
def convert_to_tf_records(images_path, examples, dst_file):
    """Write one TFRecord entry per image listed in `examples`.

    Args:
        images_path: folder that contains the image files.
        examples: DataFrame of box annotations; it is grouped by 'filename'.
        dst_file: path of the TFRecord file to create.
    """
    writer = tf.python_io.TFRecordWriter(dst_file)
    try:
        for group in split(examples, 'filename'):
            tf_example = create_tf_example(group, images_path)
            writer.write(tf_example.SerializeToString())
    finally:
        # close the writer even when building an example fails, so the file
        # handle is not leaked
        writer.close()

In [0]:
# Serialize both splits into TFRecord files under data_path.
# Note that the eval split is stored under the name test.record.
convert_to_tf_records(f'{cropped_path}/train/', train_df, f'{data_path}train.record')
convert_to_tf_records(f'{cropped_path}/eval/', eval_df, f'{data_path}test.record')

In [0]:
## Creating training folder and we will put all the files in it. First, we put ssd_mobilenet COCO model in it. Second, we put ssd_mobilenet_v1_coco.config
## Third we put label_map.pbtxt in it. Lastly, We put tf.record of train and test in it.
! mkdir '/content/models/research/object_detection/training'

In [0]:
! wget http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_coco_2017_11_17.tar.gz ## Download the pre-trained ssd_mobilenet COCO model; any other object detection model can be used instead

In [0]:
## Extract the pre-trained model into the training folder
!tar -xvf  'ssd_mobilenet_v1_coco_2017_11_17.tar.gz'  -C '/content/models/research/object_detection/training' 

In [0]:
## Move the train and test TFRecord files into the training folder
!mv /content/data/test.record /content/models/research/object_detection/training
!mv /content/data/train.record /content/models/research/object_detection/training

In [0]:
## Copy the sample ssd_mobilenet_v1_coco.config file into the training folder
!cp /content/models/research/object_detection/samples/configs/ssd_mobilenet_v1_coco.config /content/models/research/object_detection/training

In [0]:
# Create the label map for object detection. There is a single label, "pack";
# with more than one label, add one `item { ... }` entry per label.

l_map = "item {\n id : 1\n name : 'pack'\n}"
# `with` closes (and flushes) the file even if the write fails
with open("label_map.pbtxt", "w") as myfile:
    myfile.write(l_map)

In [0]:
## Move the label_map.pbtxt file into the training folder
!mv label_map.pbtxt /content/models/research/object_detection/training

In [0]:
## Point the sample ssd_mobilenet_v1_coco.config at our checkpoint, TFRecord files
## and label map, and set the number of classes. The placeholders below belong to
## the SSD MobileNet COCO sample config; another model needs its own substitutions.
import re

filename = '/content/models/research/object_detection/training/ssd_mobilenet_v1_coco.config'
training_dir = '/content/models/research/object_detection/training'

# (pattern, replacement) pairs. Patterns are raw strings with `.` and `?`
# escaped, so they match the placeholder text literally.
config_substitutions = [
    (r'num_classes: 90', 'num_classes: 1'),
    # (r'num_examples: 8000', 'num_examples: 71'),  # optional, left disabled as in the original
    (r'PATH_TO_BE_CONFIGURED/model\.ckpt',
     f'{training_dir}/ssd_mobilenet_v1_coco_2017_11_17/model.ckpt'),
    (r'PATH_TO_BE_CONFIGURED/mscoco_train\.record-\?\?\?\?\?-of-00100',
     f'{training_dir}/train.record'),
    (r'PATH_TO_BE_CONFIGURED/mscoco_val\.record-\?\?\?\?\?-of-00010',
     f'{training_dir}/test.record'),
    (r'PATH_TO_BE_CONFIGURED/mscoco_label_map\.pbtxt',
     f'{training_dir}/label_map.pbtxt'),
]

with open(filename) as f:
    s = f.read()
for pattern, replacement in config_substitutions:
    s = re.sub(pattern, replacement, s)
# Re-open for writing only after every substitution has succeeded, so an error
# above cannot leave the config file truncated.
with open(filename, 'w') as f:
    f.write(s)

In [0]:
## Start the training. It took about 3-4 hours. Training can run up to 20000 steps,
## but that many steps were not needed in this case.
## We used 5545 steps for training.
!python /content/models/research/object_detection/legacy/train.py \
    --logtostderr \
    --train_dir=/content/models/research/object_detection/training/trained \
    --pipeline_config_path=/content/models/research/object_detection/training/ssd_mobilenet_v1_coco.config

In [0]:
## Evaluate the checkpoints written to training/trained; results go to training/valid.
!python3 /content/models/research/object_detection/legacy/eval.py \
    --logtostderr \
    --checkpoint_dir=/content/models/research/object_detection/training/trained \
    --pipeline_config_path=/content/models/research/object_detection/training/ssd_mobilenet_v1_coco.config \
    --eval_dir=/content/models/research/object_detection/training/valid

In [0]:
## Export the trained checkpoint (step 5545) as the detection model.
## This cell writes frozen_inference_graph.pb, the model used for inference below, to /content/output.
!python /content/models/research/object_detection/export_inference_graph.py \
    --input_type image_tensor \
    --pipeline_config_path /content/models/research/object_detection/training/ssd_mobilenet_v1_coco.config \
    --trained_checkpoint_prefix /content/models/research/object_detection/training/trained/model.ckpt-5545 \
    --output_directory /content/output

In [0]:
## Paths and settings used for inference
PATH_TO_MODEL = '/content/output/frozen_inference_graph.pb'  # frozen graph written by the export cell
PATH_TO_LABELS = '/content/models/research/object_detection/training/label_map.pbtxt'  # label map with the single 'pack' class
NUM_CLASSES =1  # number of classes in the label map
PATH_TO_IMAGES = '/content/data/images/ShelfImages/'  # folder with the full shelf photos

In [0]:
# Load the frozen graph: read the serialized GraphDef from PATH_TO_MODEL and
# import it into a dedicated tf.Graph (detection_graph).
detection_graph = tf.Graph()
with detection_graph.as_default():
    od_graph_def = tf.GraphDef()
    with tf.gfile.GFile(PATH_TO_MODEL, 'rb') as fid:
        serialized_graph = fid.read()
        od_graph_def.ParseFromString(serialized_graph)
        # name='' imports the tensors without a name prefix, so they can later
        # be looked up as e.g. 'image_tensor:0'
        tf.import_graph_def(od_graph_def, name='')

In [0]:
# Load the categories (there is only one category: pack) and build the
# category index from them.
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(
    label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
category_index = label_map_util.create_category_index(categories)

In [0]:
# let's write function that executes detection
def run_inference_for_single_image(image, image_tensor, sess, tensor_dict):
    """Run the detector on one image and strip the batch dimension from its outputs.

    Args:
        image: a single image array; a leading batch axis of size 1 is added here.
        image_tensor: the graph's input placeholder, used as the feed_dict key.
        sess: session whose `run` evaluates `tensor_dict`.
        tensor_dict: mapping of output names to the tensors to fetch.

    Returns:
        The dict returned by `sess.run`, in which `num_detections` is converted
        to an int, `detection_classes` to a uint8 array, and `detection_boxes` /
        `detection_scores` are reduced to the first (only) batch entry.
    """
    batch = np.expand_dims(image, 0)
    output_dict = sess.run(tensor_dict, feed_dict={image_tensor: batch})
    # the batch holds a single image, so keep only entry 0 of every output
    first = {name: output_dict[name][0]
             for name in ('num_detections', 'detection_classes',
                          'detection_boxes', 'detection_scores')}
    output_dict['num_detections'] = int(first['num_detections'])
    output_dict['detection_classes'] = first['detection_classes'].astype(np.uint8)
    output_dict['detection_boxes'] = first['detection_boxes']
    output_dict['detection_scores'] = first['detection_scores']
    return output_dict

In [0]:
# it is useful to be able to run inference not only on the whole image,
# but also on its parts
def run_inference_for_image_part(image_tensor, sess, tensor_dict, 
                                 image, cutoff, ax0, ay0, ax1, ay1):
    """Detect objects inside the rectangle image[ay0:ay1, ax0:ax1].

    Args:
        image_tensor, sess, tensor_dict: passed through to
            run_inference_for_single_image.
        image: the full image array.
        cutoff: minimum detection score needed to keep a box.
        ax0, ay0, ax1, ay1: pixel corners of the part to analyse.

    Returns:
        List of (x0, y0, x1, y1, score) tuples in full-image pixel coordinates,
        with score as an integer percentage.
    """
    im = image[ay0:ay1, ax0:ax1]
    h, w = im.shape[:2]
    output_dict = run_inference_for_single_image(im, image_tensor, sess, tensor_dict)
    boxes = []
    # Walk over however many detections the model returned, instead of assuming
    # exactly 100 of them.
    for (y0, x0, y1, x1), score in zip(output_dict['detection_boxes'],
                                       output_dict['detection_scores']):
        # stop at the first score below the cutoff
        # NOTE(review): this assumes scores arrive in descending order - confirm
        if score < cutoff:
            break
        # boxes are fractions of the part's size: scale to pixels, then shift
        # by the part's offset to get full-image coordinates
        boxes.append((int(x0*w) + ax0, int(y0*h) + ay0,
                      int(x1*w) + ax0, int(y1*h) + ay0,
                      int(score * 100)))
    return boxes

In [0]:
# additional helper: describe the image part with fractions instead of pixels
def run_inference_for_image_part_pcnt(image_tensor, sess, tensor_dict, 
                                 image, cutoff, p_ax0, p_ay0, p_ax1, p_ay1):
    """Run part inference on a rectangle given as fractions of the image size.

    Each fraction is multiplied by (width - 1) or (height - 1) and truncated to
    an int; the resulting pixel rectangle is handed to
    run_inference_for_image_part, whose result is returned unchanged.
    """
    img_h, img_w, _ = image.shape
    last_col, last_row = img_w - 1, img_h - 1
    ax0, ay0 = int(p_ax0 * last_col), int(p_ay0 * last_row)
    ax1, ay1 = int(p_ax1 * last_col), int(p_ay1 * last_row)
    return run_inference_for_image_part(
        image_tensor, sess, tensor_dict, image, cutoff, ax0, ay0, ax1, ay1)

In [0]:
# function to display image with bounding boxes

# Position (x, y), in pixels, passed to cv2.putText for the product-count caption
org = (100, 100) 
  
# Font scale factor of the caption
fontScale = 3
   
# Caption colour: (0, 0, 255) is blue here, because display_image_with_boxes
# converts the image to RGB before drawing (in BGR order it would be red)
color = (0, 0, 255) 
  
# Stroke thickness of the caption text, in pixels
thickness = 5

def display_image_with_boxes(image, boxes, p_x0=0, p_y0=0, p_x1=1, p_y1=1):
    """Draw the detected boxes and a product-count caption, then show the image.

    Args:
        image: BGR image; it is converted to RGB before anything is drawn.
        boxes: iterable of (x0, y0, x1, y1, score) pixel boxes.
        p_x0, p_y0, p_x1, p_y1: optional sub-region as fractions of the image
            size; when they differ from the whole image (0, 0, 1, 1) the region
            is outlined too.

    Returns:
        (count, image_handle): the number of boxes drawn and the object returned
        by plt.imshow.
    """
    canvas = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    count = 0
    for count, (x0, y0, x1, y1, _score) in enumerate(boxes, start=1):
        canvas = cv2.rectangle(canvas, (x0, y0), (x1, y1), (0,255,0), 5)
    canvas = cv2.putText(canvas, f'Total Number of product: {count}', org,
                         cv2.FONT_HERSHEY_SIMPLEX, fontScale, color,
                         thickness, cv2.LINE_AA)
    covers_whole_image = p_x0 == 0 and p_y0 == 0 and p_x1 == 1 and p_y1 == 1
    if not covers_whole_image:
        # outline the analysed sub-region
        img_h, img_w, _ = canvas.shape
        last_col, last_row = img_w - 1, img_h - 1
        top_left = (int(p_x0*last_col), int(p_y0*last_row))
        bottom_right = (int(p_x1*last_col), int(p_y1*last_row))
        canvas = cv2.rectangle(canvas, top_left, bottom_right, (0,0,255), 5)

    plt.figure(figsize=(14, 14))
    return count, plt.imshow(canvas)

In [0]:
def initialize_graph():
    """Look up the input and output tensors of the current default graph.

    Returns:
        (image_tensor, tensor_dict): the 'image_tensor:0' input tensor, and a
        dict mapping each output name found in the graph (out of
        num_detections, detection_boxes, detection_scores, detection_classes,
        detection_masks) to its tensor.
    """
    graph = tf.get_default_graph()
    available = {tensor.name
                 for op in graph.get_operations()
                 for tensor in op.outputs}
    wanted = ('num_detections', 'detection_boxes',
              'detection_scores', 'detection_classes',
              'detection_masks')
    # keep only the outputs this particular graph actually provides
    tensor_dict = {key: graph.get_tensor_by_name(key + ':0')
                   for key in wanted
                   if key + ':0' in available}
    image_tensor = graph.get_tensor_by_name('image_tensor:0')
    return image_tensor, tensor_dict


In [0]:
def non_max_suppression(boxes, overlapThresh):
    """Greedy non-maximum suppression over (x1, y1, x2, y2, score) rows.

    Boxes are visited from highest to lowest score. Each visited box is kept,
    and every remaining box is dropped when the share of its own area covered
    by the kept box exceeds `overlapThresh`. The measure is
    intersection / remaining-box area, not intersection-over-union.

    Args:
        boxes: 2-D array with one (x1, y1, x2, y2, score) row per box.
        overlapThresh: overlap ratio above which a box is suppressed.

    Returns:
        Integer array of the kept rows, highest score first; an empty integer
        array when `boxes` is empty.
    """
    if len(boxes) == 0:
        return np.array([]).astype("int")

    # integer input is promoted so the overlap division is done in floats
    if boxes.dtype.kind == "i":
        boxes = boxes.astype("float")

    left, top, right, bottom, scores = (boxes[:, col] for col in range(5))
    areas = (right - left + 1) * (bottom - top + 1)

    # ascending by score: the best remaining candidate is always at the end
    order = np.argsort(scores)
    kept = []
    while len(order) > 0:
        best, rest = order[-1], order[:-1]
        kept.append(best)

        # intersection of the best box with every remaining box
        inter_w = np.maximum(
            0, np.minimum(right[best], right[rest]) - np.maximum(left[best], left[rest]) + 1)
        inter_h = np.maximum(
            0, np.minimum(bottom[best], bottom[rest]) - np.maximum(top[best], top[rest]) + 1)

        #todo fix overlap-contains...
        overlap = (inter_w * inter_h) / areas[rest]

        # drop only the boxes whose overlap is above the threshold
        order = rest[~(overlap > overlapThresh)]

    return boxes[kept].astype("int")

In [0]:
def do_sliding_window_inference_with_nm_suppression(file, cutoff):
    """Detect products in one shelf photo with a sliding window plus NMS.

    A first pass over the whole image estimates the mean box width and height.
    The image is then scanned with a square window of twice the mean height,
    stepping one mean box width horizontally and one mean box height
    vertically. The boxes of all windows are merged with non_max_suppression
    (threshold 0.5) and drawn.

    Args:
        file: image file name inside PATH_TO_IMAGES.
        cutoff: minimum detection score needed to keep a box.

    Returns:
        The (count, image_handle) pair returned by display_image_with_boxes.

    Raises:
        FileNotFoundError: when the image cannot be read.
    """
    with detection_graph.as_default():
        with tf.Session() as sess:
            image_tensor, tensor_dict = initialize_graph()
            image = cv2.imread(f'{PATH_TO_IMAGES}{file}')
            if image is None:
                # cv2.imread reports failure by returning None instead of raising
                raise FileNotFoundError(f'Could not read image: {PATH_TO_IMAGES}{file}')
            h, w = image.shape[:2]
            boxes = run_inference_for_image_part_pcnt(
                image_tensor, sess, tensor_dict, image, cutoff, 0, 0, 1, 1)
            # With no detection in the first pass there is no box size to derive
            # the window from; the empty result is displayed as-is (count 0).
            if len(boxes) > 0:
                a = np.array(boxes)
                mean_dx = int(np.mean(a[:,2]-a[:,0]))
                mean_dy = int(np.mean(a[:,3]-a[:,1]))
                # at least 1 px, so the scan below always advances
                step_x, step_y = max(1, mean_dx), max(1, mean_dy)
                window_size = 2*step_y
                boxes = []
                # horizontal moves use step_x, vertical moves use step_y
                for y0 in range(0, h-1, step_y):
                    for x0 in range(0, w-1, step_x):
                        x1, y1 = x0 + window_size, y0 + window_size
                        boxes += run_inference_for_image_part(
                            image_tensor, sess, tensor_dict, image, cutoff, 
                            x0, y0, x1, y1)
                boxes = non_max_suppression(np.array(boxes), 0.5)
    return display_image_with_boxes(image, boxes)

In [0]:
# Run detection on one sample shelf photo, keeping boxes with a score of at least 0.5
do_sliding_window_inference_with_nm_suppression('C3_P02_N1_S2_2.JPG', 0.5) 

In [0]:
## Write the product count of one image to a JSON file

import json 
  
# Data to be written 
# NOTE(review): despite its name, img_dir holds an image file name, not a directory
img_dir = 'C3_P02_N1_S2_2.JPG'
# Runs the full detection again (and draws the figure again); counter is the
# number of boxes drawn, i.e. the first element of the returned pair
counter,_ = do_sliding_window_inference_with_nm_suppression(img_dir, 0.5)
# NOTE(review): the key "no_of_product " has a trailing space inside the quotes;
# it is written to the JSON file as-is - confirm whether consumers expect it
dictionary ={ 
    "file_name" : img_dir, 
    "no_of_product " : counter, 
} 
  
# Serializing json  
json_object = json.dumps(dictionary, indent = 4) 
  
# Writing to sample.json 
with open("sample.json", "w") as outfile: 
    outfile.write(json_object)