# 1. Import Dependencies

In [None]:
# Install OpenCV's Python bindings (provides the `cv2` module imported below).
!pip install opencv-python

In [11]:
# Import opencv
import cv2 

# Import uuid
import uuid

# Import Operating System
import os

# Import time
import time

# 2. Define Images to Collect

In [12]:
# Class names to collect images for; each label gets its own sub-folder of images.
labels = [
    'store',
    'details',
    'content',
    'balance',
]
# Image count per label (not referenced by the other cells visible in this section).
number_imgs = 3

# 3. Set Up Folders

In [13]:
# Folder layout under Tensorflow/workspace/images:
#   collectedimages - raw images, one sub-folder per label
#   test / train    - destination folders for the test and training partitions
IMAGES_PATH, TEST_PATH, TRAIN_PATH = (
    os.path.join('Tensorflow', 'workspace', 'images', leaf)
    for leaf in ('collectedimages', 'test', 'train')
)

In [6]:
# Create the collected/test/train image folders plus one sub-folder per label.
# os.makedirs replaces the per-OS `!mkdir` shell escapes: it creates any missing
# parent folders, does nothing when the folder already exists (exist_ok=True),
# copes with paths containing spaces, and behaves the same on every platform.
for folder in (IMAGES_PATH, TEST_PATH, TRAIN_PATH):
    os.makedirs(folder, exist_ok=True)
for label in labels:
    path = os.path.join(IMAGES_PATH, label)
    os.makedirs(path, exist_ok=True)

# 4. Capture Images

##### Only if pictures were taken manually

In [9]:
# Give every JPEG in each label folder a unique name of the form
# <label>.<uuid>.jpg.
# NOTE: each run assigns fresh UUIDs, including to files renamed by an earlier
# run, and only .jpg files are renamed (other files in the folder are untouched).
for label in labels:
    label_dir = os.path.join(IMAGES_PATH, label)
    files = os.listdir(label_dir)
    for file in files:
        # Match on the actual extension; a substring test would also pick up
        # names such as "scan.jpg.pdf" and rename them to *.jpg.
        if file.endswith('.jpg'):
            imgnameNew = os.path.join(label_dir, '{}.{}.jpg'.format(label, uuid.uuid1()))
            imgnameOld = os.path.join(label_dir, file)
            os.rename(imgnameOld, imgnameNew)
    

In [8]:
from pdf2image import convert_from_path

# Convert every PDF in each label folder into one JPEG per page, then delete
# the PDF.
for label in labels:
    label_dir = os.path.join(IMAGES_PATH, label)
    for file in os.listdir(label_dir):
        # Match on the actual extension rather than a substring of the name.
        if not file.endswith('.pdf'):
            continue
        filePath = os.path.join(label_dir, file)
        stem = os.path.splitext(file)[0]
        images = convert_from_path(filePath)
        for page_number, image in enumerate(images, start=1):
            # Prefix with the PDF's own name: a bare "page_N.jpg" would be
            # overwritten by the next PDF in the same folder, and the source
            # PDF is deleted below, so those pages would be lost for good.
            jpgName = os.path.join(label_dir, f'{stem}_page_{page_number}.jpg')
            image.save(jpgName, 'JPEG')
        os.remove(filePath)

In [None]:
# Install or upgrade PyQt5 and lxml (presumably prerequisites of the labelImg tool set up below; confirm against its README).
!pip install --upgrade pyqt5 lxml

In [14]:
# Folder that holds the local clone of the labelImg repository.
LABELIMG_PATH = os.path.join(
    'Tensorflow',
    'labelimg',
)

In [None]:
# Fetch the labelImg sources once; the whole step is skipped when the target
# folder already exists.
if not os.path.exists(LABELIMG_PATH):
    !mkdir {LABELIMG_PATH}
    !git clone https://github.com/tzutalin/labelImg {LABELIMG_PATH}

In [None]:
if os.name == 'posix':
    !make qt5py3
if os.name =='nt':
    !cd {LABELIMG_PATH} && pyrcc5 -o libs/resources.py resources.qrc

In [None]:
# Start labelImg from inside its checkout (the `cd` only applies to this one shell command).
!cd {LABELIMG_PATH} && python labelImg.py

# 6. Move them into a Training and Testing Partition

In [15]:
import shutil


def move_to(file, label, location):
    """Move an image and its annotation file out of a label folder.

    Args:
        file: Image file name (not a path) inside ``IMAGES_PATH/<label>``.
        label: Name of the label sub-folder that holds the image.
        location: Destination handed to ``shutil.move`` for both files.

    Raises:
        FileNotFoundError: If the image or its ``.xml`` annotation is missing.
            Nothing is moved in that case, so a pair is never split up.
    """
    label_dir = os.path.join(IMAGES_PATH, label)
    img_path = os.path.join(label_dir, file)
    # The annotation shares the image's base name ("<label>.<id>.jpg" ->
    # "<label>.<id>.xml"). Swapping the extension also works for image names
    # that do not follow the "<label>.<id>" pattern.
    xml_path = os.path.join(label_dir, os.path.splitext(file)[0] + '.xml')
    # Check both files up front so a missing annotation cannot leave the image
    # moved on its own.
    for required in (img_path, xml_path):
        if not os.path.exists(required):
            raise FileNotFoundError('missing file: {}'.format(required))
    shutil.move(img_path, location)
    shutil.move(xml_path, location)

In [16]:
def delete_contents(path):
    """Delete every entry located directly inside ``path``.

    Each entry is removed with ``os.remove``, so the folder is expected to
    contain files only; ``path`` itself is kept.
    """
    entry_names = os.listdir(path)
    targets = [os.path.join(path, entry_name) for entry_name in entry_names]
    for target in targets:
        os.remove(target)

In [19]:
# Clear training and testing folders
# NOTE: this permanently deletes whatever is in both folders. The partition
# step moves files into them with shutil.move (it does not copy), so running
# this cell after a split removes those images and annotations.
for stale_dir in (TEST_PATH, TRAIN_PATH):
    delete_contents(stale_dir)

In [20]:
# Separate data
# For each label, images (with their annotations) go to the training folder
# until at most `threshold` entries are left in the label folder; the images
# handled from then on go to the test folder. The count covers every entry in
# the folder, not just the images.
threshold = 2
for label in labels:
    label_dir = os.path.join(IMAGES_PATH, label)
    entries = os.listdir(label_dir)
    remaining = len(entries)
    for entry in entries:
        still_present = os.path.exists(os.path.join(label_dir, entry))
        if not (still_present and entry.endswith('.jpg')):
            continue
        destination = TEST_PATH if remaining <= threshold else TRAIN_PATH
        move_to(entry, label, destination)
        # Recount after each move so the threshold sees what is left.
        remaining = len(os.listdir(label_dir))

# OPTIONAL - 7. Compress them for Colab Training

In [None]:
# Train/test folders plus the path of the tarball that will bundle them.
TRAIN_PATH, TEST_PATH, ARCHIVE_PATH = [
    os.path.join('Tensorflow', 'workspace', 'images', leaf)
    for leaf in ('train', 'test', 'archive.tar.gz')
]

In [None]:
# Pack the train and test folders into one gzip-compressed tarball (-c create, -z gzip, -f output file).
!tar -czf {ARCHIVE_PATH} {TRAIN_PATH} {TEST_PATH}