In [None]:
import sys
sys.path.append('./')
from keras import backend as K
from keras.models import Model
from keras.layers import Input, Dense, Flatten
from keras.layers import Convolution1D
from keras.layers import MaxPooling1D
from keras.layers import Embedding
from keras.layers import ThresholdedReLU
from keras.layers import Dropout
from keras.callbacks import TensorBoard
from keras.callbacks import EarlyStopping
from keras import layers
from keras import models
import h5py
from keras.models import Model
from keras import Input
from keras import optimizers
import os, shutil
from os import listdir
from os.path import isfile, join, isdir

# Get dataset classes

In [None]:
# Discover the document classes: every sub-directory of the dataset root is
# treated as one class, keyed by its directory name.
classes = {}
images_path = 'data/tobacco3482jpg/'
# The 'train' and 'validation' split folders are created inside this same
# root by a later cell, so they are skipped here; otherwise re-running the
# notebook would register them as two extra classes.
split_dir_names = {'train', 'validation'}
# Sorted because os.listdir returns entries in arbitrary order.
classes_dirs = sorted(
    f for f in listdir(images_path)
    if isdir(join(images_path, f)) and f not in split_dir_names
)
print('Classes Found:')
for image_class in classes_dirs:
    classes[image_class] = {}
    print(image_class)

# Create train and validation dir for each class

In [None]:
base_dir = 'data/tobacco3482jpg/'

# os.makedirs(..., exist_ok=True) is used instead of os.mkdir so this cell can
# be re-run (e.g. after a kernel restart) without raising FileExistsError on
# folders that a previous run already created.
train_dir = os.path.join(base_dir, 'train')
os.makedirs(train_dir, exist_ok=True)

validation_dir = os.path.join(base_dir, 'validation')
os.makedirs(validation_dir, exist_ok=True)

# Give every class its own sub-folder under both split folders and remember
# the two paths in that class's entry of `classes`.
for image_class in classes_dirs:
    class_data = classes[image_class]

    class_data['train_dir'] = os.path.join(train_dir, image_class)
    os.makedirs(class_data['train_dir'], exist_ok=True)

    class_data['validation_dir'] = os.path.join(validation_dir, image_class)
    os.makedirs(class_data['validation_dir'], exist_ok=True)

# Get all image file names for each class

In [None]:
# Record, for every class, the names of the regular files stored directly in
# that class's source directory.
for image_class in classes_dirs:
    images_path = 'data/tobacco3482jpg/{}/'.format(image_class)
    # Sorted: os.listdir order is arbitrary, and the train/validation split is
    # taken by position in this list, so an unsorted list could assign the
    # same file to different splits on different runs.
    classes[image_class]['images'] = sorted(
        f for f in listdir(images_path) if isfile(join(images_path, f))
    )
    

# Set validation and train size
### `downsample_factor` keeps only that fraction of each class's images (1 keeps everything)

In [None]:
train_size = 0.7
# Share left for validation; the per-class validation count below is derived
# as "images used minus training images" so the two always add up.
validation_size = 1 - train_size
downsample_factor = 1

for image_class, class_data in classes.items():
    n_images = len(class_data['images'])
    # Number of images of this class that take part in the split at all.
    n_used = int(n_images * downsample_factor)
    class_data['len_train'] = int(n_images * train_size * downsample_factor)
    # Truncating the validation share independently could silently drop an
    # image (e.g. 11 images -> 7 train + 3 validation); the remainder is
    # assigned to validation instead.
    class_data['len_validation'] = n_used - class_data['len_train']

# Copy the files from each class folder into that class's folder inside the train and validation folders

In [None]:
# For every class, copy the first `len_train` images into its train folder and
# the following `len_validation` images into its validation folder.
for image_class, class_data in classes.items():
    original_dir = base_dir + image_class + '/'
    file_names = class_data['images']
    n_train = class_data['len_train']
    n_total = n_train + class_data['len_validation']

    # (file names, destination folder) pairs, processed train first.
    split_plan = (
        (file_names[:n_train], class_data['train_dir']),
        (file_names[n_train:n_total], class_data['validation_dir']),
    )
    for names, target_dir in split_plan:
        for name in names:
            shutil.copyfile(os.path.join(original_dir, name),
                            os.path.join(target_dir, name))



In [None]:
# Report how many entries each class's train and validation folders contain.
for image_class, class_data in classes.items():
    n_train_files = len(os.listdir(class_data['train_dir']))
    print('\ntotal training {} images:'.format(image_class), n_train_files)

    n_validation_files = len(os.listdir(class_data['validation_dir']))
    print('total validation {} images:'.format(image_class), n_validation_files)

# CNN

In [None]:
# Height and width (both 150) used for the network's input shape and as the
# target_size the image generators resize to.
image_height = image_width = 150

In [None]:
# Size the softmax layer from the class folders actually present in train_dir
# rather than hard-coding 8: flow_from_directory(class_mode='categorical')
# produces one label column per sub-folder, and the output width must match.
num_classes = len([d for d in listdir(train_dir) if isdir(join(train_dir, d))])

# Single-channel (grayscale) image input.
image_input = Input(shape=(image_height, image_width, 1), name='sent_input', dtype='float32')

# Two convolution + max-pooling stages. No input_shape is passed to the first
# Conv2D: in the functional API the shape comes from the tensor it is called on.
x = layers.Conv2D(20, (7, 7), activation='relu')(image_input)
x = layers.MaxPooling2D((4, 4))(x)
x = layers.Conv2D(50, (5, 5), activation='relu')(x)
x = layers.MaxPooling2D((4, 4))(x)

# Classifier head: two 1024-unit dense layers, dropout, then one softmax unit
# per class.
x = layers.Flatten()(x)
x = layers.Dense(1024, activation='relu')(x)
x = layers.Dense(1024, activation='relu')(x)
x = layers.Dropout(0.5)(x)
output = layers.Dense(num_classes, activation='softmax')(x)

model_func = Model(image_input, output)

In [None]:
def nll1(y_true, y_pred):
    """Negative log likelihood of one-hot labels under a softmax output.

    The model ends in a softmax layer and the generators yield one-hot
    ('categorical') labels, so the matching negative log likelihood is the
    categorical cross-entropy. The previous sum of per-class binary
    cross-entropies treated every class as an independent yes/no output.

    Args:
        y_true: one-hot target tensor (assumed shape (batch, num_classes)).
        y_pred: predicted class probabilities, same shape as y_true.

    Returns:
        Tensor with one loss value per sample.
    """
    return K.categorical_crossentropy(y_true, y_pred)

In [None]:
# Plain SGD with a 0.01 learning rate; the model is compiled with the custom
# nll1 loss and reports accuracy as its metric.
sgd = optimizers.SGD(lr=0.01)

compile_options = {
    'loss': nll1,
    'optimizer': sgd,
    'metrics': ['acc'],
}
model_func.compile(**compile_options)

# Create generators to train on batches

In [None]:
from keras.preprocessing.image import ImageDataGenerator

# Every pixel value is multiplied by 1/255 as images are loaded.
train_datagen = ImageDataGenerator(rescale=1./255)

# Options shared by both directory iterators: resize to the network input
# size, batches of 20, one-hot labels, single-channel images.
flow_options = dict(
    target_size=(image_height, image_width),
    batch_size=20,
    class_mode='categorical',
    color_mode='grayscale',
)

train_generator = train_datagen.flow_from_directory(train_dir, **flow_options)
validation_generator = train_datagen.flow_from_directory(validation_dir, **flow_options)

# Number of batches each generator yields per pass over its folder.
for generator in (train_generator, validation_generator):
    print(len(generator))


# Train model 

In [None]:
# One step consumes one batch, so a pass that sees every image once needs
# len(generator) steps (the number of batches). Using `.samples` (the number
# of images) made every epoch and every validation pass loop over the data
# roughly batch_size times.
history = model_func.fit_generator(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=8,
    validation_data=validation_generator,
    validation_steps=len(validation_generator),
    verbose=1)


# Save Model

In [None]:
# Write the trained model to an HDF5 file in the working directory.
model_path = 'document_classification.h5'
model_func.save(model_path)