In [1]:
import sys
sys.path.append('./')
from keras import backend as K
from keras.models import Model
from keras.layers import Input, Dense, Flatten
from keras.layers import Convolution1D
from keras.layers import MaxPooling1D
from keras.layers import Embedding
from keras.layers import ThresholdedReLU
from keras.layers import Dropout
from keras.callbacks import TensorBoard
from keras.callbacks import EarlyStopping
from keras import layers
from keras import models
import h5py
from keras.models import Model
from keras import Input
from keras import optimizers
import os, shutil
from os import listdir
from os.path import isfile, join, isdir

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Get dataset classes

In [2]:
classes = {}
images_path = 'data/tobacco3482jpg/'
classes_dirs = [f for f in listdir(images_path) if isdir(join(images_path, f))]
print('Classes Found:')
for image_class in classes_dirs:
    classes[image_class] = {}
    print(image_class)

Classes Found:
Note
News
Scientific
Memo
Email
Form
ADVE
Resume
Letter
Report


# Create train and validation dir for each class

In [3]:
base_dir = 'data/tobacco3482jpg/'


train_dir = os.path.join(base_dir, 'train')
os.mkdir(train_dir)

validation_dir = os.path.join(base_dir, 'validation')
os.mkdir(validation_dir)

for image_class in classes_dirs:
    
    classes[image_class]['train_dir'] = os.path.join(train_dir, image_class)
    os.mkdir(classes[image_class]['train_dir'])

    classes[image_class]['validation_dir'] = os.path.join(validation_dir, image_class)
    os.mkdir(classes[image_class]['validation_dir'])



# get all images file name for each class

In [4]:
for image_class in classes_dirs:
    images_path = 'data/tobacco3482jpg/{}/'.format(image_class)
    classes[image_class]['images'] = [f for f in listdir(images_path) if isfile(join(images_path, f))]
    

# set validation and train size
### Downsample_factor is used to get only a % of the data

In [5]:
train_size = 0.7
validation_size = 1 - train_size
downsample_factor = 1

for image_classe, class_data in classes.items():
    class_data['len_train'] = int(len(class_data['images']) * train_size * downsample_factor)
    class_data['len_validation'] = int(len(class_data['images']) * validation_size * downsample_factor)

# Move files inside class folders to their respective folder inside train and validation folder

In [8]:
for image_class, class_data in classes.items():
    original_dir = base_dir + image_class + '/'
    images = class_data['images']
    len_train = class_data['len_train']
    len_validation = class_data['len_validation']
    for file in images[:len_train]:        
        src = os.path.join(original_dir, file)
        dst = os.path.join(class_data['train_dir'], file)
        shutil.copyfile(src, dst)
    
    for file in images[len_train:len_train + len_validation]:
        src = os.path.join(original_dir, file)
        dst = os.path.join(class_data['validation_dir'], file)
        shutil.copyfile(src, dst)



In [9]:
for image_class, class_data in classes.items():
    print('\ntotal training {} images:'.format(image_class), len(os.listdir(class_data['train_dir'])))
    print('total validation {} images:'.format(image_class), len(os.listdir(class_data['validation_dir'])))


total training Note images: 28
total validation Note images: 12

total training News images: 26
total validation News images: 11

total training Scientific images: 36
total validation Scientific images: 15

total training Memo images: 86
total validation Memo images: 37

total training Email images: 84
total validation Email images: 36

total training Form images: 60
total validation Form images: 25

total training ADVE images: 32
total validation ADVE images: 13

total training Resume images: 16
total validation Resume images: 7

total training Letter images: 79
total validation Letter images: 34

total training Report images: 37
total validation Report images: 15


In [10]:
image_height = 150
image_width = 150

In [11]:
def nll1(y_true, y_pred):
    """ Negative log likelihood. """

    # keras.losses.binary_crossentropy give the mean
    # over the last axis. we require the sum
    return K.sum(K.binary_crossentropy(y_true, y_pred), axis=-1)

# CNN

In [12]:
image_input = Input(shape=(image_height, image_width, 1), name='sent_input', dtype='float32')
x = layers.Conv2D(20, (7, 7), activation='relu',input_shape=(image_height, image_width, 1))(image_input)
x = layers.MaxPooling2D((4, 4))(x)
x = layers.Conv2D(50, (5, 5), activation='relu')(x)
x = layers.MaxPooling2D((4, 4))(x)
x = layers.Flatten()(x)
x = layers.Dense(1024, activation='relu')(x)
x = layers.Dense(1024, activation='relu')(x)
output = layers.Dense(10, activation='softmax')(x)

model_func = Model(image_input, output)

Instructions for updating:
Colocations handled automatically by placer.


In [13]:
sgd = optimizers.SGD(lr=0.01)

model_func.compile(loss=nll1,
optimizer=sgd,
metrics=['acc'])

# Create generator to train on batch

In [18]:
from keras.preprocessing.image import ImageDataGenerator

train_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(image_height, image_width),
    batch_size=20,
    class_mode='categorical',
    subset='training',
    color_mode='grayscale')

validation_generator = train_datagen.flow_from_directory(
    validation_dir,
    target_size=(image_height, image_width),
    batch_size=20,
    class_mode='categorical',
    subset='validation',
    color_mode='grayscale')

print(len(train_generator))
print(len(validation_generator))


Found 391 images belonging to 10 classes.
Found 38 images belonging to 10 classes.
20
2


# Train model 

In [19]:
history = model_func.fit_generator(
train_generator,
steps_per_epoch=train_generator.samples,
epochs=8,
validation_data=validation_generator,
validation_steps=validation_generator.samples,
verbose=1)


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
 49/391 [==>...........................] - ETA: 1:38 - loss: 0.0054 - acc: 1.0000

KeyboardInterrupt: 

# Save Model

In [None]:
model_func.save('document_classification.h5')