In [1]:
from os import listdir, path
import xml.etree.ElementTree as ET

class data_manager():
    def __init__(self, image_filepath : str, annotation_filepath : str):
        self.image_filepath = image_filepath
        self.annotation_filepath = annotation_filepath
    
    def create_data_dict(self):
        # For each dict
        data_dict = {}
                            
        for folder in listdir(self.image_filepath):
            folder_path = self.image_filepath + '/' + folder
            for pciture in listdir(folder_path):
                image_path = folder_path + '/' + pciture
                # Get annotations
                image_number = pciture.split(".")[0]
                
                if not path.isfile(self.annotation_filepath + "/" + folder + "/" + image_number + ".eng"):
                    continue
                          
                # Makes special character beeing parsed
                parser = ET.XMLParser(encoding="ansi")
                
                tree = ET.parse(self.annotation_filepath + "/" + folder + "/" + image_number + ".eng", parser=parser)
                root = tree.getroot()
                tmp_dict = {}
                for child in root:
                    tmp_dict[child.tag] = child.text
                data_dict[image_number] = tmp_dict
        print("Parsed " + str(len(data_dict)) + " entries!")
        return data_dict

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.resnet50 import preprocess_input

# import sys
# from PIL import Image
# sys.modules['Image'] = Image 

class data_generator():
    
    def train_tokenizer(self, data_dict : dict):
        texts = []
        for key in data_dict:
            texts.append(data_dict[key]["DESCRIPTION"])
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(texts)
    
    def picture_data(self, data_dict : dict, batch_size=32):
        
        if not self.tokenizer:
            self.train_tokenizer(data_dict)
        
        samples_per_epoch = len(data_dict)
        number_of_batches = samples_per_epoch/batch_size
        counter=0
        
        while 1:
            for image_number in data_dict.keys():
                #print(data_dict[image_number]["IMAGE"])
                image = load_img("iaprtc12/" + data_dict[image_number]["IMAGE"], target_size=(224, 224))
                # convert the image pixels to a numpy array
                image = img_to_array(image)
                # reshape data for the model
                image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
                # prepare the image for the VGG model
                image = preprocess_input(image)
                
                caption = self.tokenizer.texts_to_sequences(data_dict[image_number]["DESCRIPTION"])
                
                counter += 1
                yield image, caption
        #restart counter to yeild data in the next epoch as well
        if counter >= number_of_batches:
            counter = 0

In [7]:
dm = data_manager("iaprtc12/images", "iaprtc12/annotations_complete_eng")
data_dict = dm.create_data_dict()

Parsed 19999 entries!


In [8]:
generator = data_generator()
generator.train_tokenizer(data_dict)

In [9]:
#Inspired by https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/

from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add

# define the captioning model
def define_model(vocab_size, max_length):
    # feature extractor model
    inputs1 = Input(shape=(4096,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    # sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    # decoder model
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    # compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    # summarize model
    model.summary()
    #plot_model(model, to_file='model.png', show_shapes=True)
    return model

# calculate the length of the description with the most words
def max_length(data_dict):
    #lines = to_lines(descriptions)
    max_length = 0
    for value in data_dict.values():
        max_length = max(max_length, len(value["DESCRIPTION"]))
    return max_length

vocab_size = len(generator.tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# determine the maximum sequence length
max_length = max_length(data_dict)
print('Description Length: %d' % max_length)
 
# define the model
model = define_model(vocab_size, max_length)
# train the model, run epochs manually and save after each epoch
epochs = 20
steps = len(data_dict)
for i in range(epochs):
    # create the data generator
    # fit for one epoch
    model.fit_generator(generator.picture_data(data_dict), epochs=1, steps_per_epoch=steps, verbose=1)
    # save model
    model.save('model_' + str(i) + '.h5')

Vocabulary Size: 4555
Description Length: 420
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 420)          0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 4096)         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 420, 256)     1166080     input_4[0][0]                    
__________________________________________________________________________________________________
dropout_3 (Dropout)             (None, 4096)         0           input_3[0][0]                    
_______________________________________________________________

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 2 array(s), but instead got the following list of 1 arrays: [array([[[[ 98.061    ,  79.221    ,  59.32     ],
         [151.061    , 134.22101  , 116.32     ],
         [ 97.061    ,  76.221    ,  62.32     ],
         ...,
         [-27.939003 , -35.779    ,...

## ARCHIVE!!!

In [39]:
from os import listdir
from pickle import dump
from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model
 
# extract features from each photo in the directory
def extract_features(directory):
    # load the model
    model = VGG16()
    # re-structure the model
    model.layers.pop()
    model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
    # summarize
    print(model.summary())
    # extract features from each photo
    features = dict()
    for supdic in listdir(directory):
        path = directory + '/' + supdic
        for name in listdir(path):
            # load an image from file
            filename = path + '/' + name
            image = load_img(filename, target_size=(224, 224))
            # convert the image pixels to a numpy array
            image = img_to_array(image)
            # reshape data for the model
            image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
            # prepare the image for the VGG model
            image = preprocess_input(image)
            # get features
            feature = model.predict(image, verbose=0)
            # get image id
            image_id = name.split('.')[0]
            # store feature
            features[image_id] = feature
            print('>%s' % name)
    return features
 
# extract features from all images
directory = 'iaprtc12/images'
features = extract_features(directory)
print('Extracted Features: %d' % len(features))
# save to file
dump(features, open('features.pkl', 'wb'))

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels.h5
  3694592/553467096 [..............................] - ETA: 10:29

KeyboardInterrupt: 