## Team Members:
* Erick Kramer
* Swaroop Bhandary
* Mihir Patil

## Neural Image Captioning

### Instructions: 
Implement the neural image captioning model (NIC), as described previously in the lectures, for the IAPR TC-12 dataset.

Dataset can be found here: https://www.imageclef.org/photodata

Original paper can be found here: https://arxiv.org/abs/1411.4555

EVALUATION:

- Data manager class for loading the dataset.

- The dataset to be loaded should be a dictionary whose keys are the image paths and whose values are the descriptions of the respective images (storing paths instead of the images themselves reduces memory usage).

- Generator class that takes the data dictionary constructed by the data manager class. This class generates batches for the network.

- NIC model.

- Display of training loss and training accuracy.

- Display descriptions generated for the validation/test images.

- Display of random images taken from the internet. 

TIPS:

- REUSE your code from the previous exercise that included generation of text. 

- Base your data manager solution from: https://github.com/oarriaga/neural_image_captioning

- Give proper credit when explicitly using the code of others.

### Data Manager

In [5]:
from collections import Counter
from itertools import chain
import os
import pickle
from string import digits
import time

import h5py
import numpy as np
import pandas as pd

class DataManager(object):
    '''
    Class to manage and preprocess data.

    # Disclaimer:
    This class was developed based on the code presented in: https://github.com/oarriaga/neural_image_captioning/blob/master/src/data_manager.py

    # Arguments:
    * data_filename: File containing in every row the image name and
    the caption, separated by a special character, i.e. separator
    * max_caption_length: Maximum number of words a caption may have;
    longer captions are dropped
    * separator: Character separating image name and caption in the file
    * word_frequency_threshold: Words that occur fewer times than this
    are removed from the vocabulary
    * randomize_data: Flag to shuffle the rows after loading
    * split_data_flag: Flag to write train/validation/test splits
    * extract_image_features: Flag to create a h5py file that contains
    a vector of features extracted by a pre-trained CNN
    * image_directory: Path to the images to extract the features
    * dump_path: Directory in which all output files are written
    * extractor_model: One of 'vgg16', 'vgg19' or 'inception'
    '''

    # Characters deleted from every caption (digits, punctuation, quotes).
    # Built once at class level so the table is not rebuilt per caption.
    UNDESIRED_CHARS = digits + ";.,'/*?¿><:{}[\\]|+" + '"'
    CHAR_TRANSLATOR = str.maketrans('', '', UNDESIRED_CHARS)

    def __init__(self, data_filename, max_caption_length=20., separator="*",
                 word_frequency_threshold=2, randomize_data=True,
                 split_data_flag=True, extract_image_features=False,
                 image_directory=None, dump_path='preprocessed_data',
                 extractor_model='inception'):
        '''
        Store the configuration; no file is read or written here.

        Raises ValueError when extract_image_features is set but no
        image_directory is given.
        '''
        self.data_filename = data_filename
        self.max_caption_length = max_caption_length
        self.separator = separator
        self.word_frequency_threshold = word_frequency_threshold
        self.randomize_data = randomize_data
        self.split_data_flag = split_data_flag
        self.extract_image_features = extract_image_features
        self.image_directory = image_directory
        self.dump_path = dump_path
        self.extractor_model = extractor_model

        # Remembered so that relative paths keep resolving against the
        # directory the manager was created in, even after move_to_path().
        self.current_directory = os.getcwd()
        # Tag format
        self.BOS = '<S>'  # Beginning of sentence
        self.EOS = '<E>'  # End of sentence
        self.PAD = '<P>'  # TODO: Define (presumably the padding token)

        self.word_frequencies = None
        self.captions = None
        self.image_files = None
        self.image_features = None
        self.word_to_id = None
        self.id_to_word = None
        self.extracted_features = None
        self.features_file_names = None
        self.image_feature_files = None
        self.elapsed_time = None

        # Statistics reported by write_parameters(); filled in by the
        # preprocessing steps.
        self.IMG_FEATURES = None
        self.initial_number_of_captions = None
        self.number_of_captions_removed = None
        self.current_number_of_captions = None
        self.initial_number_of_words = None
        self.number_of_words_removed = None
        self.current_number_of_words = None

        # To extract image features a directory must be given. A real
        # exception is used because asserts are stripped under `python -O`.
        if self.extract_image_features and self.image_directory is None:
            raise ValueError('image_directory is required when '
                             'extract_image_features is True')

    def preprocess(self):
        '''
        Preprocessing function does the following:
        * remove long captions
        * get dataset statistics
        * remove words with frequency below the threshold
        * create word dictionaries
        * get image features (if requested)
        * write all results into dump_path
        '''
        start_time = time.monotonic()  # Clock not affected by system clock updates
        # Everything that reads input runs before the working directory
        # changes, so relative input paths stay valid.
        self.load(self.data_filename)
        self.remove_long_captions()
        self.get_corpus_statistics()
        self.remove_infrequent_words()
        self.construct_dictionaries()
        if self.extract_image_features:
            self.get_image_features(self.image_directory)
        self.move_to_path()
        try:
            if self.extract_image_features:
                self.write_image_features_to_h5()
            self.write_data()
            self.write_dictionaries()
            self.elapsed_time = time.monotonic() - start_time
            self.write_parameters()
            if self.split_data_flag:
                self.split_data()
        finally:
            # Always restore the working directory, even after a failure.
            self.move_path_back()

    def move_to_path(self):
        '''
        Change the working directory to dump_path, creating it if needed.
        The path is resolved against the directory the manager was created
        in, so calling this twice does not nest the directory.
        '''
        directory_path = os.path.join(self.current_directory, self.dump_path)
        if not os.path.exists(directory_path):
            os.makedirs(directory_path)
        os.chdir(directory_path)

    def move_path_back(self):
        '''
        Change the working directory back to where the manager was created.
        '''
        os.chdir(self.current_directory)

    def load(self, data_filename):
        '''
        Loads the image dataset.

        The first column is taken as image file names and the second as
        captions. pandas infers the header, i.e. the first line of the
        file is consumed as column names and not as data.
        '''
        data = pd.read_table(data_filename, sep=self.separator)
        data = np.asarray(data)
        if self.randomize_data:
            np.random.shuffle(data)  # shuffles rows in place
        self.image_files = data[:, 0]
        self.captions = data[:, 1]
        num_captions = self.image_files.shape[0]
        print('Loaded dataset with ', num_captions, ' images')

    def remove_long_captions(self):
        '''
        Tokenise every caption and drop the ones that contain more than
        max_caption_length words. Afterwards self.captions holds lists
        of words and self.image_files the matching image names.
        '''
        print('Removing captions with more than ',
              self.max_caption_length, ' words')

        kept_image_files = []
        kept_captions = []
        original_file_size = len(self.captions)

        for image_file, caption in zip(self.image_files, self.captions):
            # Empty fields are read by pandas as NaN (a float); skip them.
            if not isinstance(caption, str):
                continue
            filtered_caption = self.filter_caption(caption)
            if len(filtered_caption) <= self.max_caption_length:
                kept_captions.append(filtered_caption)
                kept_image_files.append(image_file)

        self.captions = kept_captions
        self.image_files = kept_image_files
        self.initial_number_of_captions = original_file_size
        self.current_number_of_captions = len(kept_captions)
        self.number_of_captions_removed = (original_file_size -
                                           len(kept_captions))

    def filter_caption(self, caption):
        '''
        Function that removes undesired characters, lowercase sentence,
        remove quotes, and returns sentence as a list of words.
        '''
        filtered_caption = caption.strip().lower()
        filtered_caption = filtered_caption.translate(self.CHAR_TRANSLATOR)
        # split() without argument collapses runs of whitespace, so no
        # empty-string "words" are produced where characters were deleted.
        return filtered_caption.split()

    def get_corpus_statistics(self):
        '''
        Compute word frequencies as a list of (word, count) pairs sorted
        from most to least common.
        '''
        self.word_frequencies = Counter(
            chain.from_iterable(self.captions)).most_common()
        self.initial_number_of_words = len(self.word_frequencies)

    def remove_infrequent_words(self):
        '''
        Remove words with a frequency below a threshold from the
        vocabulary (self.word_frequencies). The captions themselves are
        left untouched.
        '''
        print('Removing words with a frequency less than ',
              self.word_frequency_threshold)

        total_words = len(self.word_frequencies)
        self.word_frequencies = [
            (word, frequency) for word, frequency in self.word_frequencies
            if frequency >= self.word_frequency_threshold]
        self.initial_number_of_words = total_words
        self.current_number_of_words = len(self.word_frequencies)
        self.number_of_words_removed = (total_words -
                                        self.current_number_of_words)

    def remove_sparse_words(self):
        '''
        Alias of remove_infrequent_words, kept for the original name.
        '''
        self.remove_infrequent_words()

    def construct_dictionaries(self):
        '''
        Computes words_id, and id_words dictionaries. Ids 0, 1 and 2 are
        reserved for the PAD, BOS and EOS tags; words follow from id 3 in
        order of decreasing frequency.
        '''
        words = [word for word, _ in self.word_frequencies]
        self.word_to_id = {self.PAD: 0, self.BOS: 1, self.EOS: 2}
        self.word_to_id.update({word: word_id for word_id, word in
                                enumerate(words, 3)})

        self.id_to_word = {word_id: word for word, word_id in
                           self.word_to_id.items()}

    def _build_feature_extractor(self):
        '''
        Build the CNN selected by extractor_model.

        Returns (model, preprocess_input, feature_size, target_size).
        Raises ValueError for an unknown extractor_model.
        '''
        # Keras is imported lazily so the class is usable without it when
        # no features are extracted.
        from keras.models import Model

        if self.extractor_model == 'vgg16':
            from keras.applications.vgg16 import preprocess_input
            from keras.applications import VGG16
            base_model = VGG16(weights='imagenet')
            layer_name, feature_size, target_size = 'fc2', 4096, (224, 224)
        elif self.extractor_model == 'vgg19':
            from keras.applications.vgg19 import preprocess_input
            from keras.applications import VGG19
            base_model = VGG19(weights='imagenet')
            layer_name, feature_size, target_size = 'fc2', 4096, (224, 224)
        elif self.extractor_model == 'inception':
            from keras.applications.inception_v3 import preprocess_input
            from keras.applications import InceptionV3
            base_model = InceptionV3(weights='imagenet')
            # InceptionV3 with its top layers expects 299x299 inputs.
            # NOTE(review): the layer name 'flatten' exists only in some
            # Keras releases - confirm against the installed version.
            layer_name, feature_size, target_size = 'flatten', 2048, (299, 299)
        else:
            raise ValueError('Unknown extractor_model: %s'
                             % self.extractor_model)

        # NOTE(review): `input=`/`output=` are the keyword names the
        # original code used; newer Keras releases spell them
        # `inputs=`/`outputs=` - confirm against the installed version.
        model = Model(input=base_model.input,
                      output=base_model.get_layer(layer_name).output)
        return model, preprocess_input, feature_size, target_size

    def get_image_features(self, image_directory):
        '''
        Run every distinct image through the pre-trained CNN and store
        one feature vector per image in self.extracted_features, in the
        same order as self.image_feature_files.
        '''
        from keras.preprocessing import image

        (model, preprocess_input,
         self.IMG_FEATURES, target_size) = self._build_feature_extractor()

        self.extracted_features = []
        # sorted() makes the image order reproducible between runs.
        self.image_feature_files = sorted(set(self.image_files))
        num_images = len(self.image_feature_files)

        for image_idx, image_file in enumerate(self.image_feature_files):
            # Not sure if dir needed
            image_path = os.path.join(image_directory, image_file)
            if image_idx % 100 == 0:
                print('%.2f %% completed' %
                      round(100 * image_idx / num_images, 2))
            img = image.load_img(image_path, target_size=target_size)
            img = image.img_to_array(img)
            img = np.expand_dims(img, axis=0)  # add a leading batch axis
            img = preprocess_input(img)
            CNN_features = model.predict(img)
            self.extracted_features.append(np.squeeze(CNN_features))
        self.extracted_features = np.asarray(self.extracted_features)

    def write_image_features_to_h5(self):
        '''
        Save the features in a h5 file: one group per image name holding
        an 'image_features' dataset. An existing file is overwritten.
        '''
        num_features = len(self.image_feature_files)
        with h5py.File(self.extractor_model +
                       '_image_name_to_feature.h5', 'w') as dataset_file:
            for image_idx, image_file in enumerate(self.image_feature_files):
                file_id = dataset_file.create_group(image_file)
                image_data = file_id.create_dataset(
                    'image_features', (self.IMG_FEATURES,), dtype='float32')
                image_data[:] = self.extracted_features[image_idx]

                if image_idx % 100 == 0:
                    print('Number of image processed: ', image_idx)
                    print('Number of image remaining: ',
                          num_features - image_idx)

    def write_image_feature_files(self):
        '''
        Save the list of image names that have features in a pickle file.
        '''
        with open('image_feature_files.p', 'wb') as pickle_file:
            pickle.dump(self.image_feature_files, pickle_file)

    def write_data(self):
        '''
        Save the filtered captions in 'complete_data.txt', one
        "image<separator>caption" row per caption below a header row.
        '''
        with open('complete_data.txt', 'w') as data_file:
            data_file.write('image_names%scaption\n' % self.separator)
            for image_file, caption in zip(self.image_files, self.captions):
                data_file.write('%s%s%s\n' % (image_file, self.separator,
                                              ' '.join(caption)))

    def write_dictionaries(self):
        '''
        Save dictionaries in pickle files
        '''
        with open('word_to_id.p', 'wb') as pickle_file:
            pickle.dump(self.word_to_id, pickle_file)
        with open('id_to_word.p', 'wb') as pickle_file:
            pickle.dump(self.id_to_word, pickle_file)

    def write_image_features(self):
        '''
        Save the extracted feature array in a pickle file.
        '''
        with open('extracted_features.p', 'wb') as pickle_file:
            pickle.dump(self.extracted_features, pickle_file)

    def write_parameters(self):
        '''
        Write the configuration and the preprocessing statistics to
        'data_parameters.log'.
        '''
        with open('data_parameters.log', 'w') as log_file:
            log_file.write('data_filename %s \n' % self.data_filename)
            log_file.write('dump_path %s \n' % self.dump_path)
            log_file.write('BOS: %s \n' % self.BOS)
            log_file.write('EOS: %s \n' % self.EOS)
            log_file.write('PAD: %s \n' % self.PAD)
            log_file.write('IMG_FEATS: %s \n' % self.IMG_FEATURES)
            log_file.write('word_frequency_threshold: %s \n'
                           % self.word_frequency_threshold)
            log_file.write('max_caption_length: %s \n'
                           % self.max_caption_length)
            log_file.write('initial_data_size: %s \n'
                           % self.initial_number_of_captions)
            log_file.write('captions_larger_than_threshold: %s \n'
                           % self.number_of_captions_removed)
            log_file.write('current_data_size: %s \n'
                           % self.current_number_of_captions)
            log_file.write('initial_word_size: %s \n'
                           % self.initial_number_of_words)
            log_file.write('words_removed_by_frequency_threshold %s \n'
                           % self.number_of_words_removed)
            log_file.write('current_word_size: %s \n'
                           % self.current_number_of_words)
            log_file.write('cnn_extractor: %s \n' % self.extractor_model)
            log_file.write('elapsed_time: %s' % self.elapsed_time)

    def split_data(self, train_porcentage=.80):
        '''
        Split the rows of the data file into test, validation and
        training files written to the working directory. The first
        (1 - train_porcentage) cut gives the test set; the remainder is
        cut again with the same ratio into training and validation.

        The input is self.data_filename resolved against the directory
        the manager was created in, so it is still found after
        move_to_path() changed the working directory.
        '''
        source_path = os.path.join(self.current_directory, self.data_filename)
        complete_data = pd.read_table(source_path, sep=self.separator)
        data_size = complete_data.shape[0]
        training_size = int(data_size * train_porcentage)
        complete_training_data = complete_data[0:training_size]
        test_data = complete_data[training_size:]
        test_data.to_csv('test_data.txt', sep=self.separator, index=False)
        # splitting between validation and training
        training_size = int(training_size * train_porcentage)
        validation_data = complete_training_data[training_size:]
        training_data = complete_training_data[0:training_size]
        validation_data.to_csv('validation_data.txt', sep=self.separator,
                               index=False)
        training_data.to_csv('training_data.txt', sep=self.separator,
                             index=False)

In [140]:
# Scratch cell: IPython help query (`?` suffix) for str.maketrans.
str.maketrans?

#### Create dataset file

In [None]:
# Builds the "<image path>*<caption>" file consumed by the data manager:
# one row per image that has an annotation with a description.
data_file = 'iaprtc12_images_annotations.txt'
image_root = 'iaprtc12/images/'
annotation_root = 'iaprtc12/annotations_complete_eng/'


def list_files(root_dir):
    """Return the sorted paths of all files found below root_dir."""
    return sorted(os.path.join(root, name)
                  for root, _dirs, files in os.walk(root_dir)
                  for name in files)


def relative_stem(path, root_dir):
    """Return path relative to root_dir with its extension removed.

    os.path.splitext is used because str.strip('.jpg') strips a set of
    characters from both ends, not a suffix.
    """
    return os.path.splitext(os.path.relpath(path, root_dir))[0]


def extract_caption(line):
    """Return the text between '<DESCRIPTION>' and the next '</' of line.

    When the closing tag is missing the rest of the line is used. ';' and
    the column separator '*' are removed and surrounding whitespace is
    trimmed.
    """
    _before, _tag, text = line.partition('<DESCRIPTION>')
    closing = text.find('</')
    # Some annotations do not follow the same format :@
    if closing != -1:
        text = text[:closing]
    return text.replace(';', '').replace('*', '').strip()


# Get image filenames
images = list_files(image_root)

# Get annotations filenames
annotations = list_files(annotation_root)

# Pair images with annotations by their relative name rather than by list
# position, so one missing or extra file cannot shift every later caption.
annotation_by_stem = {relative_stem(path, annotation_root): path
                      for path in annotations}

with open(data_file, 'w') as w_f:
    # Header row: the file is read with pandas, which takes the first
    # line as column names.
    w_f.write('image_names*caption\n')
    for image in images:
        stem = relative_stem(image, image_root)
        annotation = annotation_by_stem.get(stem)
        if annotation is None:
            print('image ', stem, ' has no matching annotation, skipped')
            continue
        # Get caption: the first line carrying a description tag
        caption = None
        with open(annotation, encoding='latin-1') as f:
            for line in f:
                if '<DESCRIPTION>' in line:
                    caption = extract_caption(line)
                    break
        if not caption:
            # Never fall back to the caption of the previous image.
            print('annotation ', annotation, ' has no description, skipped')
            continue
        # Create output line
        w_f.write(image + '*' + caption + '\n')
    

### Generator

### Model

### Training