In [6]:
# Requires Python 3.7 & Tensorflow version 1.13.2
#!pip install tensorflow==1.13.2
#############################################################
##
##
## Download Model & photos and place the model in "content" and photos in "photos" folders
##                  (Link below)
##
##############################################################
#Model: https://mega.nz/file/TkMkyYgC#NpL8WcKHMsYEMf-QCikFuIKk3A7_061KbXuziCraPZs
#COCO photos: http://images.cocodataset.org/zips/train2017.zip

In [7]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import os
from os import listdir, path

import tensorflow as tf
import cv2
import numpy as np
import matplotlib.pyplot as plt
from IPython import display

import PIL.Image
import PIL.ImageOps
import PIL.ImageFont
import PIL.ImageDraw
import textwrap
from io import BytesIO

from im2txt import configuration
from im2txt import inference_wrapper
from im2txt.inference_utils import caption_generator
from im2txt.inference_utils import vocabulary

In [8]:
import os
import glob


def remove_files(paths):
    """Delete every file in ``paths`` and return the paths actually removed.

    A path that cannot be deleted (missing, a directory, permission denied)
    is reported together with the underlying error and skipped. Only
    ``OSError`` is handled, so Ctrl-C and programming errors still surface
    instead of being silently swallowed.
    """
    removed = []
    for file_path in paths:
        try:
            os.remove(file_path)
        except OSError as err:
            print("Error while deleting file : ", file_path, "-", err)
        else:
            removed.append(file_path)
    return removed


# Collect the files that share the checkpoint prefix but carry an extra
# suffix ("model.ckpt-2000000.<something>"). The file named exactly
# "model.ckpt-2000000" does not match the pattern and is left in place.
fileList = glob.glob('./content/model.ckpt-2000000.*')
# Remove them so the checkpoint conversion further down starts from a clean state.
remove_files(fileList)

# Emit INFO-level TensorFlow log messages while the model is being set up.
tf.logging.set_verbosity(tf.logging.INFO)

# Which trained model to use --> currently "2". The checkpoint prefix and the
# word-counts file name are both derived from this number.
model_number = "2"
model_path = "./content/model.ckpt-" + model_number + "000000"   # checkpoint path
vocab_path = "./content/word_counts" + model_number + ".txt"     # word_counts file path

# Assemble the inference model inside its own graph, then finalize the graph
# so nothing can be added to it afterwards.
g = tf.Graph()
with g.as_default():
    model = inference_wrapper.InferenceWrapper()
    model_config = configuration.ModelConfig()
    # restore_fn is called later with a session to load the checkpoint weights.
    restore_fn = model.build_graph_from_config(model_config, model_path)
g.finalize()

# Build the vocabulary object from the word-counts file.
vocab = vocabulary.Vocabulary(vocab_path)

#######################################################
### Rewrite the checkpoint so its LSTM variable names match the ones the
### inference graph expects.
### If your TensorFlow version is older than 1.13.2, check that the variable
### names in `vars_to_rename` match the names used by that version.

# Convert the very checkpoint the inference graph was configured with, so that
# changing `model_number` cannot leave the two pointing at different files.
# The converted checkpoint is written back under the same prefix.
OLD_CHECKPOINT_FILE = model_path
NEW_CHECKPOINT_FILE = model_path

# Mapping: variable name stored in the old checkpoint -> name to save it under.
vars_to_rename = {
    "lstm/BasicLSTMCell/Linear/Matrix": "lstm/basic_lstm_cell/kernel",
    "lstm/BasicLSTMCell/Linear/Bias": "lstm/basic_lstm_cell/bias",
}

# Build the temporary variables in a dedicated graph. Creating them in the
# default graph would add another full copy of the weights every time this
# cell is re-run, and the initializer would then re-initialize all of them.
conversion_graph = tf.Graph()
with conversion_graph.as_default():
    new_checkpoint_vars = {}
    reader = tf.train.NewCheckpointReader(OLD_CHECKPOINT_FILE)
    for old_name in reader.get_variable_to_shape_map():
        # Variables without an entry in the mapping keep their original name.
        new_name = vars_to_rename.get(old_name, old_name)
        new_checkpoint_vars[new_name] = tf.Variable(reader.get_tensor(old_name))

    init = tf.global_variables_initializer()
    saver = tf.train.Saver(new_checkpoint_vars)

# Initialize the variables with the values read above and save them under
# their (possibly renamed) keys.
with tf.Session(graph=conversion_graph) as conversion_sess:
    conversion_sess.run(init)
    saver.save(conversion_sess, NEW_CHECKPOINT_FILE)

#######################################################

# Session bound to the inference graph; it stays open because the captioning
# code uses it.
sess = tf.Session(graph=g)
# Load the model from checkpoint.
restore_fn(sess)

# Prepare the caption generator. Only the beam size is set explicitly here;
# the remaining beam search parameters keep their defaults. See
# caption_generator.py for a description of the available parameters.
generator = caption_generator.CaptionGenerator(model, vocab, beam_size=5)

INFO:tensorflow:Building model.
INFO:tensorflow:Initializing vocabulary from file: ./content/word_counts2.txt
INFO:tensorflow:Created vocabulary with 11520 words
INFO:tensorflow:Loading model from checkpoint: ./content/model.ckpt-2000000
INFO:tensorflow:Restoring parameters from ./content/model.ckpt-2000000
INFO:tensorflow:Successfully loaded checkpoint: model.ckpt-2000000


In [9]:
image_path = "./photos/"    #### provide path where image is stored
results_dir = './content/results/'   #### directory to store captions file
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

# Keep only files whose name ends with an image extension (case-insensitive).
# A substring test would also pick up names such as "photo.jpg.bak" and would
# miss "PHOTO.JPG".
filename = listdir(image_path)
filenames = [f for f in filename if f.lower().endswith(IMAGE_EXTENSIONS)]
resultsFile = input("Name of output file: ")

# Make sure the output directory exists before opening the results file.
os.makedirs(results_dir, exist_ok=True)

# The `with` block closes the results file on every exit path: normal
# completion, Ctrl-C, or an unexpected error while captioning.
with open(results_dir + resultsFile, 'w') as store:
    for file in filenames:
        try:
            # Read the image as raw bytes; the handle is closed right away.
            with open(image_path + file, 'rb') as image_file:
                image = image_file.read()
            captions = generator.beam_search(sess, image)

            # Turn each caption into words, ignoring the begin and end words.
            sentences = [
                " ".join(vocab.id_to_word(w) for w in caption.sentence[1:-1])
                for caption in captions
            ]
            # Join all captions into one lower-case line, drop the "." tokens
            # and collapse every run of spaces into a single space.
            text = " ".join(sentences).replace(".", " ").lower()
            tokens = [token for token in text.split(" ") if token]
            # The trailing space keeps the line format of existing result files.
            syntheticText = " ".join(tokens) + " " if tokens else ""

            # Strip only the final extension so dotted names stay intact
            # ("my.photo.jpg" -> "my.photo").
            image_id = os.path.splitext(file)[0]
            print(image_id + ":" + syntheticText)
            store.write(image_id + ":" + syntheticText + "\n")
        except KeyboardInterrupt:
            # Stop on Ctrl-C; captions written so far are kept.
            break

000000000009:a close up of a plate of food with broccoli a plastic container filled with different types of food a close up of a tray of food with broccoli a close up of a tray of food on a table a close up of a plate of food 
000000000025:a couple of giraffe standing next to each other a giraffe standing next to a tree in a forest a giraffe standing in the middle of a forest a couple of giraffe standing next to a tree a giraffe standing in the middle of a lush green field 
000000000030:a vase filled with flowers on top of a table a vase of flowers sitting on a table a vase filled with flowers sitting on top of a table a vase filled with flowers sitting on a table a vase of flowers sitting on a table 
000000000034:a zebra standing on top of a lush green field a zebra standing on top of a grass covered field a zebra grazing on grass in a field a zebra standing in a field of grass a zebra grazing in a field of grass 
000000000036:a woman holding an umbrella in the rain a woman is holding