In [1]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Flatten, Dense, LSTM, Dropout, Embedding, Activation
from tensorflow.keras.layers import add, concatenate, BatchNormalization, Input
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
import matplotlib.pyplot as plt
import cv2

In [2]:
# Use the Flickr8k dataset (from Kaggle) to train a model that generates image captions.
def load_description(text):
    """Parse the raw caption text into a mapping of image id -> captions.

    Every line is split on tab characters; the first field names the image
    and the second field is its caption. The image id is the first field up
    to (not including) its first ".".

    Args:
        text: Full contents of the caption file as a single string.

    Returns:
        dict mapping each image id to the list of its captions, in the
        order they appear in ``text``.
    """
    mapping = dict()
    for line in text.split("\n"):
        token = line.split("\t")
        # Skip blank/very short lines, and also lines that carry no
        # tab-separated caption: indexing token[1] on such a line used to
        # raise IndexError.
        if len(line) < 2 or len(token) < 2:
            continue
        img_id = token[0].split(".")[0]  # name of the image
        img_des = token[1]               # description of the image
        mapping.setdefault(img_id, []).append(img_des)
    return mapping
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
token_path = "/Users/kashishmandhane/Documents/Kashish Data/LAPTOP STUFF/DJ Sanghvi College/Extra-curriculars/Hackathons/Ed-tech/Flickr8K/Flickr8k_text/Flickr8k.token.txt"
# Read through a context manager so the file handle is closed as soon as
# the contents are in memory.
with open(token_path, "r", encoding = "utf-8") as token_file:
    text = token_file.read()
# image id -> list of captions for every image in the token file
descriptions = load_description(text)
# Sanity check: show the captions parsed for one known image id.
print(descriptions["1000268201_693b08cb0e"])

['A child in a pink dress is climbing up a set of stairs in an entry way .', 'A girl going into a wooden building .', 'A little girl climbing into a wooden playhouse .', 'A little girl climbing the stairs to her playhouse .', 'A little girl in a pink dress going into a wooden cabin .']


In [None]:
# from utils.text_clean import clean_text
# for key, des_list in descriptions.items():
#     clean_text(des_list)
# # Now, the dictionary values (lists) will be cleaned
# print(descriptions["1000268201_693b08cb0e"])

In [None]:
def to_vocab(desc):
    """Return the set of distinct whitespace-separated words found in all captions of ``desc``.

    ``desc`` maps a key to a list of caption strings; the keys themselves
    are not part of the result.
    """
    return {
        word
        for captions in desc.values()
        for caption in captions
        for word in caption.split()
    }
# Set of unique words over every caption in `descriptions`.
vocab = to_vocab(descriptions)

In [None]:
import glob

In [None]:
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
images = "/Users/kashishmandhane/Documents/Kashish Data/LAPTOP STUFF/DJ Sanghvi College/Extra-curriculars/Hackathons/Ed-tech/Flickr8K/Flicker8k_Images"
# Build the pattern with os.path.join: `images` has no trailing separator,
# so plain concatenation would give ".../Flicker8k_Images*.jpg" and never
# look inside the folder.
img = glob.glob(os.path.join(images, "*.jpg")) # list of image paths in folder

train_path = "/Users/kashishmandhane/Documents/Kashish Data/LAPTOP STUFF/DJ Sanghvi College/Extra-curriculars/Hackathons/Ed-tech/Flickr8K/Flickr8k_text/Flickr_8k.trainImages.txt"
# Context manager closes the file once its contents are read.
with open(train_path, "r", encoding = "utf-8") as train_file:
    train_images = train_file.read().split("\n")
# Set copy for O(1) membership tests; `train_images` itself stays a list.
_train_names = set(train_images)
# Paths of all images whose bare file name is listed in the training split.
# basename() works whether or not `images` ends with a path separator.
train_img = [im for im in img if os.path.basename(im) in _train_names]
# Load the descriptions of the training set into a dictionary, keyed by image name.
def load_clean_descriptions(des, dataset):
    """Select the captions of the images in ``dataset`` and wrap them in sequence markers.

    Args:
        des: dict mapping an image id (no file extension) to a list of
            caption strings.
        dataset: iterable of image file names; an image is kept when
            ``<image id> + ".jpg"`` is one of its items.

    Returns:
        dict mapping every kept image id to its captions, each rewritten
        as ``"startseq <caption> endseq"``.
    """
    # Build the lookup once: set membership is O(1), whereas a list would
    # be rescanned from the start for every key in `des`.
    wanted = set(dataset)
    dataset_des = dict()
    for key, des_list in des.items():
        if key + ".jpg" in wanted:
            dataset_des[key] = ["startseq " + line + " endseq" for line in des_list]
    return dataset_des
# Captions of the images listed in `train_images`, each wrapped as "startseq ... endseq".
train_descriptions = load_clean_descriptions(descriptions, train_images) 
# Sanity check: show the wrapped captions of one image id.
print(train_descriptions['1000268201_693b08cb0e'])

In [None]:
def preprocess_img(img_path):
    """Load the image at ``img_path`` and turn it into a one-image input batch.

    The image is resized to 299 x 299 while loading, converted to an array,
    given a leading batch axis and finally passed through the InceptionV3
    ``preprocess_input`` function.
    """
    # InceptionV3 expects images of size 299 x 299 x 3
    pixels = img_to_array(load_img(img_path, target_size = (299, 299)))
    # Prepend the batch dimension
    batch = np.expand_dims(pixels, axis = 0)
    return preprocess_input(batch)
def encode(image):
    """Return the 1-D feature vector the module-level ``model`` predicts for the image at path ``image``."""
    features = model.predict(preprocess_img(image))
    # Drop the leading batch axis: reshape to a flat vector of length shape[1]
    return np.reshape(features, (features.shape[1]))
  
base_model = InceptionV3(weights = "imagenet")
# Feature extractor: same input as InceptionV3, but its output is taken
# from the second-to-last layer instead of the final one.
model = Model(base_model.input, base_model.layers[-2].output)
# Run encode() on every training image and store the feature vectors in a
# dict keyed by the image's file name.
encoding_train = {}
# A dedicated loop variable keeps the `img` path list from the earlier cell
# intact, so that cell can still be re-run after this one.
for img_path in train_img:
    # basename() yields "<name>.jpg" whether or not `images` ends with a
    # path separator; slicing by len(images) would keep a leading "/".
    encoding_train[os.path.basename(img_path)] = encode(img_path)

In [None]:
# Flat list of every training caption (already wrapped in startseq/endseq)
all_train_captions = [
    caption
    for captions in train_descriptions.values()
    for caption in captions
]
# Consider only words which occur at least `threshold` times.
# NOTE(review): `vocab` is rebound below from the earlier word set to a
# filtered list; `vocabulary` keeps whatever `vocab` referred to when this
# cell started, so it differs if the cell is run a second time.
vocabulary = vocab
threshold = 10 # you can change this value according to your need
word_counts = {}
for cap in all_train_captions:
    # split() rather than split(' '): repeated or trailing spaces must not
    # be counted as an empty-string "word", and this matches how tokens are
    # counted for max_length below.
    for word in cap.split():
        word_counts[word] = word_counts.get(word, 0) + 1
vocab = [word for word in word_counts if word_counts[word] >= threshold]
# Word <-> integer mappings. Indices start at 1, so 0 is never assigned to
# a word (presumably reserved for padding; confirm against the model cells).
ixtoword = {}
wordtoix = {}
ix = 1
for word in vocab:
    wordtoix[word] = ix
    ixtoword[ix] = word
    ix += 1
# Length, in whitespace-separated tokens, of the longest training caption
max_length = max(len(des.split()) for des in all_train_captions)
max_length