<a href="https://colab.research.google.com/github/Iam-Divyesh/Image_Captioning/blob/main/COCO_Image_Captioning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install keras
!pip install tensorflow

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!unzip /content/drive/MyDrive/FlickerDataset/Flicker8k_Dataset.zip

In [8]:
from os import listdir
from pickle import dump
from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model

# extract features from each photo in the directory
def extract_features(directory):
	# load the model
	model = VGG16()
	# re-structure the model
	model.layers.pop()
	model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
	# summarize
	print(model.summary())
	# extract features from each photo
	features = dict()
	for name in listdir(directory):
		# load an image from file
		filename = directory + '/' + name
		image = load_img(filename, target_size=(224, 224))
		# convert the image pixels to a numpy array
		image = img_to_array(image)
		# reshape data for the model
		image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
		# prepare the image for the VGG model
		image = preprocess_input(image)
		# get features
		feature = model.predict(image, verbose=0)
		# get image id
		image_id = name.split('.')[0]
		# store feature
		features[image_id] = feature
		print('>%s' % name)
	return features

# extract features from all images
directory = 'Flicker8k_Dataset'
features = extract_features(directory)
print('Extracted Features: %d' % len(features))
# save to file
dump(features, open('features.pkl', 'wb'))

None


Expected: ['keras_tensor_23']
Received: inputs=Tensor(shape=(1, 224, 224, 3))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
>482882719_165722082d.jpg
>2396025708_e4a72e2558.jpg
>3420469425_2980b4cd30.jpg
>473220329_819a913bbb.jpg
>3661072592_2e693cd5a0.jpg
>3737711435_113ccd0a52.jpg
>1424775129_ffea9c13ab.jpg
>2734669176_c272b42597.jpg
>2734219983_fe86a60bf9.jpg
>2600442766_e750ec9a56.jpg
>2283966256_70317e1759.jpg
>3262760716_1e9734f5ba.jpg
>2045562030_654ddea5e5.jpg
>3683592946_262e9bfbfd.jpg
>3544233095_4bca71df1d.jpg
>371522748_dc557bcd6c.jpg
>3387661249_33e5ba0bc5.jpg
>3269380710_9161b0bd00.jpg
>3212465975_b657f40eed.jpg
>1794818900_e0ffdd268e.jpg
>873633312_a756d8b381.jpg
>502115726_927dd684d3.jpg
>3333921867_6cc7d7c73d.jpg
>464251704_b0f0c4c87a.jpg
>2363006088_b3e3aa5c0b.jpg
>3484019369_354e0b88c0.jpg
>470373679_98dceb19e7.jpg
>3247693965_845b3b4349.jpg
>2513260012_03d33305cf.jpg
>2059842472_f4fb61ea08.jpg
>3006926228_cf3c067b3e.jpg
>536721406_884ab8fece.jpg
>269361490_a22ae818bf.jpg
>3707283973_5cdaa39340.jpg
>3060969260_08f43e4f4f.jpg

In [10]:
import string

# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# extract descriptions for images
def load_descriptions(doc):
	mapping = dict()
	# process lines
	for line in doc.split('\n'):
		# split line by white space
		tokens = line.split()
		if len(line) < 2:
			continue
		# take the first token as the image id, the rest as the description
		image_id, image_desc = tokens[0], tokens[1:]
		# remove filename from image id
		image_id = image_id.split('.')[0]
		# convert description tokens back to string
		image_desc = ' '.join(image_desc)
		# create the list if needed
		if image_id not in mapping:
			mapping[image_id] = list()
		# store description
		mapping[image_id].append(image_desc)
	return mapping

def clean_descriptions(descriptions):
	# prepare translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	for key, desc_list in descriptions.items():
		for i in range(len(desc_list)):
			desc = desc_list[i]
			# tokenize
			desc = desc.split()
			# convert to lower case
			desc = [word.lower() for word in desc]
			# remove punctuation from each token
			desc = [w.translate(table) for w in desc]
			# remove hanging 's' and 'a'
			desc = [word for word in desc if len(word)>1]
			# remove tokens with numbers in them
			desc = [word for word in desc if word.isalpha()]
			# store as string
			desc_list[i] =  ' '.join(desc)

# convert the loaded descriptions into a vocabulary of words
def to_vocabulary(descriptions):
	# build a list of all description strings
	all_desc = set()
	for key in descriptions.keys():
		[all_desc.update(d.split()) for d in descriptions[key]]
	return all_desc

# save descriptions to file, one per line
def save_descriptions(descriptions, filename):
	lines = list()
	for key, desc_list in descriptions.items():
		for desc in desc_list:
			lines.append(key + ' ' + desc)
	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()

# filename = 'Flickr8k_text/Flickr8k.token.txt'
filename = 'Flickr8k_text/Flickr8k.token.txt'
# load descriptions
doc = load_doc(filename)
# parse descriptions
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))
# clean descriptions
clean_descriptions(descriptions)
# summarize vocabulary
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))
# save to file
save_descriptions(descriptions, 'descriptions.txt')

Loaded: 8092 
Vocabulary Size: 8763
