### ProjF3 - Baseline Model

Use this document as a template to provide the evaluation of your baseline model. You are welcome to go in as much depth as needed.

Make sure you keep the sections specified in this template, but you are welcome to add more cells with your code or explanation as needed.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from collections import Counter
from PIL import Image
from keras.preprocessing.image import load_img, img_to_array
from IPython.display import display
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
import string
import cv2
import random
import pickle
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import load_img
from keras.applications.vgg16 import VGG16
from keras.models import Model
from keras.applications.vgg16 import preprocess_input
import re
import warnings
warnings.filterwarnings('ignore')
random.seed(100)
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from numpy import array

from tensorflow import keras


from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical, plot_model
from keras.layers import Input, Dense, Embedding, Dropout, add, LSTM
from keras.callbacks import ModelCheckpoint
from pickle import load

from numpy import argmax
from pickle import load
from keras.models import load_model
from pickle import dump
import pickle

### Load and Prepare Data

This should illustrate your code for loading the dataset and the split into training, validation and testing. You can add steps like pre-processing if needed.

In [None]:
# Mount Google Drive at /content/drive; the Drive paths used in later cells live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Google Drive locations used by this notebook (Drive must be mounted first).
# images_path is the directory image file names are joined onto during feature extraction.
images_path = '/content/drive/MyDrive/Flicker8k_Dataset'
# NOTE(review): captions_path and working_dir are not referenced in the cells visible here — confirm they are still needed.
captions_path = '/content/drive/MyDrive/Flickr8k_text/Flickr8k.token.txt'
working_dir = '/content/drive/MyDrive/CNN_RNN_Dataset'

In [None]:
# Load the caption table; each row pairs an image filename with one caption.
# NOTE(review): the 'Unnamed: 0' and 'index' columns in the preview look like row
# indices left over from an earlier export — confirm and consider dropping them.
df_dataset = pd.read_csv('/content/drive/MyDrive/image_caption_data.csv')
df_dataset.head()

Unnamed: 0,index,filename,caption
0,0,1000268201_693b08cb0e.jpg,a child in a pink dress is climbing up a set o...
1,1,1000268201_693b08cb0e.jpg,a girl going into a wooden building .
2,2,1000268201_693b08cb0e.jpg,a little girl climbing into a wooden playhouse .
3,3,1000268201_693b08cb0e.jpg,a little girl climbing the stairs to her playh...
4,4,1000268201_693b08cb0e.jpg,a little girl in a pink dress going into a woo...


In [None]:
# Distinct image file names, restricted to rows whose filename ends in '.jpg'.
jpg_mask = df_dataset['filename'].str.endswith('.jpg')
dataset_filenames = df_dataset.loc[jpg_mask, 'filename'].unique()
len(dataset_filenames)

8091

Train/Val/Test data split

In [None]:
from sklearn.model_selection import train_test_split

# Split at the image level: first hold out 20% of the file names, then cut that
# hold-out set in half to obtain the validation and test file names.
train_filenames, holdout_filenames = train_test_split(
    dataset_filenames, test_size=0.2, random_state=42
)
val_filenames, test_filenames = train_test_split(
    holdout_filenames, test_size=0.5, random_state=42
)

In [None]:
# Keep every caption row whose image belongs to the corresponding filename split.
filename_column = df_dataset['filename']
df_train_dataset = df_dataset[filename_column.isin(train_filenames)]
df_val_dataset = df_dataset[filename_column.isin(val_filenames)]
df_test_dataset = df_dataset[filename_column.isin(test_filenames)]

In [None]:
# Number of images in the train, validation and test splits (one per line).
for split_filenames in (train_filenames, val_filenames, test_filenames):
  print(len(split_filenames))

6472
809
810


In [None]:
# Save each split's caption rows as CSV without the DataFrame index.
# NOTE(review): these are relative paths, so the files land in the current working
# directory rather than on Drive like the other artifacts — confirm that is intended.
df_train_dataset.to_csv('image_caption_train_data.csv', index=False)
df_val_dataset.to_csv('image_caption_val_data.csv', index=False)
df_test_dataset.to_csv('image_caption_test_data.csv', index=False)

# Text (Caption) Preprocessing Methods
1. Remove punctuation
2. Collapse runs of multiple whitespace characters into a single space
3. Convert all words to lowercase
4. Remove tokens that contain non-alphabetic characters (e.g. numbers)
5. Remove single-character words
6. Lemmatize the tokens

In [None]:
def read_data_file(filepath):
  """Return the full contents of a UTF-8 text file as a single string.

  Args:
    filepath: path of the text file to read.

  Returns:
    The decoded file contents.

  Raises:
    OSError: if the file cannot be opened.
    UnicodeDecodeError: if the contents are not valid UTF-8.
  """
  # The context manager closes the handle even when read() raises.
  with open(filepath, 'r', encoding='utf8') as file:
    return file.read()

In [None]:
import nltk
# Download the NLTK data packages used by the text-cleaning helpers
# (presumably 'punkt' backs word_tokenize and 'wordnet' backs WordNetLemmatizer — TODO confirm).
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
def remove_punctuation(text_original):
  """Return ``text_original`` with every character in ``string.punctuation`` deleted.

  Characters are removed, not replaced, so 'a-b' becomes 'ab'.
  """
  deletion_table = str.maketrans('', '', string.punctuation)
  return text_original.translate(deletion_table)

def convert_to_lowercase(text):
  """Return a lowercased copy of ``text``."""
  lowered = text.lower()
  return lowered

def remove_multiple_spaces(text):
  """Replace each run of two or more whitespace characters with one space.

  A lone whitespace character (including a single tab or newline) is left as is.
  """
  whitespace_run = re.compile(r'\s{2,}')
  return whitespace_run.sub(' ', text)

def word_tokens(text):
  """Split ``text`` into tokens using NLTK's ``word_tokenize``."""
  tokens = word_tokenize(text)
  return tokens

def remove_numeric(tokens):
  """Return only the tokens made up entirely of alphabetic characters.

  This drops pure numbers and also any token that mixes letters with digits
  or other non-alphabetic characters.
  """
  alphabetic = []
  for token in tokens:
    if token.isalpha():
      alphabetic.append(token)
  return alphabetic

def removing_single_char(tokens):
  """Return the tokens that are longer than one character."""
  kept = []
  for token in tokens:
    if len(token) <= 1:
      continue
    kept.append(token)
  return kept

def lemmatize_text(tokens):
  """Lemmatize each token with ``WordNetLemmatizer`` and join them with single spaces.

  Takes a sequence of tokens and returns one string.
  """
  wordnet = WordNetLemmatizer()
  lemmas = []
  for token in tokens:
    lemmas.append(wordnet.lemmatize(token))
  return " ".join(lemmas)

def text_clean(text_original):
  """Clean one caption and return it as a single space-joined string.

  The stages run in this order: strip punctuation, lowercase, collapse
  whitespace runs, tokenize, drop non-alphabetic tokens, drop one-character
  tokens, lemmatize and re-join.
  """
  stages = (
      remove_punctuation,
      convert_to_lowercase,
      remove_multiple_spaces,
      word_tokens,
      remove_numeric,
      removing_single_char,
      lemmatize_text,
  )
  value = text_original
  for stage in stages:
    value = stage(value)
  return value

In [None]:
def image_caption_mapping(dataframe):
  """Group cleaned captions by image file name.

  Rows are read from the ``caption`` and ``filename`` columns. Rows whose file
  name does not end in '.jpg' are ignored. Every kept caption is passed through
  ``text_clean`` and wrapped as 'startseq <caption> endseq'.

  Returns:
    dict mapping image file name -> list of wrapped captions, in row order.
  """
  captions_by_image = {}

  for _, record in dataframe.iterrows():
    raw_caption, file_name = record['caption'], record['filename']
    if not file_name.endswith('.jpg'):
      continue
    wrapped = 'startseq ' + text_clean(raw_caption) + ' endseq'
    captions_by_image.setdefault(file_name, []).append(wrapped)

  return captions_by_image


In [None]:
# Map each training image file name to its cleaned, startseq/endseq-wrapped captions.
train_img_caption_clean = image_caption_mapping(df_train_dataset)

In [None]:
# Token statistics over the raw (uncleaned) training captions.
all_words = [
    word
    for caption in df_train_dataset['caption']
    for word in word_tokenize(caption)
]

unique_words = set(all_words)
# Show a small sorted sample only; printing the whole set (thousands of tokens)
# floods the cell output.
print(sorted(unique_words)[:20])



In [None]:
# Total token count, then distinct token count, one per line.
print(len(all_words), len(unique_words), sep='\n')

381544
8095


In [None]:
def get_all_captions(datamap):
  """Flatten a {key: [captions]} mapping into one list of captions.

  Captions keep the mapping's iteration order, and within each key the order of
  that key's list.
  """
  return [caption for captions in datamap.values() for caption in captions]

In [None]:
# Flatten the per-image caption lists into a single list of training captions.
all_train_captions = get_all_captions(train_img_caption_clean)

In [None]:
# Preview the first five cleaned training captions.
all_train_captions[:5]

['startseq child in pink dress is climbing up set of stair in an entry way endseq',
 'startseq girl going into wooden building endseq',
 'startseq little girl climbing into wooden playhouse endseq',
 'startseq little girl climbing the stair to her playhouse endseq',
 'startseq little girl in pink dress going into wooden cabin endseq']

In [None]:
# Fit the Keras Tokenizer on the training captions only, so the vocabulary is
# built from the training split alone.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_train_captions)
# One more than the number of distinct words; presumably the extra slot is for
# index 0, which Keras reserves and never assigns to a word — TODO confirm.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Length of the longest training caption in whitespace-separated tokens
# (the startseq/endseq markers are counted).
caption_lengths = [len(caption.split()) for caption in all_train_captions]
max_length = max(caption_lengths)

In [None]:
# Longest caption length, then vocabulary size, one per line.
for value in (max_length, vocab_size):
  print(value)

34
6867


In [None]:
# Cache the cleaned training caption map on Drive.
# NOTE(review): despite the file name, this stores caption strings, not integer sequences.
with open('/content/drive/MyDrive/train_data_sequences.pkl', 'wb') as f:
    pickle.dump(train_img_caption_clean, f)

In [None]:
# Build the cleaned caption map for the validation split and cache it on Drive.
val_img_caption_clean = image_caption_mapping(df_val_dataset)

with open('/content/drive/MyDrive/val_data_sequences.pkl', 'wb') as f:
    pickle.dump(val_img_caption_clean, f)

In [None]:
# Build the cleaned caption map for the test split and cache it on Drive.
test_img_caption_clean = image_caption_mapping(df_test_dataset)

with open('/content/drive/MyDrive/test_data_sequences.pkl', 'wb') as f:
    pickle.dump(test_img_caption_clean, f)

Image feature extraction

In [None]:
# Load VGG16 with ImageNet weights, then build a second model that shares its
# inputs but stops at the second-to-last layer, so it returns that layer's
# activations instead of the final layer's output.
# NOTE(review): presumably layers[-2] is the last fully connected layer before
# the classifier — confirm against the summary below.
cnn_model = VGG16(weights='imagenet')
cnn_model_new = Model(inputs=cnn_model.inputs, outputs=cnn_model.layers[-2].output)
cnn_model_new.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     14758

In [None]:
def extract_img_features(img_list):
  """Run each image in ``img_list`` through ``cnn_model_new`` and collect the outputs.

  Args:
    img_list: iterable of image file names, resolved relative to the
      module-level ``images_path`` directory.

  Returns:
    dict mapping each file name found on disk to the array returned by
    ``cnn_model_new.predict`` for that single image (the batch axis of size 1
    is kept). File names that do not exist under ``images_path`` are left out
    of the result and reported in a printed summary.
  """
  img_features = dict()
  missing = []
  target_size = (224, 224)  # matches the (None, 224, 224, 3) input in the model summary
  for img in img_list:
    img_path = os.path.join(images_path, img)
    if not os.path.exists(img_path):
      # Still best-effort (skip the file), but remember it so the gap is visible.
      missing.append(img)
      continue
    image = img_to_array(load_img(img_path, target_size=target_size))
    # Add a leading batch axis: the model is called on one image at a time.
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    image = preprocess_input(image)
    # verbose=0 stops Keras printing a progress bar for every single image.
    img_features[img] = cnn_model_new.predict(image, verbose=0)
  if missing:
    # print rather than warnings.warn: this notebook silences warnings globally.
    print('Skipped %d missing image(s), e.g. %s' % (len(missing), missing[:5]))
  return img_features

In [None]:
# Extract CNN features for every training image (one predict call per image).
train_image_features = extract_img_features(train_filenames)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [None]:
# Cache the training image features on Drive.
with open('/content/drive/MyDrive/train_image_features_vgg16.pkl', 'wb') as f:
    pickle.dump(train_image_features, f)

In [None]:
# Extract CNN features for the validation images and cache them on Drive.
val_image_features = extract_img_features(val_filenames)
with open('/content/drive/MyDrive/val_image_features_vgg16.pkl', 'wb') as f:
    pickle.dump(val_image_features, f)



In [None]:
# Extract CNN features for the test images and cache them on Drive.
test_image_features = extract_img_features(test_filenames)
with open('/content/drive/MyDrive/test_image_features_vgg16.pkl', 'wb') as f:
    pickle.dump(test_image_features, f)

