# Loading the Image dataset

# GOAL
## Prepare MY annotation file in a similar format as the coco captioning file (coco_karpathy_train.json), and create a dataset following coco_karpathy_dataset.py.

In [63]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.image as mpimg
import os
import re
from sklearn.model_selection import train_test_split

from PIL import Image
import json

In [2]:
pip install --upgrade pip

Collecting pip
  Downloading pip-22.1.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.0.4
    Uninstalling pip-22.0.4:
      Successfully uninstalled pip-22.0.4
Successfully installed pip-22.1.1
Note: you may need to restart the kernel to use updated packages.


In [64]:
# To read all the images
def load_images(folder):
    images = []
    image_id = []
    for filename in os.listdir(folder):
        img = os.path.join(folder, filename)
        if img is not None:
            images.append(img)
            # Add the name of image as well while removing any .jpg
            image_id.append(filename[:len(filename)-4])
    
    return images, image_id

In [65]:
images, filenames = load_images("/Users/michaelmbajwa/Repos/BLIP/screenshots")

In [66]:
print("There are", len(images), "image files.")

There are 1460 image files.


# Loading the image descriptions dataset
### The dataset was manually cleaned. Wrong captions were manually removed.

In [67]:
# The above noisy data were gotten by manually studying the dataset.
noisy_captions = [
    'A search engine.',
    'An app with some questions.',
    'This is a screenshot taken by a chat.',
    'its something like news page.',
    'This should be a screenshot taken by a game.',
    'The text doesnt match whats happening on screen or give any information on what the app does or how to use it.',
    'This looks like the log on screen to some sort of device or app. Not sure whether this 100% the login screen for a computer or not.',
    'tells me what i need to do.',
    'Traveling app.',
    'It shows economic news.',
    'A Blog.',
    'Another APP in non English so I have no idea what is going on.',
    'The description appears to describe the app accurately.  I can see tv show icons to click on and presumably view.',
    'very simple home page.',
    'simple and easy to understand startup page.',
    'dont understand the language.',
    'the app is for cosmetics.',
    'It shows an online shop.',
    'Seems to be a screenshot of a media news website.',
    'a screenshot of an Asian app selling Asian products.',
    'This app turns light the screen when the screenshot has been taken.',
    'A screenshot of an apps ;About; tab.',
    'its the main screen for a news app.',
    'Settings of certain app.',
    'Its pretty easy market app.',
    'Personal settings details.',
    'it looks like settings.',
    'screenshot of a travel app.',
    'The app for searching new home.',
    'Translation App.',
    'the screenshot looks like a social media app.',
    'screenshot of an app where you can download and read books.',
    'Screenshot looks to be a live webcam of a city. Description doesnt identify this and does not go into any details.',
    'I dont understand how the description thinks this is a news screen. I have no idea how to use the app based on the description.',
    'The screenshot shows that the app is some kind of game but it is very unclear and gives no information about it. It doesnt show you how to use it.',
    'This is clear that it is a screenshot of a mobile phone.',
    'its the main screen for a sports news app.',
    'a screenshot of an app on a phone.',
    'The design is nice and looks to show what is in the app.',
    'This is informative in what the screenshot shows and also how to gain access to the app.',
    'Social media app.',
    'screenshot of some game.',
    'The screenshot is very clear and explains what the app does and is easy to understand.',
    'the app screenshot really  looks like it hard to navigate with so little information.',
    'This may be a screenshot taken by an e-commerce website.',
    'This screenshot should be taken by a sport news app.',
    'The screenshot shows a menu on an app but this does not explain what the app is about or how you can use it.',
    'It highlights the wrong information. It didnt cativate my attention.',
    'Streaming service app.',
    'Seems like a study organization app.',
    'That screen looks like usual dating app.',
    'An app for watching TV shows.',
    'In this screenshot you clearly know to do.',
    'It depicts various teams.',
    'standard logon form.',
    'A Clothing app.',
    'its just a speaker.',
    'The app looks like it could be a doujinshi app for anime.',
    'Screenshot of a news app.',
    'It literally explains all you have to know.',
    'it shows some shopping site/app.',
    'This screenshot should be taken by an e-commerce website or app, showing different options.',
    'The description describes a settings app which from the picture, appears to be the case.',
    'The content describes what the fairs would be in relation to booking a trip, but offers little in the way of information on how to use the app.',
    'Different tabs of businesses are shown in the screenshot but because the description is ;Caption not found,; its difficult to interpret the purpose of the app.',
    'News app.',
    'Sign in screenshot for Nest App.',
    'It shows you car to buy.',
    'App with input screens that you hew to fill out.',
    'Interface to recharge phone.',
    'An app to sign up for the gym.',
    'The description tells you what the app will do, but not how to use it. It does look easy to use.',
    'Its a website that sells clothing. Its hard to tell if the icons on the left just categorize the products, or is indeed some sort of tutorial.',
    'I know the screenshot is taken from a smart phone but the screenshot doesnt tell me anything about the app or how I could use it.',
    'Its shows question and you have answers down.',
    'a setting menu from a phone.',
    'Its actually a news app, but it more like a music/fashion news app.',
    'list of possible push types.',
    'select a pack for gta but cant really see.',
    'Shows graphics of game.',
    'Its an app for multiple food places to order from.',
    'it looks like a well thought out login page.',
    'regular app login with background image.',
    'doge meme maker.',
    'An app designed to learn video editing and other similar technologies.',
    'Game for children.',
    'its a logging menu.',
    'The screenshot is clear which categories you wish to select, the description is clear.',
    'It is of an app that helps you find places to rent.',
    'The screenshot is of a house renting app.',
    'Wallet app screenshot.',
    'The body workout application.',
    'Mayo Clinic, LogIn interface.',
    'There is some sports app.',
    'Its very good app to check your weight. Its easy to use and to understand.',
    'It describes the elements on the screen, but not whats theyre about.',
    'A list .',
    'Just a screenshot of a measuring app.',
    'Its a photoshopped screenshot of a phone in a screenshot of an app that says Advanced call log with location.',
    'It is a screen shot of a cell phone like the description says. The written description doesnt explain how to use the app but the photo shows me where to enter my username and password and how to log in.',
    'The description is relatively sparse and uninformative. The screenshot content is very clear and it is obvious what action needs to be taken to move forward with the page/app process.',
    'Pop up screen to help identify the location of a VIN. The description appears to explain what the screenshot is showing but no actions. Terminology is used that i do not recognise.',
    'Looks like stickers, no description on how to use this. Just pictures or something I can interact with? It talks about the categories but doesnt really help me understand what to do/use.',
    'The description is clear and invites you to click one of the other tabs along the top of the screen. There might be more detailed information with regard to what to do next.',
    'We know its a screenshot of a cell phone. But we are supposed to discuss what its about! This is for trying to find a car you would like.',
    'It is a app that suply us with all type off food needs that we can have, it has a interesting layout and a really good original idea on the bottom showing all the recent places we have viewed.',
    'page from where the user is redirected to his internet provider to log in.',
    'Seems to be partially in a different currency/language, dont think its really accurate, unless this is a user profile. Seems to be recharging not really a profile screen, no text button in middle, only recharge, it shows info but idk if its about user.',
    'I think this app will redirect to log in website, this also can be canceled.',
    'Again, no use describing the app in a foreign language when the user is English. The screenshot looks relatively clear to use and well describes the property on offer. Allowing you to check availability at the bottom is a bonus.',
    'It is a screenshot of the definitions.',
     'Contents: The screenshot doesnt show much information to know what the app is for. Actions:The screenshot absolutely does not help in understanding what the app is for.',
    'The description appears to be in Latin, which, I do not understand.',
    'There are no photos, only text. This is something to do with locations.',
    'It shows your favorite ads.',
    'It tells you how to log in, but not what it does.',
    'It shows the favourite ads.',
    'Brief explanation. Provides no detail to help the user.',
    'Music Charts.',
    'a screenshot of a facebook app.',
    'It is informative screen of some app.',
    'A post by the user ;Broken; posted yesterday.',
    'The description just says that its a screenshot. Theres no instructions.',
    'It is profile screen of some product.',
    'Its an editor app but it is not very intuitive or easy.',
    'It looks like you put your postcode in and search, but there is no description to tell you this.',
    'List of Country names in their native language.',
    'This is something to do with areas, such as Dubai.',
    'It shows lots of different language option for you to select.',
    'an app showing plenty country.',
    'It describes how talkative a person is.',
    'The description reads ;Caption not found;. There is no relation to the app screenshot.',
    'The large text on the left of the image is the topic in the center. The text below provides information on this.',
    'Simple and easy to understand description that tells you exactly what the screenshot shows.',
    'The description appears to be describing what I see in the picture, a music app.',
    'The content looks very clear and the description, although not in the style of English I would expect, is relatively easy to understand.',
    'A screenshot of the Hoodle app.',
    'A skin of a reddit browsing app.',
    'n/a, this screenshot is unknown makes no sence.',
    'It is accurfate up to a point. The only thing it does not help me understand is the part where you can click the image.',
    'This is some news app, it got lots of advertisements and its not usful.',
    'The screenshot shows a bit about what the app does but does not give lots of information and doesnt explain how to use it.',
    'Theres something of information app.',
    'Cellphone camera, all black.',
    'looks like a broken web page.',
    'I partially agree with the description as screen shows a list of elements, typically arranged in rows, but no large text button component ubicated at the top part of the screen.',
    'A list of stories in natural order. I do not know what natural order is? How would I change the order if I wanted to do so? The description is not good. I have no idea what it means.',
    'The screenshot is described well but there is not any additional information on how to use the app.',
    'The description only says ;a screenshot of a cellphone;, no description of the app.',
    'You can see some rows of Arabian which i cant understand and bottom there is an add.',
    'Probably the screenshot of cell phone.',
    'Part of a tutorial.',
    'Background isnt fits with this blue rectangles.',
    'we can have the inspiration short stories.',
    'Screenshot from messaging app, it has clear design.'
]

In [68]:
summaries = pd.read_table("human-descriptions.csv", names=["Main"], encoding="windows-1252")
summaries

Unnamed: 0,Main
0,45525\tAn app which allows you to purchase all...
1,56155\tThe app looks like a list app with diff...
2,14585\tA screenshot of a menu showing access t...
3,"686\tA word with multiple choices of meaning,,..."
4,36664\tBreastfeeding tracker for new born babi...
...,...
1807,44150\tIts some kind of language learning app....
1808,56523\tThe screen is the first page of the app...
1809,46762\tIts some kind of map app where you can ...
1810,31564\tIt&#39;s some kind of a date app. It ha...


In [69]:
# Split the Main column into two based on delimiter
summaries[["image_id", "caption"]] = summaries["Main"].str.split("\t", expand=True)
summaries.drop(columns="Main", inplace=True)
summaries

Unnamed: 0,image_id,caption
0,45525,An app which allows you to purchase all your b...
1,56155,The app looks like a list app with different f...
2,14585,A screenshot of a menu showing access to the p...
3,686,"A word with multiple choices of meaning,,,,,,"
4,36664,Breastfeeding tracker for new born babies and ...
...,...,...
1807,44150,Its some kind of language learning app. There ...
1808,56523,The screen is the first page of the app that a...
1809,46762,Its some kind of map app where you can check y...
1810,31564,It&#39;s some kind of a date app. It has purpl...


In [70]:
# Remove noisy characters from all strings. These noisy characters were observed during data cleaning
def remove_char(value):
    new_value = re.sub(r"&#39;|[¬¥]|&#34|,Äô|â€™|Â|€™|Ã§|Ã¨|&#39", "", value)
    new_value2 = re.sub(r"<br>", " ", new_value)

    i = 1
    while new_value[-(i+1)] == ",":
        i += 1

    final_value = new_value2[:-i]

    # Ensure every caption ends with an appropraite punctuation
    if final_value[-1] in [".", ":", ";"]:
        return final_value
    else:
        return final_value + "."

summaries["caption"] = summaries["caption"].map(remove_char)
summaries

Unnamed: 0,image_id,caption
0,45525,An app which allows you to purchase all your b...
1,56155,The app looks like a list app with different f...
2,14585,A screenshot of a menu showing access to the p...
3,686,A word with multiple choices of meaning.
4,36664,Breastfeeding tracker for new born babies and ...
...,...,...
1807,44150,Its some kind of language learning app. There ...
1808,56523,The screen is the first page of the app that a...
1809,46762,Its some kind of map app where you can check y...
1810,31564,Its some kind of a date app. It has purple the...


In [71]:
# Remove noisy captions
summaries = summaries[~summaries["caption"].isin(noisy_captions)].reset_index().drop(columns="index")
# summaries.to_csv("final_data.csv", index=False)
summaries

Unnamed: 0,image_id,caption
0,45525,An app which allows you to purchase all your b...
1,56155,The app looks like a list app with different f...
2,14585,A screenshot of a menu showing access to the p...
3,686,A word with multiple choices of meaning.
4,36664,Breastfeeding tracker for new born babies and ...
...,...,...
1651,44150,Its some kind of language learning app. There ...
1652,56523,The screen is the first page of the app that a...
1653,46762,Its some kind of map app where you can check y...
1654,31564,Its some kind of a date app. It has purple the...


In [72]:
summaries.nunique(axis=0)

# You will observe we have images with different types of descriptions.

image_id     269
caption     1652
dtype: int64

### We have 269 unique images with varying descriptions.
### Not all the images have descriptions. So, I will split the dataset into two.
### The training dataset will be for those whose images have descriptions while the test dataset will be the images without descriptions.

In [73]:
# Create a dataframe from the collected imagefiles and their respective names
images_dict = {"image_id": filenames, "image": images}
images_df = pd.DataFrame(images_dict)
images_df

Unnamed: 0,image_id,image
0,24090,/Users/michaelmbajwa/Repos/BLIP/screenshots/24...
1,32802,/Users/michaelmbajwa/Repos/BLIP/screenshots/32...
2,5653,/Users/michaelmbajwa/Repos/BLIP/screenshots/56...
3,2128,/Users/michaelmbajwa/Repos/BLIP/screenshots/21...
4,71708,/Users/michaelmbajwa/Repos/BLIP/screenshots/71...
...,...,...
1455,44755,/Users/michaelmbajwa/Repos/BLIP/screenshots/44...
1456,10304,/Users/michaelmbajwa/Repos/BLIP/screenshots/10...
1457,38488,/Users/michaelmbajwa/Repos/BLIP/screenshots/38...
1458,27594,/Users/michaelmbajwa/Repos/BLIP/screenshots/27...


In [74]:
# All images are unique
images_df["image_id"].nunique()

1460

### We have 1460 unique images

In [75]:
# The training dataframe contains images and corresponding captions
mainDf = pd.merge(summaries, images_df, on="image_id")

# Some captions are repeated word for word. In that case, we would like to drop duplicates
mainDf.drop_duplicates(["caption"], inplace=True)

mainDf = mainDf.reset_index()
mainDf.drop(columns="index", inplace=True)
mainDf

Unnamed: 0,image_id,caption,image
0,45525,An app which allows you to purchase all your b...,/Users/michaelmbajwa/Repos/BLIP/screenshots/45...
1,45525,the images are across the whole of the backgro...,/Users/michaelmbajwa/Repos/BLIP/screenshots/45...
2,45525,An app for beauty-related purchases such as sk...,/Users/michaelmbajwa/Repos/BLIP/screenshots/45...
3,45525,It is a screenshot of a shopping site and has ...,/Users/michaelmbajwa/Repos/BLIP/screenshots/45...
4,56155,The app looks like a list app with different f...,/Users/michaelmbajwa/Repos/BLIP/screenshots/56...
...,...,...,...
1647,501,Page to log in to Podbean using your Facebook ...,/Users/michaelmbajwa/Repos/BLIP/screenshots/50...
1648,46762,An app for making routes and how much time it ...,/Users/michaelmbajwa/Repos/BLIP/screenshots/46...
1649,46762,Its some kind of map app where you can check y...,/Users/michaelmbajwa/Repos/BLIP/screenshots/46...
1650,30081,It is a app that offer us a lot off payment me...,/Users/michaelmbajwa/Repos/BLIP/screenshots/30...


In [76]:
mainDf.caption.str.len().max()

348

## Split the datasets into test, validation and training samples

In [77]:
unique_IDs = mainDf["image_id"].unique()
temp_trainingIDs, testIDs = train_test_split(unique_IDs, test_size=0.11, random_state=42)
trainingIDs, validationIDs = train_test_split(temp_trainingIDs, test_size=0.11, random_state=42)

In [78]:
# How many captions do we have per image
cnt_num = mainDf.groupby(["image_id"]).count()
mean_num = mainDf.groupby(["image_id"]).count()["caption"].mean()
min_num = mainDf.groupby(["image_id"]).count()["caption"].min()
max_num = mainDf.groupby(["image_id"]).count()["caption"].max()

print("The mean number of captions generated per image is", mean_num.__ceil__())
print("The minimum number of captions generated per image is", min_num)
print("The maximum number of captions generated per image is", max_num)

The mean number of captions generated per image is 7
The minimum number of captions generated per image is 2
The maximum number of captions generated per image is 15


In [79]:
# Prepare training_datasets according to the coco_format
trainingDF = mainDf[mainDf["image_id"].isin(trainingIDs)].reset_index().drop(columns="index")

def list_captions(col):
    final = []
    for val in col["caption"].values:
        final.append(val)
    
    return sorted(final)


validationDF = mainDf[mainDf["image_id"].isin(validationIDs)].reset_index().drop(columns=["index"]).groupby(["image", "image_id"]).apply(list_captions).reset_index(name="caption")
testingDF = mainDf[mainDf["image_id"].isin(testIDs)].reset_index().drop(columns=["index"]).groupby(["image", "image_id"]).apply(list_captions).reset_index(name="caption")

In [80]:
# My own version of coco_karpathy_val_gt.json

final_result = {}
val2 = mainDf[mainDf["image_id"].isin(validationIDs)].reset_index().rename(columns={"index": "id"}).drop(columns=["image"])
val_gt = val2.astype({'image_id': "int64"})
val_gt['id'] = val_gt['id'] + 47777
final_result["annotations"] = val_gt[['image_id', 'caption', 'id']].sort_values(by=['image_id', 'caption']).to_dict('records')

final_result["images"] = val_gt[['id']].to_dict('records')
final_result

with open("validation_gt.json", "w") as outfile:
    json.dump(final_result, outfile)

In [81]:
# My own version of coco_karpathy_test_gt.json
final_result2 = {}
val_test = mainDf[mainDf["image_id"].isin(testIDs)].reset_index().rename(columns={"index": "id"}).drop(columns=["image"])
val_test_gt = val_test.astype({'image_id': "int64"})
val_test_gt['id'] = val_test_gt['id'] + 47777

final_result2["annotations"] = val_test_gt[['image_id', 'caption', 'id']].sort_values(by=['image_id', 'caption']).to_dict('records')
final_result2["images"] = val_test_gt[['id']].to_dict('records')
final_result2

with open("test_gt.json", "w") as outfile:
    json.dump(final_result2, outfile)

In [82]:
# Save final files
trainingDF.to_json("train.json",orient='records')
validationDF.to_json("validation.json",orient='records')
testingDF.to_json("testing.json",orient='records')

In [3]:
import torch
import ruamel.yaml as yaml
import json
from torch.utils.data import Dataset
from PIL import Image
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode


In [None]:
json.load(open("train.json", 'r'))

In [15]:
int('00000345')

345

In [None]:
# python3 train_caption.py