# Natural Language Processing and the Web WS 2021/22 - Practice Class: 

# Project on Multimodal Image Processing

## Names: Luca Ciegelski, Luka Zdravkovic and Lennart Bengtson

## Preparing the coco Dataset https://cocodataset.org/#home

### Step 1: Download the dataset

To use the coco dataset funktionality of pytorch [doku here](https://pytorch.org/vision/stable/datasets.html#coco) teh coco api needs to be installed. For this run the makefile from the coco api github like this:

In [1]:
%%bash
ls coco > /dev/null || git clone https://github.com/pdollar/coco/
cd coco/PythonAPI
make
python setup.py install
cd ../..
rm -rf coco

python setup.py build_ext --inplace
running build_ext
cythoning pycocotools/_mask.pyx to pycocotools/_mask.c
building 'pycocotools._mask' extension
creating build
creating build/common
creating build/temp.linux-x86_64-3.9
creating build/temp.linux-x86_64-3.9/pycocotools
gcc -pthread -B /home/luka/anaconda3/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /home/luka/anaconda3/include -I/home/luka/anaconda3/include -fPIC -O2 -isystem /home/luka/anaconda3/include -fPIC -I/home/luka/anaconda3/lib/python3.9/site-packages/numpy/core/include -I../common -I/home/luka/anaconda3/include/python3.9 -c ../common/maskApi.c -o build/temp.linux-x86_64-3.9/../common/maskApi.o -Wno-cpp -Wno-unused-function -std=c99
gcc -pthread -B /home/luka/anaconda3/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /home/luka/anaconda3/include -I/home/luka/anaconda3/include -fPIC -O2 -isystem /home/luka/anaconda3/include -fPIC -I/home/luka/an

ls: cannot access 'coco': No such file or directory
Cloning into 'coco'...
  tree = Parsing.p_module(s, pxd, full_module_name)
../common/maskApi.c: In function ‘rleDecode’:
   46 |       for( k=0; k<R[i].cnts[j]; k++ ) *(M++)=v; v=!v; }}
      |       ^~~
../common/maskApi.c:46:49: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the ‘for’
   46 |       for( k=0; k<R[i].cnts[j]; k++ ) *(M++)=v; v=!v; }}
      |                                                 ^
../common/maskApi.c: In function ‘rleFrPoly’:
  166 |   for(j=0; j<k; j++) x[j]=(int)(scale*xy[j*2+0]+.5); x[k]=x[0];
      |   ^~~
../common/maskApi.c:166:54: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the ‘for’
  166 |   for(j=0; j<k; j++) x[j]=(int)(scale*xy[j*2+0]+.5); x[k]=x[0];
      |                                                      ^
  167 |   for(j=0; j<k; j++) y[j]=(int)(scale*xy[j*2+1]+.5); y[k]=y[0];
      |   ^~~
../common/mas

In [10]:
%%bash
mkdir -p coco_dataset/features

echo 'preparing dirs and files'
cd coco_dataset
touch photo_data.csv

echo 'downloading train2017.zip...'
aria2c -x 10 -j 10 http://images.cocodataset.org/zips/train2017.zip
echo 'downloading val2017.zip...'
aria2c -x 10 -j 10 http://images.cocodataset.org/zips/val2017.zip
echo 'unziping files'
unzip \*.zip

echo 'moving all photos into one dir'
mv train2017/ photos/
mv val2017/* photos/
rmdir val2017

echo 'downloading annotations_trainval2017.zip...'
aria2c -x 10 -j 10 http://images.cocodataset.org/annotations/annotations_trainval2017.zip
echo 'unziping files'
unzip \*.zip
rm val2017.zip
rm train2017.zip
rm annotations_trainval2017.zip

rm: cannot remove 'val2017.zip': No such file or directory
rm: cannot remove 'train2017.zip': No such file or directory


### Step 2: Process the dataset using the model

In [11]:
# imports
import os
import math
import numpy as np
import pandas as pd
import csv
import json

import torch
from PIL import Image
from IPython.display import Image as im
from transformers import CLIPProcessor, CLIPModel

In [12]:
# set the path to the dataset
cwd = os.getcwd()
dataset_path = os.path.join(cwd, "coco_dataset")

# Set the path to the photos
photos_path = os.path.join(dataset_path, "photos")

# List all JPGs in the folder
# photos_files = list(photos_path.glob("*.jpg"))
photos_files = [os.path.join(photos_path, f) for f in os.listdir(photos_path) if f.endswith('.jpg')]

# Print some statistics
print(f"Photos found: {len(photos_files)}")

Photos found: 123287


In [13]:
# Load the open CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Function that computes the feature vectors for a batch of images
def compute_clip_features(photos_batch):
    # Load all the photos from the files
    photos = [Image.open(photo_file) for photo_file in photos_batch]
    
    # Preprocess all photos
    photos_preprocessed = processor(images=photos, return_tensors='pt').to(device)
    # print(photos_preprocessed)

    with torch.no_grad():
        # Encode the photos batch to compute the feature vectors and normalize them
        photos_features = model.get_image_features(**photos_preprocessed)
        photos_features /= photos_features.norm(dim=-1, keepdim=True)

    # Transfer the feature vectors back to the CPU and convert to numpy
    return photos_features.cpu().numpy()

ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


In [14]:
# Define the batch size so that it fits on your GPU. You can also do the processing on the CPU, but it will be slower.
batch_size = 16

# Path where the feature vectors will be stored
features_path = os.path.join(dataset_path, "features")

# Compute how many batches are needed
batches = math.ceil(len(photos_files) / batch_size)

# Process each batch
for i in range(batches):
    print(f"Processing batch {i+1}/{batches}")

    batch_ids_path = os.path.join(features_path, f"{i:010d}.csv")
    batch_features_path = os.path.join(features_path, f"{i:010d}.npy")
    
    # Only do the processing if the batch wasn't processed yet
    if not os.path.isfile(batch_features_path):
        try:
            # Select the photos for the current batch
            batch_files = photos_files[i*batch_size : (i+1)*batch_size]

            # Compute the features and save to a numpy file
            batch_features = compute_clip_features(batch_files)
            np.save(batch_features_path, batch_features)

            # Save the photo IDs to a CSV file
            photo_ids = [os.path.splitext(os.path.basename(photo_file))[0] for photo_file in batch_files]
            photo_ids_data = pd.DataFrame(photo_ids, columns=['photo_id'])
            photo_ids_data.to_csv(batch_ids_path, index=False)
        except:
            # Catch problems with the processing to make the process more robust
            print(f'Problem with batch {i}')

Processing batch 1/7706
Processing batch 2/7706
Problem with batch 1
Processing batch 3/7706
Processing batch 4/7706
Processing batch 5/7706
Processing batch 6/7706
Processing batch 7/7706
Processing batch 8/7706
Processing batch 9/7706
Processing batch 10/7706
Processing batch 11/7706
Problem with batch 10
Processing batch 12/7706
Processing batch 13/7706
Processing batch 14/7706
Processing batch 15/7706
Processing batch 16/7706
Processing batch 17/7706
Processing batch 18/7706
Problem with batch 17
Processing batch 19/7706
Processing batch 20/7706
Processing batch 21/7706
Processing batch 22/7706
Processing batch 23/7706
Processing batch 24/7706
Processing batch 25/7706
Processing batch 26/7706
Processing batch 27/7706
Processing batch 28/7706
Processing batch 29/7706
Processing batch 30/7706
Processing batch 31/7706
Processing batch 32/7706
Processing batch 33/7706
Processing batch 34/7706
Processing batch 35/7706
Processing batch 36/7706
Processing batch 37/7706
Processing batch 38

Processing batch 308/7706
Processing batch 309/7706
Processing batch 310/7706
Processing batch 311/7706
Processing batch 312/7706
Processing batch 313/7706
Processing batch 314/7706
Processing batch 315/7706
Processing batch 316/7706
Processing batch 317/7706
Processing batch 318/7706
Processing batch 319/7706
Problem with batch 318
Processing batch 320/7706
Processing batch 321/7706
Processing batch 322/7706
Processing batch 323/7706
Processing batch 324/7706
Processing batch 325/7706
Processing batch 326/7706
Processing batch 327/7706
Processing batch 328/7706
Processing batch 329/7706
Processing batch 330/7706
Processing batch 331/7706
Processing batch 332/7706
Processing batch 333/7706
Processing batch 334/7706
Processing batch 335/7706
Processing batch 336/7706
Processing batch 337/7706
Processing batch 338/7706
Problem with batch 337
Processing batch 339/7706
Processing batch 340/7706
Processing batch 341/7706
Processing batch 342/7706
Processing batch 343/7706
Processing batch 3

Processing batch 611/7706
Processing batch 612/7706
Processing batch 613/7706
Processing batch 614/7706
Processing batch 615/7706
Processing batch 616/7706
Processing batch 617/7706
Processing batch 618/7706
Processing batch 619/7706
Processing batch 620/7706
Processing batch 621/7706
Problem with batch 620
Processing batch 622/7706
Processing batch 623/7706
Processing batch 624/7706
Processing batch 625/7706
Processing batch 626/7706
Processing batch 627/7706
Processing batch 628/7706
Processing batch 629/7706
Processing batch 630/7706
Processing batch 631/7706
Processing batch 632/7706
Processing batch 633/7706
Processing batch 634/7706
Processing batch 635/7706
Processing batch 636/7706
Processing batch 637/7706
Processing batch 638/7706
Processing batch 639/7706
Processing batch 640/7706
Processing batch 641/7706
Processing batch 642/7706
Processing batch 643/7706
Processing batch 644/7706
Processing batch 645/7706
Processing batch 646/7706
Processing batch 647/7706
Processing batc

Processing batch 918/7706
Processing batch 919/7706
Processing batch 920/7706
Processing batch 921/7706
Processing batch 922/7706
Processing batch 923/7706
Processing batch 924/7706
Processing batch 925/7706
Processing batch 926/7706
Processing batch 927/7706
Processing batch 928/7706
Processing batch 929/7706
Processing batch 930/7706
Processing batch 931/7706
Processing batch 932/7706
Processing batch 933/7706
Processing batch 934/7706
Processing batch 935/7706
Processing batch 936/7706
Processing batch 937/7706
Processing batch 938/7706
Processing batch 939/7706
Processing batch 940/7706
Processing batch 941/7706
Processing batch 942/7706
Processing batch 943/7706
Processing batch 944/7706
Processing batch 945/7706
Processing batch 946/7706
Processing batch 947/7706
Processing batch 948/7706
Processing batch 949/7706
Processing batch 950/7706
Processing batch 951/7706
Processing batch 952/7706
Processing batch 953/7706
Processing batch 954/7706
Processing batch 955/7706
Processing b

Processing batch 1218/7706
Processing batch 1219/7706
Processing batch 1220/7706
Processing batch 1221/7706
Processing batch 1222/7706
Processing batch 1223/7706
Processing batch 1224/7706
Processing batch 1225/7706
Processing batch 1226/7706
Processing batch 1227/7706
Processing batch 1228/7706
Processing batch 1229/7706
Processing batch 1230/7706
Processing batch 1231/7706
Processing batch 1232/7706
Processing batch 1233/7706
Processing batch 1234/7706
Problem with batch 1233
Processing batch 1235/7706
Processing batch 1236/7706
Processing batch 1237/7706
Processing batch 1238/7706
Processing batch 1239/7706
Processing batch 1240/7706
Processing batch 1241/7706
Processing batch 1242/7706
Processing batch 1243/7706
Processing batch 1244/7706
Processing batch 1245/7706
Processing batch 1246/7706
Processing batch 1247/7706
Processing batch 1248/7706
Processing batch 1249/7706
Processing batch 1250/7706
Processing batch 1251/7706
Processing batch 1252/7706
Processing batch 1253/7706
Proc

Processing batch 1513/7706
Processing batch 1514/7706
Processing batch 1515/7706
Processing batch 1516/7706
Processing batch 1517/7706
Processing batch 1518/7706
Processing batch 1519/7706
Processing batch 1520/7706
Processing batch 1521/7706
Processing batch 1522/7706
Processing batch 1523/7706
Processing batch 1524/7706
Processing batch 1525/7706
Processing batch 1526/7706
Processing batch 1527/7706
Processing batch 1528/7706
Processing batch 1529/7706
Processing batch 1530/7706
Processing batch 1531/7706
Processing batch 1532/7706
Processing batch 1533/7706
Processing batch 1534/7706
Processing batch 1535/7706
Processing batch 1536/7706
Processing batch 1537/7706
Processing batch 1538/7706
Processing batch 1539/7706
Processing batch 1540/7706
Processing batch 1541/7706
Processing batch 1542/7706
Processing batch 1543/7706
Processing batch 1544/7706
Processing batch 1545/7706
Processing batch 1546/7706
Processing batch 1547/7706
Processing batch 1548/7706
Processing batch 1549/7706
P

Processing batch 1806/7706
Processing batch 1807/7706
Processing batch 1808/7706
Processing batch 1809/7706
Processing batch 1810/7706
Processing batch 1811/7706
Processing batch 1812/7706
Processing batch 1813/7706
Processing batch 1814/7706
Processing batch 1815/7706
Processing batch 1816/7706
Processing batch 1817/7706
Processing batch 1818/7706
Processing batch 1819/7706
Processing batch 1820/7706
Processing batch 1821/7706
Processing batch 1822/7706
Processing batch 1823/7706
Processing batch 1824/7706
Processing batch 1825/7706
Processing batch 1826/7706
Processing batch 1827/7706
Processing batch 1828/7706
Processing batch 1829/7706
Processing batch 1830/7706
Processing batch 1831/7706
Processing batch 1832/7706
Processing batch 1833/7706
Processing batch 1834/7706
Processing batch 1835/7706
Processing batch 1836/7706
Processing batch 1837/7706
Processing batch 1838/7706
Processing batch 1839/7706
Processing batch 1840/7706
Processing batch 1841/7706
Processing batch 1842/7706
P

Processing batch 2101/7706
Processing batch 2102/7706
Processing batch 2103/7706
Processing batch 2104/7706
Processing batch 2105/7706
Processing batch 2106/7706
Processing batch 2107/7706
Processing batch 2108/7706
Processing batch 2109/7706
Processing batch 2110/7706
Processing batch 2111/7706
Processing batch 2112/7706
Processing batch 2113/7706
Processing batch 2114/7706
Processing batch 2115/7706
Processing batch 2116/7706
Processing batch 2117/7706
Processing batch 2118/7706
Processing batch 2119/7706
Processing batch 2120/7706
Processing batch 2121/7706
Processing batch 2122/7706
Processing batch 2123/7706
Processing batch 2124/7706
Processing batch 2125/7706
Processing batch 2126/7706
Processing batch 2127/7706
Processing batch 2128/7706
Processing batch 2129/7706
Processing batch 2130/7706
Processing batch 2131/7706
Processing batch 2132/7706
Processing batch 2133/7706
Processing batch 2134/7706
Processing batch 2135/7706
Processing batch 2136/7706
Processing batch 2137/7706
P

Processing batch 2400/7706
Processing batch 2401/7706
Processing batch 2402/7706
Processing batch 2403/7706
Processing batch 2404/7706
Processing batch 2405/7706
Processing batch 2406/7706
Processing batch 2407/7706
Processing batch 2408/7706
Processing batch 2409/7706
Processing batch 2410/7706
Processing batch 2411/7706
Processing batch 2412/7706
Processing batch 2413/7706
Processing batch 2414/7706
Processing batch 2415/7706
Processing batch 2416/7706
Processing batch 2417/7706
Processing batch 2418/7706
Processing batch 2419/7706
Processing batch 2420/7706
Processing batch 2421/7706
Processing batch 2422/7706
Processing batch 2423/7706
Processing batch 2424/7706
Processing batch 2425/7706
Processing batch 2426/7706
Processing batch 2427/7706
Processing batch 2428/7706
Processing batch 2429/7706
Processing batch 2430/7706
Processing batch 2431/7706
Processing batch 2432/7706
Processing batch 2433/7706
Processing batch 2434/7706
Processing batch 2435/7706
Processing batch 2436/7706
P

Processing batch 2702/7706
Processing batch 2703/7706
Processing batch 2704/7706
Processing batch 2705/7706
Processing batch 2706/7706
Processing batch 2707/7706
Processing batch 2708/7706
Problem with batch 2707
Processing batch 2709/7706
Processing batch 2710/7706
Processing batch 2711/7706
Processing batch 2712/7706
Processing batch 2713/7706
Processing batch 2714/7706
Processing batch 2715/7706
Problem with batch 2714
Processing batch 2716/7706
Processing batch 2717/7706
Processing batch 2718/7706
Problem with batch 2717
Processing batch 2719/7706
Processing batch 2720/7706
Processing batch 2721/7706
Processing batch 2722/7706
Problem with batch 2721
Processing batch 2723/7706
Processing batch 2724/7706
Processing batch 2725/7706
Processing batch 2726/7706
Processing batch 2727/7706
Processing batch 2728/7706
Processing batch 2729/7706
Processing batch 2730/7706
Processing batch 2731/7706
Processing batch 2732/7706
Processing batch 2733/7706
Processing batch 2734/7706
Processing ba

Processing batch 2990/7706
Processing batch 2991/7706
Processing batch 2992/7706
Processing batch 2993/7706
Processing batch 2994/7706
Processing batch 2995/7706
Processing batch 2996/7706
Processing batch 2997/7706
Processing batch 2998/7706
Problem with batch 2997
Processing batch 2999/7706
Processing batch 3000/7706
Processing batch 3001/7706
Processing batch 3002/7706
Processing batch 3003/7706
Processing batch 3004/7706
Processing batch 3005/7706
Processing batch 3006/7706
Processing batch 3007/7706
Processing batch 3008/7706
Processing batch 3009/7706
Processing batch 3010/7706
Processing batch 3011/7706
Processing batch 3012/7706
Processing batch 3013/7706
Processing batch 3014/7706
Processing batch 3015/7706
Processing batch 3016/7706
Processing batch 3017/7706
Processing batch 3018/7706
Processing batch 3019/7706
Processing batch 3020/7706
Processing batch 3021/7706
Processing batch 3022/7706
Processing batch 3023/7706
Problem with batch 3022
Processing batch 3024/7706
Process

Processing batch 3290/7706
Processing batch 3291/7706
Processing batch 3292/7706
Processing batch 3293/7706
Processing batch 3294/7706
Processing batch 3295/7706
Processing batch 3296/7706
Processing batch 3297/7706
Processing batch 3298/7706
Processing batch 3299/7706
Processing batch 3300/7706
Problem with batch 3299
Processing batch 3301/7706
Processing batch 3302/7706
Problem with batch 3301
Processing batch 3303/7706
Processing batch 3304/7706
Processing batch 3305/7706
Processing batch 3306/7706
Processing batch 3307/7706
Problem with batch 3306
Processing batch 3308/7706
Processing batch 3309/7706
Processing batch 3310/7706
Processing batch 3311/7706
Processing batch 3312/7706
Processing batch 3313/7706
Processing batch 3314/7706
Processing batch 3315/7706
Processing batch 3316/7706
Processing batch 3317/7706
Processing batch 3318/7706
Processing batch 3319/7706
Processing batch 3320/7706
Processing batch 3321/7706
Processing batch 3322/7706
Processing batch 3323/7706
Processing

Processing batch 3583/7706
Processing batch 3584/7706
Processing batch 3585/7706
Processing batch 3586/7706
Processing batch 3587/7706
Processing batch 3588/7706
Processing batch 3589/7706
Processing batch 3590/7706
Processing batch 3591/7706
Processing batch 3592/7706
Processing batch 3593/7706
Processing batch 3594/7706
Processing batch 3595/7706
Processing batch 3596/7706
Processing batch 3597/7706
Processing batch 3598/7706
Processing batch 3599/7706
Processing batch 3600/7706
Processing batch 3601/7706
Processing batch 3602/7706
Processing batch 3603/7706
Processing batch 3604/7706
Processing batch 3605/7706
Processing batch 3606/7706
Processing batch 3607/7706
Processing batch 3608/7706
Processing batch 3609/7706
Processing batch 3610/7706
Processing batch 3611/7706
Processing batch 3612/7706
Processing batch 3613/7706
Processing batch 3614/7706
Processing batch 3615/7706
Processing batch 3616/7706
Processing batch 3617/7706
Processing batch 3618/7706
Processing batch 3619/7706
P

Processing batch 3884/7706
Processing batch 3885/7706
Processing batch 3886/7706
Problem with batch 3885
Processing batch 3887/7706
Processing batch 3888/7706
Processing batch 3889/7706
Processing batch 3890/7706
Processing batch 3891/7706
Processing batch 3892/7706
Processing batch 3893/7706
Processing batch 3894/7706
Processing batch 3895/7706
Processing batch 3896/7706
Processing batch 3897/7706
Processing batch 3898/7706
Processing batch 3899/7706
Processing batch 3900/7706
Processing batch 3901/7706
Processing batch 3902/7706
Processing batch 3903/7706
Processing batch 3904/7706
Processing batch 3905/7706
Processing batch 3906/7706
Processing batch 3907/7706
Processing batch 3908/7706
Processing batch 3909/7706
Processing batch 3910/7706
Processing batch 3911/7706
Processing batch 3912/7706
Processing batch 3913/7706
Processing batch 3914/7706
Processing batch 3915/7706
Processing batch 3916/7706
Processing batch 3917/7706
Processing batch 3918/7706
Processing batch 3919/7706
Proc

Processing batch 4183/7706
Processing batch 4184/7706
Processing batch 4185/7706
Processing batch 4186/7706
Processing batch 4187/7706
Processing batch 4188/7706
Processing batch 4189/7706
Processing batch 4190/7706
Processing batch 4191/7706
Processing batch 4192/7706
Processing batch 4193/7706
Processing batch 4194/7706
Problem with batch 4193
Processing batch 4195/7706
Processing batch 4196/7706
Processing batch 4197/7706
Processing batch 4198/7706
Processing batch 4199/7706
Processing batch 4200/7706
Processing batch 4201/7706
Processing batch 4202/7706
Processing batch 4203/7706
Processing batch 4204/7706
Processing batch 4205/7706
Processing batch 4206/7706
Processing batch 4207/7706
Processing batch 4208/7706
Processing batch 4209/7706
Processing batch 4210/7706
Processing batch 4211/7706
Processing batch 4212/7706
Processing batch 4213/7706
Processing batch 4214/7706
Processing batch 4215/7706
Processing batch 4216/7706
Processing batch 4217/7706
Processing batch 4218/7706
Proc

Processing batch 4482/7706
Processing batch 4483/7706
Processing batch 4484/7706
Processing batch 4485/7706
Processing batch 4486/7706
Processing batch 4487/7706
Processing batch 4488/7706
Processing batch 4489/7706
Processing batch 4490/7706
Processing batch 4491/7706
Processing batch 4492/7706
Processing batch 4493/7706
Processing batch 4494/7706
Processing batch 4495/7706
Processing batch 4496/7706
Processing batch 4497/7706
Processing batch 4498/7706
Processing batch 4499/7706
Processing batch 4500/7706
Processing batch 4501/7706
Processing batch 4502/7706
Processing batch 4503/7706
Processing batch 4504/7706
Processing batch 4505/7706
Processing batch 4506/7706
Processing batch 4507/7706
Processing batch 4508/7706
Processing batch 4509/7706
Processing batch 4510/7706
Processing batch 4511/7706
Processing batch 4512/7706
Processing batch 4513/7706
Processing batch 4514/7706
Processing batch 4515/7706
Processing batch 4516/7706
Processing batch 4517/7706
Processing batch 4518/7706
P

Processing batch 4778/7706
Processing batch 4779/7706
Processing batch 4780/7706
Processing batch 4781/7706
Processing batch 4782/7706
Processing batch 4783/7706
Processing batch 4784/7706
Processing batch 4785/7706
Processing batch 4786/7706
Processing batch 4787/7706
Processing batch 4788/7706
Processing batch 4789/7706
Problem with batch 4788
Processing batch 4790/7706
Processing batch 4791/7706
Processing batch 4792/7706
Processing batch 4793/7706
Processing batch 4794/7706
Problem with batch 4793
Processing batch 4795/7706
Processing batch 4796/7706
Processing batch 4797/7706
Processing batch 4798/7706
Processing batch 4799/7706
Processing batch 4800/7706
Processing batch 4801/7706
Processing batch 4802/7706
Processing batch 4803/7706
Processing batch 4804/7706
Processing batch 4805/7706
Processing batch 4806/7706
Processing batch 4807/7706
Processing batch 4808/7706
Processing batch 4809/7706
Processing batch 4810/7706
Processing batch 4811/7706
Processing batch 4812/7706
Process

Processing batch 5075/7706
Processing batch 5076/7706
Processing batch 5077/7706
Processing batch 5078/7706
Processing batch 5079/7706
Processing batch 5080/7706
Processing batch 5081/7706
Processing batch 5082/7706
Processing batch 5083/7706
Processing batch 5084/7706
Processing batch 5085/7706
Processing batch 5086/7706
Processing batch 5087/7706
Processing batch 5088/7706
Processing batch 5089/7706
Processing batch 5090/7706
Processing batch 5091/7706
Processing batch 5092/7706
Processing batch 5093/7706
Processing batch 5094/7706
Processing batch 5095/7706
Processing batch 5096/7706
Processing batch 5097/7706
Processing batch 5098/7706
Processing batch 5099/7706
Processing batch 5100/7706
Problem with batch 5099
Processing batch 5101/7706
Processing batch 5102/7706
Problem with batch 5101
Processing batch 5103/7706
Problem with batch 5102
Processing batch 5104/7706
Processing batch 5105/7706
Processing batch 5106/7706
Processing batch 5107/7706
Processing batch 5108/7706
Processing

Processing batch 5367/7706
Processing batch 5368/7706
Processing batch 5369/7706
Processing batch 5370/7706
Processing batch 5371/7706
Processing batch 5372/7706
Processing batch 5373/7706
Processing batch 5374/7706
Processing batch 5375/7706
Processing batch 5376/7706
Processing batch 5377/7706
Processing batch 5378/7706
Processing batch 5379/7706
Processing batch 5380/7706
Processing batch 5381/7706
Processing batch 5382/7706
Processing batch 5383/7706
Processing batch 5384/7706
Processing batch 5385/7706
Processing batch 5386/7706
Problem with batch 5385
Processing batch 5387/7706
Processing batch 5388/7706
Processing batch 5389/7706
Processing batch 5390/7706
Processing batch 5391/7706
Processing batch 5392/7706
Problem with batch 5391
Processing batch 5393/7706
Processing batch 5394/7706
Processing batch 5395/7706
Processing batch 5396/7706
Problem with batch 5395
Processing batch 5397/7706
Processing batch 5398/7706
Processing batch 5399/7706
Processing batch 5400/7706
Processing

Processing batch 5663/7706
Processing batch 5664/7706
Processing batch 5665/7706
Processing batch 5666/7706
Problem with batch 5665
Processing batch 5667/7706
Processing batch 5668/7706
Processing batch 5669/7706
Processing batch 5670/7706
Processing batch 5671/7706
Processing batch 5672/7706
Processing batch 5673/7706
Processing batch 5674/7706
Processing batch 5675/7706
Processing batch 5676/7706
Processing batch 5677/7706
Processing batch 5678/7706
Processing batch 5679/7706
Processing batch 5680/7706
Processing batch 5681/7706
Processing batch 5682/7706
Processing batch 5683/7706
Processing batch 5684/7706
Processing batch 5685/7706
Processing batch 5686/7706
Processing batch 5687/7706
Processing batch 5688/7706
Processing batch 5689/7706
Processing batch 5690/7706
Processing batch 5691/7706
Processing batch 5692/7706
Processing batch 5693/7706
Processing batch 5694/7706
Problem with batch 5693
Processing batch 5695/7706
Processing batch 5696/7706
Processing batch 5697/7706
Process

Processing batch 5957/7706
Processing batch 5958/7706
Processing batch 5959/7706
Problem with batch 5958
Processing batch 5960/7706
Processing batch 5961/7706
Processing batch 5962/7706
Processing batch 5963/7706
Processing batch 5964/7706
Processing batch 5965/7706
Processing batch 5966/7706
Processing batch 5967/7706
Processing batch 5968/7706
Processing batch 5969/7706
Problem with batch 5968
Processing batch 5970/7706
Processing batch 5971/7706
Processing batch 5972/7706
Processing batch 5973/7706
Processing batch 5974/7706
Problem with batch 5973
Processing batch 5975/7706
Problem with batch 5974
Processing batch 5976/7706
Processing batch 5977/7706
Processing batch 5978/7706
Processing batch 5979/7706
Processing batch 5980/7706
Processing batch 5981/7706
Processing batch 5982/7706
Processing batch 5983/7706
Processing batch 5984/7706
Processing batch 5985/7706
Processing batch 5986/7706
Processing batch 5987/7706
Processing batch 5988/7706
Processing batch 5989/7706
Processing ba

Processing batch 6252/7706
Processing batch 6253/7706
Processing batch 6254/7706
Processing batch 6255/7706
Processing batch 6256/7706
Processing batch 6257/7706
Processing batch 6258/7706
Processing batch 6259/7706
Processing batch 6260/7706
Processing batch 6261/7706
Processing batch 6262/7706
Processing batch 6263/7706
Processing batch 6264/7706
Processing batch 6265/7706
Processing batch 6266/7706
Processing batch 6267/7706
Processing batch 6268/7706
Processing batch 6269/7706
Processing batch 6270/7706
Processing batch 6271/7706
Processing batch 6272/7706
Processing batch 6273/7706
Processing batch 6274/7706
Processing batch 6275/7706
Processing batch 6276/7706
Processing batch 6277/7706
Processing batch 6278/7706
Processing batch 6279/7706
Processing batch 6280/7706
Processing batch 6281/7706
Processing batch 6282/7706
Processing batch 6283/7706
Processing batch 6284/7706
Processing batch 6285/7706
Processing batch 6286/7706
Processing batch 6287/7706
Processing batch 6288/7706
P

Problem with batch 6549
Processing batch 6551/7706
Processing batch 6552/7706
Processing batch 6553/7706
Processing batch 6554/7706
Processing batch 6555/7706
Processing batch 6556/7706
Processing batch 6557/7706
Processing batch 6558/7706
Processing batch 6559/7706
Processing batch 6560/7706
Processing batch 6561/7706
Processing batch 6562/7706
Processing batch 6563/7706
Processing batch 6564/7706
Processing batch 6565/7706
Processing batch 6566/7706
Processing batch 6567/7706
Processing batch 6568/7706
Processing batch 6569/7706
Processing batch 6570/7706
Processing batch 6571/7706
Processing batch 6572/7706
Processing batch 6573/7706
Processing batch 6574/7706
Processing batch 6575/7706
Processing batch 6576/7706
Processing batch 6577/7706
Processing batch 6578/7706
Processing batch 6579/7706
Processing batch 6580/7706
Problem with batch 6579
Processing batch 6581/7706
Processing batch 6582/7706
Processing batch 6583/7706
Processing batch 6584/7706
Processing batch 6585/7706
Process

Processing batch 6848/7706
Processing batch 6849/7706
Processing batch 6850/7706
Processing batch 6851/7706
Processing batch 6852/7706
Processing batch 6853/7706
Processing batch 6854/7706
Processing batch 6855/7706
Processing batch 6856/7706
Processing batch 6857/7706
Problem with batch 6856
Processing batch 6858/7706
Problem with batch 6857
Processing batch 6859/7706
Processing batch 6860/7706
Processing batch 6861/7706
Processing batch 6862/7706
Processing batch 6863/7706
Processing batch 6864/7706
Processing batch 6865/7706
Processing batch 6866/7706
Processing batch 6867/7706
Processing batch 6868/7706
Processing batch 6869/7706
Processing batch 6870/7706
Processing batch 6871/7706
Problem with batch 6870
Processing batch 6872/7706
Processing batch 6873/7706
Processing batch 6874/7706
Processing batch 6875/7706
Processing batch 6876/7706
Processing batch 6877/7706
Processing batch 6878/7706
Processing batch 6879/7706
Processing batch 6880/7706
Processing batch 6881/7706
Processing

Processing batch 7145/7706
Processing batch 7146/7706
Processing batch 7147/7706
Processing batch 7148/7706
Processing batch 7149/7706
Processing batch 7150/7706
Processing batch 7151/7706
Problem with batch 7150
Processing batch 7152/7706
Processing batch 7153/7706
Processing batch 7154/7706
Processing batch 7155/7706
Processing batch 7156/7706
Processing batch 7157/7706
Processing batch 7158/7706
Processing batch 7159/7706
Processing batch 7160/7706
Processing batch 7161/7706
Processing batch 7162/7706
Processing batch 7163/7706
Processing batch 7164/7706
Processing batch 7165/7706
Processing batch 7166/7706
Processing batch 7167/7706
Processing batch 7168/7706
Processing batch 7169/7706
Processing batch 7170/7706
Processing batch 7171/7706
Processing batch 7172/7706
Processing batch 7173/7706
Processing batch 7174/7706
Processing batch 7175/7706
Processing batch 7176/7706
Processing batch 7177/7706
Processing batch 7178/7706
Processing batch 7179/7706
Processing batch 7180/7706
Proc

Processing batch 7444/7706
Processing batch 7445/7706
Processing batch 7446/7706
Processing batch 7447/7706
Processing batch 7448/7706
Processing batch 7449/7706
Processing batch 7450/7706
Processing batch 7451/7706
Processing batch 7452/7706
Processing batch 7453/7706
Processing batch 7454/7706
Processing batch 7455/7706
Processing batch 7456/7706
Processing batch 7457/7706
Problem with batch 7456
Processing batch 7458/7706
Problem with batch 7457
Processing batch 7459/7706
Processing batch 7460/7706
Processing batch 7461/7706
Processing batch 7462/7706
Processing batch 7463/7706
Processing batch 7464/7706
Processing batch 7465/7706
Processing batch 7466/7706
Processing batch 7467/7706
Processing batch 7468/7706
Processing batch 7469/7706
Processing batch 7470/7706
Problem with batch 7469
Processing batch 7471/7706
Processing batch 7472/7706
Processing batch 7473/7706
Processing batch 7474/7706
Processing batch 7475/7706
Processing batch 7476/7706
Processing batch 7477/7706
Processing

In [15]:
# Load all numpy files
numpy_files = [os.path.join(features_path, f) for f in os.listdir(features_path) if f.endswith('.npy')]
features_list = [np.load(features_file) for features_file in sorted(numpy_files)]

# Concatenate the features and store in a merged file
features = np.concatenate(features_list)
np.save(os.path.join(dataset_path, "features.npy"), features)

# Load all the photo IDs
csv_files = [os.path.join(features_path, f) for f in os.listdir(features_path) if f.endswith('.csv')]
photo_ids = pd.concat([pd.read_csv(ids_file) for ids_file in sorted(csv_files)])
photo_ids.to_csv(os.path.join(dataset_path,"photo_ids.csv"), index=False)

In [16]:
photo_data_path = os.path.join(dataset_path, 'photo_data.csv')

#open the previously generated csv file
with open(photo_data_path, 'w', encoding='UTF8', newline='') as photo_csv:
    photo_data = csv.writer(photo_csv)
    header = ['id', 'url', 'description']
    photo_data.writerow(header)

    # merge val and tain dataset to get more images to search through
    for coco_anotations in [os.path.join(dataset_path, 'annotations/captions_train2017.json'),
                            os.path.join(dataset_path, 'annotations/captions_val2017.json')]:
        with open(coco_anotations) as annotations_json:
            annotations = json.load(annotations_json)
            for image in annotations['images']:
                # gets annotation for current picture
                annotation = next(anno for anno in annotations['annotations'] if anno['image_id'] == image['id'])['caption']
                # saves image data in our own photo data
                data = [image['id'], image['coco_url'], annotation]
                photo_data.writerow(data)

    # delete all files exept for images json and 

### Step 3: Cleanup
Remove all data and directories that were used to construct the feature list and photo_data.

In [17]:
%%bash
cd coco_dataset
rm -r features
rm -r annotations
rm -r photos