# Natural Language Processing and the Web WS 2021/22 - Practice Class: 

# Project on Multimodal Image Processing

## Names: Luca Ciegelski, Luka Zdravkovic and Lennart Bengtson

## Preparing the coco Dataset https://cocodataset.org/#home

### Step 1: Download the dataset

To use the coco dataset funktionality of pytorch [doku here](https://pytorch.org/vision/stable/datasets.html#coco) teh coco api needs to be installed. For this run the makefile from the coco api github like this:

In [2]:
%%bash
ls coco > /dev/null || git clone https://github.com/pdollar/coco/
cd coco/PythonAPI
make
python setup.py install
cd ../..
rm -rf coco

python setup.py build_ext --inplace
running build_ext
cythoning pycocotools/_mask.pyx to pycocotools/_mask.c
building 'pycocotools._mask' extension
creating build
creating build/common
creating build/temp.linux-x86_64-3.9
creating build/temp.linux-x86_64-3.9/pycocotools
gcc -pthread -B /home/luka/anaconda3/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /home/luka/anaconda3/include -I/home/luka/anaconda3/include -fPIC -O2 -isystem /home/luka/anaconda3/include -fPIC -I/home/luka/anaconda3/lib/python3.9/site-packages/numpy/core/include -I../common -I/home/luka/anaconda3/include/python3.9 -c ../common/maskApi.c -o build/temp.linux-x86_64-3.9/../common/maskApi.o -Wno-cpp -Wno-unused-function -std=c99
gcc -pthread -B /home/luka/anaconda3/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /home/luka/anaconda3/include -I/home/luka/anaconda3/include -fPIC -O2 -isystem /home/luka/anaconda3/include -fPIC -I/home/luka/an

ls: cannot access 'coco': No such file or directory
Cloning into 'coco'...
  tree = Parsing.p_module(s, pxd, full_module_name)
../common/maskApi.c: In function ‘rleDecode’:
   46 |       for( k=0; k<R[i].cnts[j]; k++ ) *(M++)=v; v=!v; }}
      |       ^~~
../common/maskApi.c:46:49: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the ‘for’
   46 |       for( k=0; k<R[i].cnts[j]; k++ ) *(M++)=v; v=!v; }}
      |                                                 ^
../common/maskApi.c: In function ‘rleFrPoly’:
  166 |   for(j=0; j<k; j++) x[j]=(int)(scale*xy[j*2+0]+.5); x[k]=x[0];
      |   ^~~
../common/maskApi.c:166:54: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the ‘for’
  166 |   for(j=0; j<k; j++) x[j]=(int)(scale*xy[j*2+0]+.5); x[k]=x[0];
      |                                                      ^
  167 |   for(j=0; j<k; j++) y[j]=(int)(scale*xy[j*2+1]+.5); y[k]=y[0];
      |   ^~~
../common/mas

In [None]:
%%bash
# creating coco dataset feature dir that will be needed late
mkdir -p coco_dataset/features

# creating coco_dataset dir if it does not exist and going into it
cd coco_dataset

touch photo_data.csv

echo 'downloading train2017.zip...'
aria2c -x 10 -j 10 http://images.cocodataset.org/zips/train2017.zip
echo 'downloading val2017.zip...'
aria2c -x 10 -j 10 http://images.cocodataset.org/zips/val2017.zip
echo 'unziping files'
unzip \*.zip

#TODO test if this works
# renames one folder to photos and combines content of both
mv train2017/ photos/
mv val2017/* photos/
rmdir val2017
# rm \*.zip

cd ..
echo 'downloading annotations_trainval2017.zip...'
aria2c -x 10 -j 10 http://images.cocodataset.org/annotations/annotations_trainval2017.zip
echo 'unziping files'
unzip \*.zip
# rm \*.zip

### Step 2: Process the dataset using the model

In [27]:
# imports
import os
import math
import numpy as np
import pandas as pd
import csv
import json

import torch
from PIL import Image
from IPython.display import Image as im
from transformers import CLIPProcessor, CLIPModel

In [3]:
# set the path to the dataset
cwd = os.getcwd()
dataset_path = os.path.join(cwd, "coco_dataset")

# Set the path to the photos
photos_path = os.path.join(dataset_path, "photos")

# List all JPGs in the folder
# photos_files = list(photos_path.glob("*.jpg"))
photos_files = [os.path.join(photos_path, f) for f in os.listdir(photos_path) if f.endswith('.jpg')]

# Print some statistics
print(f"Photos found: {len(photos_files)}")

Photos found: 118287


In [4]:
# Load the open CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Function that computes the feature vectors for a batch of images
def compute_clip_features(photos_batch):
    # Load all the photos from the files
    photos = [Image.open(photo_file) for photo_file in photos_batch]
    
    # Preprocess all photos
    photos_preprocessed = processor(images=photos, return_tensors='pt').to(device)
    # print(photos_preprocessed)

    with torch.no_grad():
        # Encode the photos batch to compute the feature vectors and normalize them
        photos_features = model.get_image_features(**photos_preprocessed)
        photos_features /= photos_features.norm(dim=-1, keepdim=True)

    # Transfer the feature vectors back to the CPU and convert to numpy
    return photos_features.cpu().numpy()

ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


In [5]:
# Define the batch size so that it fits on your GPU. You can also do the processing on the CPU, but it will be slower.
batch_size = 16

# Path where the feature vectors will be stored
features_path = os.path.join(dataset_path, "features")

# Compute how many batches are needed
batches = math.ceil(len(photos_files) / batch_size)

# Process each batch
for i in range(batches):
    print(f"Processing batch {i+1}/{batches}")

    batch_ids_path = os.path.join(features_path, f"{i:010d}.csv")
    batch_features_path = os.path.join(features_path, f"{i:010d}.npy")
    
    # Only do the processing if the batch wasn't processed yet
    if not os.path.isfile(batch_features_path):
        try:
            # Select the photos for the current batch
            batch_files = photos_files[i*batch_size : (i+1)*batch_size]

            # Compute the features and save to a numpy file
            batch_features = compute_clip_features(batch_files)
            np.save(batch_features_path, batch_features)

            # Save the photo IDs to a CSV file
            photo_ids = [os.path.splitext(os.path.basename(photo_file))[0] for photo_file in batch_files]
            photo_ids_data = pd.DataFrame(photo_ids, columns=['photo_id'])
            photo_ids_data.to_csv(batch_ids_path, index=False)
        except:
            # Catch problems with the processing to make the process more robust
            print(f'Problem with batch {i}')

Processing batch 1/7393
Processing batch 2/7393
Problem with batch 1
Processing batch 3/7393
Processing batch 4/7393
Processing batch 5/7393
Processing batch 6/7393
Processing batch 7/7393
Processing batch 8/7393
Processing batch 9/7393
Processing batch 10/7393
Processing batch 11/7393
Problem with batch 10
Processing batch 12/7393
Processing batch 13/7393
Processing batch 14/7393
Processing batch 15/7393
Processing batch 16/7393
Processing batch 17/7393
Processing batch 18/7393
Problem with batch 17
Processing batch 19/7393
Processing batch 20/7393
Processing batch 21/7393
Processing batch 22/7393
Processing batch 23/7393
Processing batch 24/7393
Processing batch 25/7393
Processing batch 26/7393
Processing batch 27/7393
Processing batch 28/7393
Processing batch 29/7393
Processing batch 30/7393
Processing batch 31/7393
Processing batch 32/7393
Processing batch 33/7393
Processing batch 34/7393
Processing batch 35/7393
Processing batch 36/7393
Processing batch 37/7393
Processing batch 38

Problem with batch 322
Processing batch 324/7393
Processing batch 325/7393
Processing batch 326/7393
Processing batch 327/7393
Processing batch 328/7393
Processing batch 329/7393
Processing batch 330/7393
Processing batch 331/7393
Processing batch 332/7393
Processing batch 333/7393
Processing batch 334/7393
Processing batch 335/7393
Processing batch 336/7393
Processing batch 337/7393
Processing batch 338/7393
Processing batch 339/7393
Processing batch 340/7393
Processing batch 341/7393
Processing batch 342/7393
Processing batch 343/7393
Processing batch 344/7393
Processing batch 345/7393
Processing batch 346/7393
Processing batch 347/7393
Processing batch 348/7393
Processing batch 349/7393
Processing batch 350/7393
Processing batch 351/7393
Problem with batch 350
Processing batch 352/7393
Processing batch 353/7393
Processing batch 354/7393
Processing batch 355/7393
Processing batch 356/7393
Processing batch 357/7393
Processing batch 358/7393
Processing batch 359/7393
Processing batch 3

Problem with batch 687
Processing batch 689/7393
Processing batch 690/7393
Processing batch 691/7393
Processing batch 692/7393
Processing batch 693/7393
Processing batch 694/7393
Processing batch 695/7393
Processing batch 696/7393
Processing batch 697/7393
Processing batch 698/7393
Processing batch 699/7393
Processing batch 700/7393
Processing batch 701/7393
Processing batch 702/7393
Processing batch 703/7393
Processing batch 704/7393
Processing batch 705/7393
Processing batch 706/7393
Processing batch 707/7393
Processing batch 708/7393
Processing batch 709/7393
Processing batch 710/7393
Processing batch 711/7393
Processing batch 712/7393
Processing batch 713/7393
Processing batch 714/7393
Processing batch 715/7393
Processing batch 716/7393
Processing batch 717/7393
Processing batch 718/7393
Processing batch 719/7393
Processing batch 720/7393
Processing batch 721/7393
Processing batch 722/7393
Processing batch 723/7393
Processing batch 724/7393
Processing batch 725/7393
Processing batc

Problem with batch 1029
Processing batch 1031/7393
Problem with batch 1030
Processing batch 1032/7393
Processing batch 1033/7393
Processing batch 1034/7393
Processing batch 1035/7393
Processing batch 1036/7393
Processing batch 1037/7393
Processing batch 1038/7393
Processing batch 1039/7393
Processing batch 1040/7393
Processing batch 1041/7393
Processing batch 1042/7393
Processing batch 1043/7393
Processing batch 1044/7393
Processing batch 1045/7393
Processing batch 1046/7393
Processing batch 1047/7393
Processing batch 1048/7393
Processing batch 1049/7393
Processing batch 1050/7393
Processing batch 1051/7393
Processing batch 1052/7393
Processing batch 1053/7393
Processing batch 1054/7393
Processing batch 1055/7393
Processing batch 1056/7393
Processing batch 1057/7393
Processing batch 1058/7393
Processing batch 1059/7393
Processing batch 1060/7393
Processing batch 1061/7393
Processing batch 1062/7393
Processing batch 1063/7393
Processing batch 1064/7393
Processing batch 1065/7393
Process

Problem with batch 1325
Processing batch 1327/7393
Processing batch 1328/7393
Processing batch 1329/7393
Processing batch 1330/7393
Processing batch 1331/7393
Processing batch 1332/7393
Processing batch 1333/7393
Processing batch 1334/7393
Processing batch 1335/7393
Processing batch 1336/7393
Processing batch 1337/7393
Processing batch 1338/7393
Processing batch 1339/7393
Processing batch 1340/7393
Processing batch 1341/7393
Processing batch 1342/7393
Processing batch 1343/7393
Processing batch 1344/7393
Processing batch 1345/7393
Processing batch 1346/7393
Processing batch 1347/7393
Processing batch 1348/7393
Processing batch 1349/7393
Processing batch 1350/7393
Problem with batch 1349
Processing batch 1351/7393
Processing batch 1352/7393
Processing batch 1353/7393
Processing batch 1354/7393
Processing batch 1355/7393
Processing batch 1356/7393
Processing batch 1357/7393
Processing batch 1358/7393
Processing batch 1359/7393
Processing batch 1360/7393
Processing batch 1361/7393
Process

Problem with batch 1673
Processing batch 1675/7393
Processing batch 1676/7393
Processing batch 1677/7393
Processing batch 1678/7393
Processing batch 1679/7393
Processing batch 1680/7393
Processing batch 1681/7393
Problem with batch 1680
Processing batch 1682/7393
Processing batch 1683/7393
Processing batch 1684/7393
Processing batch 1685/7393
Processing batch 1686/7393
Processing batch 1687/7393
Processing batch 1688/7393
Processing batch 1689/7393
Processing batch 1690/7393
Processing batch 1691/7393
Processing batch 1692/7393
Problem with batch 1691
Processing batch 1693/7393
Processing batch 1694/7393
Processing batch 1695/7393
Processing batch 1696/7393
Processing batch 1697/7393
Processing batch 1698/7393
Processing batch 1699/7393
Processing batch 1700/7393
Processing batch 1701/7393
Processing batch 1702/7393
Processing batch 1703/7393
Processing batch 1704/7393
Processing batch 1705/7393
Processing batch 1706/7393
Processing batch 1707/7393
Processing batch 1708/7393
Problem wi

Problem with batch 1975
Processing batch 1977/7393
Processing batch 1978/7393
Processing batch 1979/7393
Processing batch 1980/7393
Processing batch 1981/7393
Processing batch 1982/7393
Processing batch 1983/7393
Processing batch 1984/7393
Problem with batch 1983
Processing batch 1985/7393
Processing batch 1986/7393
Problem with batch 1985
Processing batch 1987/7393
Processing batch 1988/7393
Processing batch 1989/7393
Processing batch 1990/7393
Processing batch 1991/7393
Processing batch 1992/7393
Processing batch 1993/7393
Processing batch 1994/7393
Processing batch 1995/7393
Processing batch 1996/7393
Processing batch 1997/7393
Problem with batch 1996
Processing batch 1998/7393
Processing batch 1999/7393
Processing batch 2000/7393
Processing batch 2001/7393
Processing batch 2002/7393
Processing batch 2003/7393
Processing batch 2004/7393
Processing batch 2005/7393
Processing batch 2006/7393
Processing batch 2007/7393
Processing batch 2008/7393
Processing batch 2009/7393
Processing ba

Problem with batch 2278
Processing batch 2280/7393
Processing batch 2281/7393
Processing batch 2282/7393
Processing batch 2283/7393
Processing batch 2284/7393
Processing batch 2285/7393
Processing batch 2286/7393
Processing batch 2287/7393
Processing batch 2288/7393
Processing batch 2289/7393
Processing batch 2290/7393
Processing batch 2291/7393
Processing batch 2292/7393
Processing batch 2293/7393
Processing batch 2294/7393
Processing batch 2295/7393
Processing batch 2296/7393
Processing batch 2297/7393
Processing batch 2298/7393
Processing batch 2299/7393
Processing batch 2300/7393
Processing batch 2301/7393
Processing batch 2302/7393
Processing batch 2303/7393
Processing batch 2304/7393
Processing batch 2305/7393
Processing batch 2306/7393
Processing batch 2307/7393
Processing batch 2308/7393
Processing batch 2309/7393
Processing batch 2310/7393
Processing batch 2311/7393
Processing batch 2312/7393
Processing batch 2313/7393
Processing batch 2314/7393
Processing batch 2315/7393
Proc

Problem with batch 2596
Processing batch 2598/7393
Processing batch 2599/7393
Processing batch 2600/7393
Processing batch 2601/7393
Processing batch 2602/7393
Processing batch 2603/7393
Processing batch 2604/7393
Processing batch 2605/7393
Processing batch 2606/7393
Problem with batch 2605
Processing batch 2607/7393
Processing batch 2608/7393
Processing batch 2609/7393
Processing batch 2610/7393
Problem with batch 2609
Processing batch 2611/7393
Processing batch 2612/7393
Processing batch 2613/7393
Processing batch 2614/7393
Processing batch 2615/7393
Processing batch 2616/7393
Processing batch 2617/7393
Processing batch 2618/7393
Processing batch 2619/7393
Processing batch 2620/7393
Processing batch 2621/7393
Processing batch 2622/7393
Processing batch 2623/7393
Processing batch 2624/7393
Processing batch 2625/7393
Processing batch 2626/7393
Processing batch 2627/7393
Processing batch 2628/7393
Processing batch 2629/7393
Problem with batch 2628
Processing batch 2630/7393
Processing ba

Problem with batch 2898
Processing batch 2900/7393
Processing batch 2901/7393
Processing batch 2902/7393
Processing batch 2903/7393
Processing batch 2904/7393
Processing batch 2905/7393
Processing batch 2906/7393
Processing batch 2907/7393
Processing batch 2908/7393
Processing batch 2909/7393
Processing batch 2910/7393
Processing batch 2911/7393
Processing batch 2912/7393
Processing batch 2913/7393
Processing batch 2914/7393
Processing batch 2915/7393
Processing batch 2916/7393
Processing batch 2917/7393
Processing batch 2918/7393
Processing batch 2919/7393
Processing batch 2920/7393
Processing batch 2921/7393
Processing batch 2922/7393
Processing batch 2923/7393
Processing batch 2924/7393
Processing batch 2925/7393
Processing batch 2926/7393
Processing batch 2927/7393
Processing batch 2928/7393
Processing batch 2929/7393
Processing batch 2930/7393
Processing batch 2931/7393
Processing batch 2932/7393
Processing batch 2933/7393
Processing batch 2934/7393
Processing batch 2935/7393
Proc

Problem with batch 3261
Processing batch 3263/7393
Processing batch 3264/7393
Processing batch 3265/7393
Processing batch 3266/7393
Processing batch 3267/7393
Processing batch 3268/7393
Processing batch 3269/7393
Processing batch 3270/7393
Processing batch 3271/7393
Processing batch 3272/7393
Processing batch 3273/7393
Processing batch 3274/7393
Processing batch 3275/7393
Processing batch 3276/7393
Processing batch 3277/7393
Processing batch 3278/7393
Processing batch 3279/7393
Processing batch 3280/7393
Processing batch 3281/7393
Processing batch 3282/7393
Processing batch 3283/7393
Processing batch 3284/7393
Processing batch 3285/7393
Processing batch 3286/7393
Processing batch 3287/7393
Processing batch 3288/7393
Problem with batch 3287
Processing batch 3289/7393
Processing batch 3290/7393
Processing batch 3291/7393
Processing batch 3292/7393
Processing batch 3293/7393
Processing batch 3294/7393
Processing batch 3295/7393
Processing batch 3296/7393
Processing batch 3297/7393
Process

Problem with batch 3645
Processing batch 3647/7393
Processing batch 3648/7393
Processing batch 3649/7393
Processing batch 3650/7393
Processing batch 3651/7393
Processing batch 3652/7393
Processing batch 3653/7393
Processing batch 3654/7393
Processing batch 3655/7393
Processing batch 3656/7393
Processing batch 3657/7393
Processing batch 3658/7393
Processing batch 3659/7393
Processing batch 3660/7393
Processing batch 3661/7393
Processing batch 3662/7393
Processing batch 3663/7393
Processing batch 3664/7393
Processing batch 3665/7393
Processing batch 3666/7393
Processing batch 3667/7393
Processing batch 3668/7393
Processing batch 3669/7393
Processing batch 3670/7393
Processing batch 3671/7393
Processing batch 3672/7393
Processing batch 3673/7393
Processing batch 3674/7393
Processing batch 3675/7393
Processing batch 3676/7393
Processing batch 3677/7393
Processing batch 3678/7393
Processing batch 3679/7393
Processing batch 3680/7393
Processing batch 3681/7393
Processing batch 3682/7393
Proc

Processing batch 3952/7393
Processing batch 3953/7393
Processing batch 3954/7393
Processing batch 3955/7393
Processing batch 3956/7393
Processing batch 3957/7393
Processing batch 3958/7393
Processing batch 3959/7393
Processing batch 3960/7393
Processing batch 3961/7393
Processing batch 3962/7393
Problem with batch 3961
Processing batch 3963/7393
Processing batch 3964/7393
Processing batch 3965/7393
Processing batch 3966/7393
Processing batch 3967/7393
Processing batch 3968/7393
Processing batch 3969/7393
Processing batch 3970/7393
Processing batch 3971/7393
Processing batch 3972/7393
Processing batch 3973/7393
Processing batch 3974/7393
Processing batch 3975/7393
Processing batch 3976/7393
Processing batch 3977/7393
Processing batch 3978/7393
Processing batch 3979/7393
Problem with batch 3978
Processing batch 3980/7393
Processing batch 3981/7393
Processing batch 3982/7393
Processing batch 3983/7393
Processing batch 3984/7393
Processing batch 3985/7393
Processing batch 3986/7393
Process

Problem with batch 4280
Processing batch 4282/7393
Processing batch 4283/7393
Processing batch 4284/7393
Processing batch 4285/7393
Processing batch 4286/7393
Processing batch 4287/7393
Processing batch 4288/7393
Processing batch 4289/7393
Processing batch 4290/7393
Processing batch 4291/7393
Processing batch 4292/7393
Processing batch 4293/7393
Processing batch 4294/7393
Processing batch 4295/7393
Processing batch 4296/7393
Processing batch 4297/7393
Processing batch 4298/7393
Processing batch 4299/7393
Processing batch 4300/7393
Processing batch 4301/7393
Processing batch 4302/7393
Processing batch 4303/7393
Processing batch 4304/7393
Processing batch 4305/7393
Processing batch 4306/7393
Processing batch 4307/7393
Processing batch 4308/7393
Processing batch 4309/7393
Processing batch 4310/7393
Processing batch 4311/7393
Processing batch 4312/7393
Processing batch 4313/7393
Processing batch 4314/7393
Processing batch 4315/7393
Processing batch 4316/7393
Processing batch 4317/7393
Proc

Problem with batch 4593
Processing batch 4595/7393
Processing batch 4596/7393
Processing batch 4597/7393
Processing batch 4598/7393
Problem with batch 4597
Processing batch 4599/7393
Processing batch 4600/7393
Processing batch 4601/7393
Processing batch 4602/7393
Processing batch 4603/7393
Processing batch 4604/7393
Processing batch 4605/7393
Processing batch 4606/7393
Processing batch 4607/7393
Processing batch 4608/7393
Processing batch 4609/7393
Processing batch 4610/7393
Processing batch 4611/7393
Processing batch 4612/7393
Processing batch 4613/7393
Processing batch 4614/7393
Processing batch 4615/7393
Processing batch 4616/7393
Processing batch 4617/7393
Processing batch 4618/7393
Processing batch 4619/7393
Processing batch 4620/7393
Processing batch 4621/7393
Processing batch 4622/7393
Problem with batch 4621
Processing batch 4623/7393
Processing batch 4624/7393
Processing batch 4625/7393
Processing batch 4626/7393
Processing batch 4627/7393
Processing batch 4628/7393
Processing

Problem with batch 4890
Processing batch 4892/7393
Processing batch 4893/7393
Processing batch 4894/7393
Problem with batch 4893
Processing batch 4895/7393
Processing batch 4896/7393
Processing batch 4897/7393
Processing batch 4898/7393
Processing batch 4899/7393
Processing batch 4900/7393
Processing batch 4901/7393
Processing batch 4902/7393
Processing batch 4903/7393
Problem with batch 4902
Processing batch 4904/7393
Processing batch 4905/7393
Processing batch 4906/7393
Processing batch 4907/7393
Processing batch 4908/7393
Processing batch 4909/7393
Processing batch 4910/7393
Processing batch 4911/7393
Problem with batch 4910
Processing batch 4912/7393
Processing batch 4913/7393
Processing batch 4914/7393
Processing batch 4915/7393
Processing batch 4916/7393
Processing batch 4917/7393
Processing batch 4918/7393
Processing batch 4919/7393
Processing batch 4920/7393
Processing batch 4921/7393
Processing batch 4922/7393
Processing batch 4923/7393
Processing batch 4924/7393
Processing ba

Problem with batch 5223
Processing batch 5225/7393
Processing batch 5226/7393
Processing batch 5227/7393
Processing batch 5228/7393
Processing batch 5229/7393
Processing batch 5230/7393
Processing batch 5231/7393
Processing batch 5232/7393
Processing batch 5233/7393
Processing batch 5234/7393
Processing batch 5235/7393
Processing batch 5236/7393
Processing batch 5237/7393
Processing batch 5238/7393
Processing batch 5239/7393
Processing batch 5240/7393
Processing batch 5241/7393
Processing batch 5242/7393
Processing batch 5243/7393
Processing batch 5244/7393
Processing batch 5245/7393
Processing batch 5246/7393
Processing batch 5247/7393
Processing batch 5248/7393
Processing batch 5249/7393
Processing batch 5250/7393
Processing batch 5251/7393
Processing batch 5252/7393
Processing batch 5253/7393
Processing batch 5254/7393
Processing batch 5255/7393
Processing batch 5256/7393
Processing batch 5257/7393
Processing batch 5258/7393
Processing batch 5259/7393
Processing batch 5260/7393
Proc

Problem with batch 5526
Processing batch 5528/7393
Problem with batch 5527
Processing batch 5529/7393
Processing batch 5530/7393
Processing batch 5531/7393
Processing batch 5532/7393
Processing batch 5533/7393
Processing batch 5534/7393
Processing batch 5535/7393
Processing batch 5536/7393
Processing batch 5537/7393
Processing batch 5538/7393
Processing batch 5539/7393
Processing batch 5540/7393
Processing batch 5541/7393
Processing batch 5542/7393
Processing batch 5543/7393
Processing batch 5544/7393
Processing batch 5545/7393
Processing batch 5546/7393
Processing batch 5547/7393
Processing batch 5548/7393
Processing batch 5549/7393
Processing batch 5550/7393
Processing batch 5551/7393
Processing batch 5552/7393
Processing batch 5553/7393
Processing batch 5554/7393
Processing batch 5555/7393
Processing batch 5556/7393
Problem with batch 5555
Processing batch 5557/7393
Processing batch 5558/7393
Processing batch 5559/7393
Processing batch 5560/7393
Processing batch 5561/7393
Processing

Problem with batch 5820
Processing batch 5822/7393
Processing batch 5823/7393
Processing batch 5824/7393
Processing batch 5825/7393
Processing batch 5826/7393
Processing batch 5827/7393
Processing batch 5828/7393
Processing batch 5829/7393
Processing batch 5830/7393
Processing batch 5831/7393
Processing batch 5832/7393
Processing batch 5833/7393
Processing batch 5834/7393
Processing batch 5835/7393
Processing batch 5836/7393
Processing batch 5837/7393
Processing batch 5838/7393
Processing batch 5839/7393
Processing batch 5840/7393
Processing batch 5841/7393
Processing batch 5842/7393
Processing batch 5843/7393
Processing batch 5844/7393
Processing batch 5845/7393
Processing batch 5846/7393
Processing batch 5847/7393
Processing batch 5848/7393
Processing batch 5849/7393
Processing batch 5850/7393
Processing batch 5851/7393
Processing batch 5852/7393
Processing batch 5853/7393
Processing batch 5854/7393
Processing batch 5855/7393
Processing batch 5856/7393
Processing batch 5857/7393
Proc

Problem with batch 6160
Processing batch 6162/7393
Processing batch 6163/7393
Processing batch 6164/7393
Processing batch 6165/7393
Processing batch 6166/7393
Processing batch 6167/7393
Processing batch 6168/7393
Processing batch 6169/7393
Processing batch 6170/7393
Processing batch 6171/7393
Processing batch 6172/7393
Processing batch 6173/7393
Processing batch 6174/7393
Processing batch 6175/7393
Processing batch 6176/7393
Processing batch 6177/7393
Processing batch 6178/7393
Processing batch 6179/7393
Processing batch 6180/7393
Processing batch 6181/7393
Processing batch 6182/7393
Processing batch 6183/7393
Processing batch 6184/7393
Processing batch 6185/7393
Processing batch 6186/7393
Processing batch 6187/7393
Processing batch 6188/7393
Processing batch 6189/7393
Processing batch 6190/7393
Processing batch 6191/7393
Processing batch 6192/7393
Processing batch 6193/7393
Processing batch 6194/7393
Processing batch 6195/7393
Processing batch 6196/7393
Processing batch 6197/7393
Proc

Problem with batch 6468
Processing batch 6470/7393
Processing batch 6471/7393
Processing batch 6472/7393
Processing batch 6473/7393
Processing batch 6474/7393
Processing batch 6475/7393
Processing batch 6476/7393
Processing batch 6477/7393
Processing batch 6478/7393
Processing batch 6479/7393
Processing batch 6480/7393
Processing batch 6481/7393
Processing batch 6482/7393
Processing batch 6483/7393
Processing batch 6484/7393
Processing batch 6485/7393
Problem with batch 6484
Processing batch 6486/7393
Processing batch 6487/7393
Processing batch 6488/7393
Processing batch 6489/7393
Processing batch 6490/7393
Processing batch 6491/7393
Processing batch 6492/7393
Processing batch 6493/7393
Processing batch 6494/7393
Processing batch 6495/7393
Processing batch 6496/7393
Processing batch 6497/7393
Processing batch 6498/7393
Processing batch 6499/7393
Processing batch 6500/7393
Processing batch 6501/7393
Processing batch 6502/7393
Processing batch 6503/7393
Processing batch 6504/7393
Process

Problem with batch 6766
Processing batch 6768/7393
Processing batch 6769/7393
Processing batch 6770/7393
Processing batch 6771/7393
Processing batch 6772/7393
Processing batch 6773/7393
Processing batch 6774/7393
Processing batch 6775/7393
Processing batch 6776/7393
Processing batch 6777/7393
Processing batch 6778/7393
Processing batch 6779/7393
Processing batch 6780/7393
Processing batch 6781/7393
Processing batch 6782/7393
Processing batch 6783/7393
Processing batch 6784/7393
Processing batch 6785/7393
Processing batch 6786/7393
Processing batch 6787/7393
Processing batch 6788/7393
Processing batch 6789/7393
Processing batch 6790/7393
Processing batch 6791/7393
Processing batch 6792/7393
Processing batch 6793/7393
Processing batch 6794/7393
Processing batch 6795/7393
Processing batch 6796/7393
Processing batch 6797/7393
Processing batch 6798/7393
Processing batch 6799/7393
Processing batch 6800/7393
Processing batch 6801/7393
Processing batch 6802/7393
Processing batch 6803/7393
Proc

Problem with batch 7104
Processing batch 7106/7393
Processing batch 7107/7393
Processing batch 7108/7393
Processing batch 7109/7393
Processing batch 7110/7393
Processing batch 7111/7393
Processing batch 7112/7393
Processing batch 7113/7393
Problem with batch 7112
Processing batch 7114/7393
Processing batch 7115/7393
Processing batch 7116/7393
Processing batch 7117/7393
Processing batch 7118/7393
Processing batch 7119/7393
Processing batch 7120/7393
Processing batch 7121/7393
Processing batch 7122/7393
Processing batch 7123/7393
Processing batch 7124/7393
Processing batch 7125/7393
Processing batch 7126/7393
Processing batch 7127/7393
Processing batch 7128/7393
Processing batch 7129/7393
Processing batch 7130/7393
Processing batch 7131/7393
Processing batch 7132/7393
Processing batch 7133/7393
Processing batch 7134/7393
Processing batch 7135/7393
Problem with batch 7134
Processing batch 7136/7393
Processing batch 7137/7393
Processing batch 7138/7393
Processing batch 7139/7393
Processing

In [6]:
# Load all numpy files
numpy_files = [os.path.join(features_path, f) for f in os.listdir(features_path) if f.endswith('.npy')]
features_list = [np.load(features_file) for features_file in sorted(numpy_files)]

# Concatenate the features and store in a merged file
features = np.concatenate(features_list)
np.save(os.path.join(dataset_path, "features.npy"), features)

# Load all the photo IDs
csv_files = [os.path.join(features_path, f) for f in os.listdir(features_path) if f.endswith('.csv')]
photo_ids = pd.concat([pd.read_csv(ids_file) for ids_file in sorted(csv_files)])
photo_ids.to_csv(os.path.join(dataset_path,"photo_ids.csv"), index=False)

KeyboardInterrupt: 

In [30]:
photo_data_path = os.path.join(dataset_path, 'photo_data.csv')

#open the previously generated csv file
with open(photo_data_path, 'w', encoding='UTF8', newline='') as photo_csv:
    photo_data = csv.writer(photo_csv)
    header = ['id', 'url', 'description']
    photo_data.writerow(header)

    # merge val and tain dataset to get more images to search through
    for coco_anotations in [os.path.join(dataset_path, 'annotations/captions_train2017.json'),
                            os.path.join(dataset_path, 'annotations/captions_val2017.json')]:
        with open(coco_anotations) as annotations_json:
            annotations = json.load(annotations_json)
            for image in annotations['images']:
                # gets annotation for current picture
                annotation = next(anno for anno in annotations['annotations'] if anno['image_id'] == image['id'])['caption']
                # saves image data in our own photo data
                data = [image['id'], image['coco_url'], annotation]
                photo_data.writerow(data)

    # delete all files exept for images json and 

### Step 3: Cleanup
Remove all data and directories that were used to construct the feature list and photo_data.

In [36]:
%%bash
cd coco_dataset
rm -r features
rm -r annotations
rm -r photos