# Natural Language Processing and the Web WS 2021/22 - Practice Class: 

# Project on Multimodal Image Processing

## Names: Luca Ciegelski, Luka Zdravkovic and Lennart Bengtson

In [5]:
%%bash
echo 'downloading unsplash photo.tsv00'
mkdir unsplash_dataset
cd unsplash_dataset
echo 'downloading photos.tsv000...'
aria2c -x 10 -j 10 https://cloud.mafiasi.de/s/sAg5DCQ7nebnjYd/download/photos.tsv000 --check-certificate=false

downloading unsplash photo.tsv00
downloading photos.tsv000...

01/15 09:42:45 [[1;32mNOTICE[0m] Downloading 1 item(s)

01/15 09:42:45 [[1;32mNOTICE[0m] Allocating disk space. Use --file-allocation=none to disable it. See --file-allocation option in man page for more details.
[#234c84 2.0MiB/8.6MiB(23%) CN:1 DL:2.8MiB ETA:2s]

01/15 09:42:47 [[1;32mNOTICE[0m] Download complete: /mnt/e/projects/multimodal_image_processing_search_engine/unsplash_dataset/photos.tsv000

Download Results:
gid   |stat|avg speed  |path/URI
234c84|OK  |   5.4MiB/s|/mnt/e/projects/multimodal_image_processing_search_engine/unsplash_dataset/photos.tsv000

Status Legend:
(OK):download completed.


In [6]:
# imports
import os
import math
import numpy as np
import pandas as pd
import urllib.request
import csv

import torch
from PIL import Image
from IPython.display import Image as im
from IPython.core.display import HTML
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer # TODO tokenizer weg

### Step 1: Download the dataset

In [7]:
cwd = os.getcwd()
dataset_path = os.path.join(cwd, "unsplash_dataset")
# unsplash_dataset_path = Path("unsplash-dataset") / dataset_version

# Read the photos table
photos = pd.read_csv(os.path.join(dataset_path, "photos.tsv000"), sep='\t', header=0)

# Extract the IDs and the URLs of the photos
photo_urls = photos[['photo_id', 'photo_image_url']].values.tolist()

In [8]:
%%bash
mkdir -p unsplash_dataset/photos

In [9]:
# Path where the photos will be downloaded
photos_download_path = os.path.join(dataset_path, "photos")
print(photos_download_path)

# creates the photo folder if it does not exist
directory = os.path.dirname(photos_download_path)
if not os.path.exists(directory):
    os.makedirs(directory)

# Function that downloads a single photo
def download_photo(photo):
    # Get the ID of the photo
    photo_id = photo[0]

    # Get the URL of the photo (setting the width to 640 pixels)
    photo_url = photo[1] + "?w=640"

    # Path where the photo will be stored
    photo_path = os.path.join(photos_download_path, photo_id + ".jpg")

    # Only download a photo if it doesn't exist
    if not os.path.isfile(photo_path):
        try:
            urllib.request.urlretrieve(photo_url, photo_path)
        except:
            # Catch the exception if the download fails for some reason
            print(f"Cannot download {photo_url}")
            pass

/mnt/e/projects/multimodal_image_processing_search_engine/unsplash_dataset/photos


In [10]:
from multiprocessing.pool import ThreadPool

# Create the thread pool
threads_count = 16
pool = ThreadPool(threads_count)

# Start the download
pool.map(download_photo, photo_urls)

# Display some statistics
display(f'Photos downloaded: {len(photos)}')

Cannot download https://images.unsplash.com/photo-1481930079977-24a345fcae85?w=640
Cannot download https://images.unsplash.company%20by%20Alessandro%20Desantis%20-%20Downloaded%20from%20500px_jpg.jpg?w=640
Cannot download https://images.unsplash.com_TheBeach.jpg?w=640
Cannot download https://images.unsplash.com-grass-sun.jpg?w=640
Cannot download https://images.unsplash.com/photo-1435874348809-6e61b02aa907?w=640


'Photos downloaded: 25000'

### Step 2: Process the dataset using the model

In [11]:
# Set the path to the photos
photos_path = os.path.join(dataset_path, "photos")

# List all JPGs in the folder
# photos_files = list(photos_path.glob("*.jpg"))
photos_files = [os.path.join(photos_path, f) for f in os.listdir(photos_path) if f.endswith('.jpg')]

# Print some statistics
print(f"Photos found: {len(photos_files)}")

Photos found: 24995


In [12]:
# Load the open CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Function that computes the feature vectors for a batch of images
def compute_clip_features(photos_batch):
    # Load all the photos from the files
    photos = [Image.open(photo_file) for photo_file in photos_batch]
    
    # Preprocess all photos
    photos_preprocessed = processor(images=photos, return_tensors='pt').to(device)
    # print(photos_preprocessed)

    with torch.no_grad():
        # Encode the photos batch to compute the feature vectors and normalize them
        photos_features = model.get_image_features(**photos_preprocessed)
        photos_features /= photos_features.norm(dim=-1, keepdim=True)

    # Transfer the feature vectors back to the CPU and convert to numpy
    return photos_features.cpu().numpy()

ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


In [13]:
%%bash
mkdir -p unsplash_dataset/features

In [14]:
# Define the batch size so that it fits on your GPU. You can also do the processing on the CPU, but it will be slower.
batch_size = 16

# Path where the feature vectors will be stored
features_path = os.path.join(dataset_path, "features")

# Compute how many batches are needed
batches = math.ceil(len(photos_files) / batch_size)

# Process each batch
for i in range(batches):
    print(f"Processing batch {i+1}/{batches}")

    batch_ids_path = os.path.join(features_path, f"{i:010d}.csv")
    batch_features_path = os.path.join(features_path, f"{i:010d}.npy")
    
    # Only do the processing if the batch wasn't processed yet
    if not os.path.isfile(batch_features_path):
        try:
            # Select the photos for the current batch
            batch_files = photos_files[i*batch_size : (i+1)*batch_size]

            # Compute the features and save to a numpy file
            batch_features = compute_clip_features(batch_files)
            np.save(batch_features_path, batch_features)

            # Save the photo IDs to a CSV file
            photo_ids = [os.path.splitext(os.path.basename(photo_file))[0] for photo_file in batch_files]
            photo_ids_data = pd.DataFrame(photo_ids, columns=['photo_id'])
            photo_ids_data.to_csv(batch_ids_path, index=False)
        except:
            # Catch problems with the processing to make the process more robust
            print(f'Problem with batch {i}')

Processing batch 1/1563
Processing batch 2/1563
Processing batch 3/1563
Processing batch 4/1563
Processing batch 5/1563
Processing batch 6/1563
Processing batch 7/1563
Processing batch 8/1563
Processing batch 9/1563
Processing batch 10/1563
Processing batch 11/1563
Processing batch 12/1563
Processing batch 13/1563
Processing batch 14/1563
Processing batch 15/1563
Processing batch 16/1563
Processing batch 17/1563
Processing batch 18/1563
Processing batch 19/1563
Processing batch 20/1563
Processing batch 21/1563
Processing batch 22/1563
Processing batch 23/1563
Processing batch 24/1563
Processing batch 25/1563
Processing batch 26/1563
Processing batch 27/1563
Processing batch 28/1563
Processing batch 29/1563
Processing batch 30/1563
Processing batch 31/1563
Processing batch 32/1563
Processing batch 33/1563
Processing batch 34/1563
Processing batch 35/1563
Processing batch 36/1563
Processing batch 37/1563
Processing batch 38/1563
Processing batch 39/1563
Processing batch 40/1563
Processin

Processing batch 321/1563
Processing batch 322/1563
Processing batch 323/1563
Processing batch 324/1563
Processing batch 325/1563
Processing batch 326/1563
Processing batch 327/1563
Processing batch 328/1563
Processing batch 329/1563
Processing batch 330/1563
Processing batch 331/1563
Processing batch 332/1563
Processing batch 333/1563
Processing batch 334/1563
Processing batch 335/1563
Processing batch 336/1563
Processing batch 337/1563
Processing batch 338/1563
Processing batch 339/1563
Processing batch 340/1563
Processing batch 341/1563
Processing batch 342/1563
Processing batch 343/1563
Processing batch 344/1563
Processing batch 345/1563
Processing batch 346/1563
Processing batch 347/1563
Processing batch 348/1563
Processing batch 349/1563
Processing batch 350/1563
Processing batch 351/1563
Processing batch 352/1563
Processing batch 353/1563
Processing batch 354/1563
Processing batch 355/1563
Processing batch 356/1563
Processing batch 357/1563
Processing batch 358/1563
Processing b

Processing batch 637/1563
Processing batch 638/1563
Processing batch 639/1563
Processing batch 640/1563
Processing batch 641/1563
Processing batch 642/1563
Processing batch 643/1563
Processing batch 644/1563
Processing batch 645/1563
Processing batch 646/1563
Processing batch 647/1563
Processing batch 648/1563
Processing batch 649/1563
Processing batch 650/1563
Processing batch 651/1563
Processing batch 652/1563
Processing batch 653/1563
Processing batch 654/1563
Processing batch 655/1563
Processing batch 656/1563
Processing batch 657/1563
Processing batch 658/1563
Processing batch 659/1563
Processing batch 660/1563
Processing batch 661/1563
Processing batch 662/1563
Processing batch 663/1563
Processing batch 664/1563
Processing batch 665/1563
Processing batch 666/1563
Processing batch 667/1563
Processing batch 668/1563
Processing batch 669/1563
Processing batch 670/1563
Processing batch 671/1563
Processing batch 672/1563
Processing batch 673/1563
Processing batch 674/1563
Processing b

Processing batch 952/1563
Processing batch 953/1563
Processing batch 954/1563
Processing batch 955/1563
Processing batch 956/1563
Processing batch 957/1563
Problem with batch 956
Processing batch 958/1563
Processing batch 959/1563
Processing batch 960/1563
Processing batch 961/1563
Processing batch 962/1563
Processing batch 963/1563
Processing batch 964/1563
Processing batch 965/1563
Processing batch 966/1563
Processing batch 967/1563
Processing batch 968/1563
Processing batch 969/1563
Processing batch 970/1563
Processing batch 971/1563
Processing batch 972/1563
Processing batch 973/1563
Processing batch 974/1563
Processing batch 975/1563
Processing batch 976/1563
Processing batch 977/1563
Processing batch 978/1563
Processing batch 979/1563
Processing batch 980/1563
Processing batch 981/1563
Processing batch 982/1563
Processing batch 983/1563
Processing batch 984/1563
Processing batch 985/1563
Processing batch 986/1563
Processing batch 987/1563
Processing batch 988/1563
Processing batc

Processing batch 1256/1563
Processing batch 1257/1563
Processing batch 1258/1563
Processing batch 1259/1563
Processing batch 1260/1563
Processing batch 1261/1563
Processing batch 1262/1563
Processing batch 1263/1563
Processing batch 1264/1563
Processing batch 1265/1563
Processing batch 1266/1563
Processing batch 1267/1563
Processing batch 1268/1563
Processing batch 1269/1563
Processing batch 1270/1563
Processing batch 1271/1563
Processing batch 1272/1563
Processing batch 1273/1563
Processing batch 1274/1563
Processing batch 1275/1563
Processing batch 1276/1563
Processing batch 1277/1563
Processing batch 1278/1563
Processing batch 1279/1563
Processing batch 1280/1563
Problem with batch 1279
Processing batch 1281/1563
Processing batch 1282/1563
Processing batch 1283/1563
Processing batch 1284/1563
Processing batch 1285/1563
Processing batch 1286/1563
Processing batch 1287/1563
Processing batch 1288/1563
Processing batch 1289/1563
Processing batch 1290/1563
Processing batch 1291/1563
Proc

Processing batch 1557/1563
Processing batch 1558/1563
Processing batch 1559/1563
Processing batch 1560/1563
Processing batch 1561/1563
Processing batch 1562/1563
Processing batch 1563/1563


In [10]:
# Load all numpy files
numpy_files = [os.path.join(features_path, f) for f in os.listdir(features_path) if f.endswith('.npy')]
features_list = [np.load(features_file) for features_file in sorted(numpy_files)]

# Concatenate the features and store in a merged file
features = np.concatenate(features_list)
np.save(os.path.join(dataset_path, "features.npy"), features)

# Load all the photo IDs
csv_files = [os.path.join(features_path, f) for f in os.listdir(features_path) if f.endswith('.csv')]
photo_ids = pd.concat([pd.read_csv(ids_file) for ids_file in sorted(csv_files)])
photo_ids.to_csv(os.path.join(dataset_path,"photo_ids.csv"), index=False)

In [44]:
%%bash
cd unsplash_dataset
touch photo_data.csv

In [45]:
photo_data_path = os.path.join(dataset_path, 'photo_data.csv')

# reduces old csv to just 3 columns and renames them
photo_data = photos[['photo_id', 'photo_image_url', 'photo_description']]
photo_data.rename(columns={'photo_id': 'id', 'photo_image_url': 'url', 'photo_description': 'description'}, inplace=True)

# save csv
photo_data.to_csv(photo_data_path, index=False)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


### Step 3: Cleanup
Remove all data and directories that were used to construct the feature list and photo_data.

In [40]:
%%bash
cd unsplash_dataset
rm photos.tsv000
rm -r photos/
rm -r features/