<a href="https://colab.research.google.com/github/Germoe/Germoe.github.io/blob/master/OpenClip_Experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Clean up Sample Data
# -f makes the removal a no-op when the directory is already gone, so the cell
# can be re-run without printing an error.
!rm -rf ./sample_data

rm: cannot remove './sample_data': No such file or directory


In [3]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q sentence-transformers torch torchvision faiss-gpu

In [4]:
from sentence_transformers import SentenceTransformer
from PIL import Image
import os
import requests
import zipfile

import numpy as np
import pandas as pd
import glob

In [5]:
# Number of photos to sample from the metadata and download.
IMAGE_N = 1000

In [6]:
def download_file(url, filename, timeout=60):
    """Download ``url`` and save the response body to ``filename``.

    Args:
        url: Address to fetch; redirects are followed.
        filename: Path of the local file that is (over)written with the body.
        timeout: Seconds to wait for the server to connect/respond before
            giving up, so a stalled connection cannot hang the notebook.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never saved in place of the requested file.
    """
    # Send a HTTP request to the URL
    r = requests.get(url, allow_redirects=True, timeout=timeout)
    r.raise_for_status()

    # Write the content of the request to a file; the context manager makes
    # sure the handle is flushed and closed.
    with open(filename, 'wb') as f:
        f.write(r.content)

def unzip_file(zip_filepath, dest_path):
    """Unpack every member of the archive at ``zip_filepath`` into ``dest_path``."""
    archive = zipfile.ZipFile(zip_filepath)
    try:
        archive.extractall(path=dest_path)
    finally:
        # Always release the archive handle, even if extraction fails.
        archive.close()

def delete_file(filepath):
    """Delete the file located at ``filepath``."""
    os.unlink(filepath)

def load_photos(meta_dir='./images/meta/'):
    """Load the photo metadata shards into a single DataFrame.

    Reads every ``photos.tsv*`` file found in ``meta_dir`` (tab-separated,
    first row is the header) and concatenates them with a fresh integer index.

    Args:
        meta_dir: Directory that holds the metadata shards.

    Returns:
        pandas.DataFrame with the rows of all shards, in sorted filename order.

    Raises:
        FileNotFoundError: If no ``photos.tsv*`` file exists in ``meta_dir``.
    """
    pattern = os.path.join(meta_dir, 'photos.tsv*')
    # glob order depends on the filesystem; sort so the row order is reproducible.
    files = sorted(glob.glob(pattern))
    if not files:
        # Fail with a clear message instead of pd.concat's
        # "No objects to concatenate".
        raise FileNotFoundError(f"No photo metadata found matching {pattern!r}")

    subsets = [pd.read_csv(filename, sep='\t', header=0) for filename in files]
    return pd.concat(subsets, axis=0, ignore_index=True)
    

try:
    # Reuse the metadata if it is already on disk.
    photos = load_photos()
except (FileNotFoundError, ValueError):
    # load_photos() raises when no photos.tsv* file is present yet (for example
    # a ValueError from concatenating an empty list). Only that situation
    # should trigger the download; catching everything would also swallow
    # KeyboardInterrupt and genuine bugs.
    url = 'https://unsplash.com/data/lite/latest'
    filename = 'images.zip'
    download_file(url, filename)

    # Extract the archive into the metadata folder.
    unzip_file(filename, './images/meta')

    # Delete the zip file
    delete_file(filename)

    # Load the photos
    photos = load_photos()

    # Delete all files in the meta folder except the photos.tsv
    files = [f for f in glob.glob('./images/meta/*') if f != './images/meta/photos.tsv000']
    for f in files:
        delete_file(f)

In [7]:
import os
import re

# IDs of the images that already exist on disk,
# e.g. './images/12Ed345.jpg' -> '12Ed345'. A comprehension keeps the loop
# variables local instead of leaving a global `id` that shadows the builtin.
downloaded_photos = [
    os.path.splitext(os.path.basename(filepath))[0]
    for filepath in glob.glob("./images/*.jpg")
]

# Reuse the already-downloaded photos only when there are enough of them.
# On a fresh (or partially downloaded) runtime the filter would leave fewer
# than IMAGE_N rows and the sampling step that follows would fail, so the full
# metadata is kept in that case.
photos_on_disk = photos.loc[photos['photo_id'].isin(downloaded_photos), :]
if len(photos_on_disk) >= IMAGE_N:
    photos = photos_on_disk
photos.shape

(1000, 31)

In [12]:
# Select N photos randomly (capped at the number of rows that are available,
# because sampling more rows than exist raises a ValueError).
photos = photos.sample(n=min(IMAGE_N, len(photos)))
for photo_id, url in zip(photos['photo_id'], photos['photo_image_url']):
    target = f"./images/{photo_id}.jpg"
    # Skip images that are already on disk so re-running the cell does not
    # download them a second time.
    if not os.path.exists(target):
        download_file(url, target)

In [32]:
from torch.cuda import empty_cache

img_model = SentenceTransformer('clip-ViT-B-32')

batch_size = 10
embeddings = []

for i in range(0, len(photos), batch_size):
    images = []
    for photo_id in photos['photo_id'].iloc[i:i+batch_size]:
        # Image.open keeps the underlying file open; copy the pixel data inside
        # a context manager so every file handle is closed before moving on.
        with Image.open(f"./images/{photo_id}.jpg") as img:
            images.append(img.copy())
    embeddings.extend(img_model.encode(images))

    # Clearing up the memory after processing each batch
    del images
    empty_cache()

# One row per photo, in the same order as `photos`.
embeddings = np.array(embeddings)

In [33]:
import faiss
import math
from faiss import index_factory

COUNT, DIM = embeddings.shape

# Lower bound on training vectors per IVF cell.
# NOTE(review): 39 appears to mirror FAISS's default k-means
# min_points_per_centroid — confirm against the installed version.
MIN_POINTS_PER_CELL = 39

storage = "Flat"
# About sqrt(COUNT) cells, capped so each cell has enough training vectors.
# At least one cell is required: small collections would otherwise produce the
# invalid factory string "IVF0,Flat".
cells = max(1, min(round(math.sqrt(COUNT)), COUNT // MIN_POINTS_PER_CELL))
params = f"IVF{cells},{storage}"
index = index_factory(DIM, params)

# Move the (still untrained) index onto GPU 0.
res = faiss.StandardGpuResources()
index = faiss.index_cpu_to_gpu(res, 0, index)

In [54]:
# Photo ids as a flat array, in the row order of `photos`.
ids = np.array(photos["photo_id"].tolist()).reshape(-1)

# Forward (photo id -> integer) and reverse (integer -> photo id) lookups;
# integers are assigned in order of first appearance.
unique_photo_ids = photos["photo_id"].unique()
id_to_int = dict(zip(unique_photo_ids, range(len(unique_photo_ids))))
int_to_id = {position: photo_id for photo_id, position in id_to_int.items()}

# Integer id of every row, as int64.
ids_np = np.asarray(
    [id_to_int[photo_id] for photo_id in photos["photo_id"]],
    dtype='int64',
)


In [55]:
# Train the index on the embeddings first, then add the same vectors under the
# integer ids derived from the photo ids.
index.train(embeddings)
index.add_with_ids(embeddings,ids_np)

In [30]:
# Text Model
# The imported class is `SentenceTransformer`; the original call used an
# undefined name and a placeholder model id.
# NOTE(review): 'clip-ViT-B-32' is assumed here so that text queries share the
# embedding space of the image vectors in the index — confirm that this is the
# intended text encoder.
text_model = SentenceTransformer('clip-ViT-B-32')

['gSF9dHUVK_Y',
 'qjGz9PJg3sk',
 'd7t7TjsBfpY',
 'B3hNQbPSg0I',
 'lgMXYafVFiU',
 'NrflUuJJK0I',
 'ZSoyDUotA2c',
 'qy8XDguG9bw',
 'yJ9sc4peomM',
 'JFWZAABDDOo',
 '4Vo3VBP7aMs',
 'ls4OK8rINvc',
 'llCkggB7uCQ',
 'D5Qx3AbcGuM',
 'KOnHehVUYnQ',
 '_fJD6NyUR8I',
 'OcqGIG3Pkm8',
 'e3lFmTHKX0Q',
 'IcC2FkA7Fms',
 '2QX4INtycDQ',
 'Y3DW4MsvmFE',
 'pksP_JalmKI',
 'WZdGjSAabls',
 'i1T50iU9GV4',
 'FCqMgRuB4D4',
 '4UexhWLVcnY',
 'a_AFFD1U2LU',
 'C2a4RGapd8s',
 'wBeZ9p_3K0g',
 '0hvmAoYkQYM',
 'jzY47T8vh-U',
 'Qxzn3Fwy3qU',
 'cnyE0EnkrTg',
 'mVUxdi7yOOI',
 'gnda23A98w4',
 'RDLQMrWY5WA',
 'Bxwi4zf1vhs',
 '0PUQAOpLSIQ',
 '8C1_maRH3nY',
 'gxCFOgUrT-Q',
 'iFjUiWx8lKo',
 'LoiTMNlopsA',
 'kDrTt4sCC7I',
 'elJc2RFEg7M',
 'LWAgtKaUBpo',
 'MYnhW-AYmQM',
 '0PgC0jve7u4',
 'mqtn_g1ogIY',
 'ZUV1IgqvWlA',
 'N1vdGcHcrKI',
 'fh2JefbNlII',
 'BHuC9BoigOU',
 'Y5ZEfGtx6vU',
 'XjrglbPoMHo',
 'aSRT6yjo0nQ',
 'M5kI-5OFviE',
 'SE4jDznVwRg',
 'JrFfuHQr-L4',
 'SD68VmEjzdA',
 '2_mO5_qyevs',
 'cNL9PYsgVNE',
 'DVcODHQT2xo',
 '2aebgO

In [25]:
# Authenticate the current Colab user.
# NOTE(review): presumably needed so the Google Cloud Storage client can obtain
# credentials — confirm.
from google.colab import auth
auth.authenticate_user()

In [26]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install google-cloud-storage

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [27]:
from google.cloud import storage

def list_blobs(bucket_name):
    """Print the name of every blob in the bucket called ``bucket_name``."""
    client = storage.Client()

    # Client.list_blobs is only available from package version 1.17.0 onwards.
    for entry in client.list_blobs(bucket_name):
        print(entry.name)

# Print the blob names of the bucket below; change the name to inspect a different bucket.
list_blobs('clip-indexes-experiment-initial')

In [None]:
ids = [12345]
# NOTE(review): the original comprehension had no output expression and did not
# parse. The '<id>.jpg' form is an assumption based on the file naming used
# when the images are downloaded — confirm the intended expression.
filenames = [f"{photo_id}.jpg" for photo_id in ids]