In [3]:
import json
import math
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO

import numpy as np
import open_clip
import pandas as pd
import requests
import torch
from PIL import Image


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Path to the input file handed to `process_images`; `extract_data` parses it
# as one JSON document per line (presumably an iDigBio export, going by the name).
# NOTE(review): this is an absolute, machine-specific path — point it at your
# own copy of the file before running the notebook.
input_file_name = "/Users/elangrossman/Downloads/idigbio.json"

In [5]:

# Build the OpenCLIP 'ViT-B-32' model with the 'laion2b_s34b_b79k' pretrained tag.
# `create_model_and_transforms` returns three values; the middle one is discarded
# here, and `preprocess` is the transform applied to each image before
# `model.encode_image` in `download_image_and_preprocess`.
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
model.eval()  # model in train mode by default, impacts some models with BatchNorm or stochastic depth active
# NOTE(review): `tokenizer` is not referenced by any other cell visible in this
# notebook; it would only be needed for encoding text — confirm before removing.
tokenizer = open_clip.get_tokenizer('ViT-B-32')

In [6]:
def extract_data(file_location):
    """Flatten a JSON-lines records file into one row per media item.

    Each non-blank line of ``file_location`` is parsed as a JSON object. The
    object's ``"data"`` mapping supplies ``"dwc:scientificName"`` and its
    ``"media"`` list supplies media entries, each with its own ``"data"``
    mapping. One row is produced for every media entry that carries a
    non-empty ``"ac:accessURI"``; entries without one are skipped.

    Args:
        file_location: Path to the JSON-lines file.

    Returns:
        pandas.DataFrame with the columns ``scientificName`` and
        ``media_location`` (in that order). Both columns exist even when no
        rows were found, so callers can always select ``media_location``.
        ``scientificName`` is missing (``None``/NaN) for records that do not
        provide one.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    columns = ["scientificName", "media_location"]
    rows = []

    # Stream the file line by line rather than materialising it with
    # readlines(); the input can be large.
    with open(file_location, "rb") as f:
        for raw_line in f:
            # Blank lines (e.g. a trailing newline) are not JSON documents.
            if not raw_line.strip():
                continue

            record = json.loads(raw_line)
            scientific_name = (record.get("data") or {}).get("dwc:scientificName")

            # `or []` / `or {}` tolerate both absent keys and explicit nulls.
            for med in record.get("media") or []:
                url = (med.get("data") or {}).get("ac:accessURI")
                if not url:
                    continue
                rows.append(
                    {"scientificName": scientific_name, "media_location": url}
                )

    return pd.DataFrame(rows, columns=columns)
    

In [21]:
def download_image_and_preprocess(image_location, timeout=30):
    """Download a single image and return its embedding from ``model``.

    The image is fetched over HTTP and decoded from memory (nothing is written
    to disk), passed through the module-level ``preprocess`` transform, given
    a leading batch dimension of 1 and encoded with the module-level
    ``model.encode_image``.

    Args:
        image_location: URL of the image to download.
        timeout: Seconds to wait on the server before giving up; forwarded to
            ``requests.get``. Without a timeout a stalled server would block
            the calling worker thread indefinitely.

    Returns:
        The tensor returned by ``model.encode_image`` for the one-image batch.
        It is computed with gradient tracking disabled.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
        OSError: If the downloaded bytes cannot be decoded as an image
            (includes ``PIL.UnidentifiedImageError``).
    """
    response = requests.get(image_location, timeout=timeout)
    # Fail on 4xx/5xx here instead of feeding an HTML error page to PIL.
    response.raise_for_status()

    with Image.open(BytesIO(response.content)) as img:
        batch = preprocess(img).unsqueeze(0)

    # Inference only: skip building an autograd graph for every image.
    with torch.no_grad():
        return model.encode_image(batch)

###
def process_images(file_location, output_path="vectors.txt", batch_size=10, max_workers=5):
    """Embed every image referenced in a records file and append the vectors to a text file.

    The media URLs are read with ``extract_data`` and processed in batches of
    ``batch_size``. Within a batch, ``download_image_and_preprocess`` runs on
    a thread pool so several downloads are in flight at once. Images are only
    held in memory and never saved. Results are written in the same order as
    the rows returned by ``extract_data`` because ``Executor.map`` yields in
    input order.

    Output format: one line per image, produced by
    ``', '.join(map(str, tensor.tolist()))``. Assuming the embedding keeps the
    leading batch dimension of 1 added during preprocessing, ``tolist()`` is a
    one-element list, so each line is the bracketed text of the inner list
    (``[v0, v1, ...]``). This format is kept unchanged for compatibility with
    files written by earlier runs.

    Args:
        file_location: Path to the JSON-lines records file.
        output_path: File to append to (opened in append mode, so repeated
            runs add more lines rather than overwrite).
        batch_size: Number of URLs submitted to the pool at a time; bounds how
            many pending results can accumulate in memory.
        max_workers: Number of download/encode threads.

    Returns:
        None. The vectors are written to ``output_path``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        Exception: Any error raised while downloading or encoding an image
            propagates and stops processing; lines already written are kept.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    df = extract_data(file_location)
    # Nothing to embed; the previous chunking logic raised on empty input.
    if df.empty:
        return

    media_locations = df["media_location"].tolist()

    # One pool for the whole run instead of a fresh pool per batch.
    with open(output_path, "a") as out_file, \
            ThreadPoolExecutor(max_workers=max_workers) as executor:
        for start in range(0, len(media_locations), batch_size):
            batch = media_locations[start:start + batch_size]
            for embedding in executor.map(download_image_and_preprocess, batch):
                values = embedding.detach().cpu().numpy().tolist()
                out_file.write(', '.join(map(str, values)) + '\n')



In [22]:
# Run the whole pipeline on the configured input file.
# NOTE: the output file "vectors.txt" is opened in append mode, so re-running
# this cell adds the vectors again instead of replacing the previous run's output.
process_images(input_file_name)