# Import and run embeddings vectorization of iDigBio sample
This is my (Thomas Deatherage) first run at vectorizing a subset of the data from iDigBio to produce embeddings with the [CLIP](https://github.com/openai/CLIP) model.

To run this notebook, you'll need to make sure you have the right deps installed.  If you're using the Jupyter dev container included with this project, you'll need to `$ docker exec -it nfhm_devcontainer-jupyter-1 bash` and then run `$ pip install git+https://github.com/openai/CLIP.git` and `$ pip install pymongo`

## Import dependencies

In [10]:
import torch
import clip
from PIL import Image
from pymongo import MongoClient
import requests
from io import BytesIO


### Loading clip -- takes a few moments

In [13]:
device="cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

In [14]:
def generate_embeddings(image_url, text):
    # Fetch the image from the URL
    response = requests.get(image_url)
    image = Image.open(BytesIO(response.content))
    
    # Preprocess the image
    image = preprocess(image).unsqueeze(0).to(device)
    text = clip.tokenize([text]).to(device)
    
    # Generate embeddings
    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)
    
    # Normalize the embeddings
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    
    return image_features.cpu().numpy(), text_features.cpu().numpy()


## Example Datum from iDigBio

In [16]:
# Example usage with a record from iDigBio
# Text is a concatenation of the `dwc:higherClassification` and `dwc:vernacularName` fields
image_url = "https://images.collections.yale.edu/iiif/2/ypm:e1ce7f3b-fef8-44a3-bacf-60d65c5a478c/full/!1920,1920/0/default.jpg"
text = "brushfoots; butterflies; butterflies and moths; winged insects; insects; hexapods; arthropods; animals; Animalia; Arthropoda; Hexapoda; Insecta; Pterygota; Endopterygota; Lepidoptera; Ditrysia; Papilionoidea; Nymphalidae"
image_embedding, text_embedding = generate_embeddings(image_url, text)

print("Embeddings: ")
print(image_embedding)
print(text_embedding)


Embeddings: 
[[-5.60033917e-02 -3.17959972e-02 -9.05614067e-03  3.22202370e-02
   2.02837791e-02  2.07540113e-02  1.36874625e-02  3.76000106e-02
  -5.74969407e-03  1.62461940e-02  1.99553184e-02 -4.65960279e-02
   9.76790767e-03  3.49398032e-02 -1.56067451e-02  4.73900000e-03
   6.42741248e-02  2.84817368e-02 -1.78808961e-02 -2.59393342e-02
  -2.88059134e-02  3.61038223e-02  1.43290134e-02  1.78895108e-02
  -2.83237919e-02  1.02997152e-03 -1.60472710e-02  1.21921944e-02
   4.68959510e-02 -7.43976934e-03  2.22862586e-02 -1.07567906e-02
   2.08557677e-02 -1.70202516e-02  5.01147509e-02 -5.04499786e-02
   2.71958914e-02 -1.07433824e-02  2.67875586e-02 -1.86665244e-02
  -7.19632506e-02  2.81828791e-02  5.34232445e-02 -5.67098148e-02
   6.00042334e-03 -1.00860946e-01  2.70879529e-02  2.06459742e-02
  -2.37051733e-02  1.98593698e-02  2.49150097e-02  1.01665277e-02
   3.71353216e-02 -7.74086565e-02 -2.25958433e-02 -5.45770919e-04
  -1.85203832e-02  2.95188855e-02 -1.01998141e-02 -5.89239458e-

### Vectorize and store the embeddings

The following code pulls from the raw data collection in mongo, runs the `generate_embeddings` function and then writes those embeddings to another collection.

In [77]:
username = "root"
password = "example"
host = "mongo"
database_name = "NHFM"
raw_data_collection_name = "idigbio"
vectors_collection_name = "idigbio_embedded"
port = 27017

# Create the MongoDB URI and connect to the client
uri = f"mongodb://{username}:{password}@{host}:{port}/?authSource=admin" #{database_name}?authSource=admin

client = MongoClient(uri)

db = client[database_name]

def _fetch_records(collection, limit): # TODO: Add pagination
    collection = db[collection]
    records = collection.find().limit(limit)
    return records

def _store_embeddings(collection_list, dest_collection):
    to_be_inserted = []
    for record in collection_list:
        text = record['data']['dwc:vernacularName'] + record['data']['dwc:higherClassification']
        image_url = record['media'][0]['data']['ac:accessURI']
        embeddings = generate_embeddings(image_url, text)
        record_to_be_inserted = {
            "text": text,
            "image_url": image_url,
            "image_embedding": embeddings[0], # TODO: Use mongo vector support
            "text_embedding": embeddings[1]
        }
        to_be_inserted.append(record_to_be_inserted)

    print('Storing ' + len(to_be_inserted) + ' records')
    db[dest_collection].insert_many(to_be_inserted)


def fetch_records_and_store_embeddings(raw_data_collection_name, vectors_collection_name):
    limit = 10
    _records = _fetch_records(raw_data_collection_name, limit)
    records = list(_records)
    while (len(records) > 0):
        _store_embeddings(records, vectors_collection_name)
        break # <--- Remove to vectorize entire mongo collection


fetch_records_and_store_embeddings(raw_data_collection_name, vectors_collection_name)


storing records


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



InvalidDocument: cannot encode object: array([[-5.60033917e-02, -3.17959972e-02, -9.05614067e-03,
         3.22202370e-02,  2.02837791e-02,  2.07540113e-02,
         1.36874625e-02,  3.76000106e-02, -5.74969407e-03,
         1.62461940e-02,  1.99553184e-02, -4.65960279e-02,
         9.76790767e-03,  3.49398032e-02, -1.56067451e-02,
         4.73900000e-03,  6.42741248e-02,  2.84817368e-02,
        -1.78808961e-02, -2.59393342e-02, -2.88059134e-02,
         3.61038223e-02,  1.43290134e-02,  1.78895108e-02,
        -2.83237919e-02,  1.02997152e-03, -1.60472710e-02,
         1.21921944e-02,  4.68959510e-02, -7.43976934e-03,
         2.22862586e-02, -1.07567906e-02,  2.08557677e-02,
        -1.70202516e-02,  5.01147509e-02, -5.04499786e-02,
         2.71958914e-02, -1.07433824e-02,  2.67875586e-02,
        -1.86665244e-02, -7.19632506e-02,  2.81828791e-02,
         5.34232445e-02, -5.67098148e-02,  6.00042334e-03,
        -1.00860946e-01,  2.70879529e-02,  2.06459742e-02,
        -2.37051733e-02,  1.98593698e-02,  2.49150097e-02,
         1.01665277e-02,  3.71353216e-02, -7.74086565e-02,
        -2.25958433e-02, -5.45770919e-04, -1.85203832e-02,
         2.95188855e-02, -1.01998141e-02, -5.89239458e-03,
         6.14565276e-02, -1.58698428e-02,  5.11022343e-04,
        -3.04104052e-02, -2.24762801e-02, -1.74711403e-02,
         1.14117637e-02,  2.37134770e-02, -2.46741157e-02,
        -7.00030401e-02, -7.26243034e-02, -1.88709833e-02,
         2.11896189e-02, -7.91705493e-03,  1.12895835e-02,
         5.76799363e-03, -3.24233361e-02,  3.82718956e-03,
        -2.91810110e-02, -6.35491461e-02,  4.65200469e-03,
         1.83926411e-02, -6.64906343e-03, -2.32834630e-02,
        -3.86203416e-02,  2.37815864e-02,  1.43508464e-01,
        -1.96937080e-02,  1.20181972e-02,  6.42218394e-03,
        -1.94265204e-03, -4.74232398e-02, -5.65817833e-01,
         1.24431431e-01,  3.44917700e-02,  5.58979213e-02,
         4.59817909e-02,  3.35946567e-02, -5.86307272e-02,
         2.28082929e-02, -4.30974783e-03,  1.21888341e-02,
        -3.80167365e-02,  1.88672356e-02, -4.58231419e-02,
         5.77740930e-02, -2.98423409e-01,  1.17531167e-02,
        -8.75513011e-04,  5.06974794e-02,  2.07577627e-02,
         2.11318466e-03, -3.60883982e-03,  2.98842695e-02,
         1.09254941e-02, -7.79448356e-03,  1.25820022e-02,
         2.03308426e-02,  3.34798805e-02,  1.19229788e-02,
         5.82094975e-02, -6.61140159e-02,  2.33243196e-03,
        -2.53152940e-02,  9.91874933e-03, -2.57113613e-02,
        -1.94824804e-02, -1.30398257e-03, -2.48501692e-02,
         3.44670489e-02,  1.89521089e-02,  5.91383548e-03,
         5.77955879e-03,  8.43109563e-02,  4.42223884e-02,
        -1.81142073e-02, -3.17152552e-02,  4.22009267e-03,
         6.48072269e-03,  6.17684622e-04, -1.67055074e-02,
         4.20127437e-02, -7.39385486e-02,  3.61328050e-02,
        -6.96321204e-02,  1.23177180e-02,  2.13430617e-02,
        -4.58862893e-02, -6.41634241e-02,  1.31112402e-02,
        -8.24407581e-03, -1.88031904e-02, -5.09461686e-02,
        -4.57509309e-02, -9.52566857e-04, -6.07770793e-02,
        -1.09143220e-02, -2.69590989e-02, -1.71061419e-02,
         3.16203274e-02,  2.75872502e-04,  2.39921529e-02,
         4.40162048e-03, -9.25532542e-03, -6.41867518e-02,
        -8.36642552e-03,  4.80654612e-02,  1.02721434e-02,
        -1.30415754e-03,  1.77741274e-02,  1.07847936e-02,
        -1.42185614e-02, -1.58390980e-02,  9.74611472e-03,
         2.69116694e-03, -3.31059098e-02,  8.95468593e-02,
         2.67715473e-02,  2.48255711e-02,  2.93488312e-03,
         1.52163962e-02,  5.22544608e-02, -2.02776417e-02,
         2.94000991e-02, -2.83050351e-02, -3.21635641e-02,
         2.22308859e-02, -3.94725315e-02,  2.56229360e-02,
        -3.18536162e-02, -8.51062313e-02, -4.20652591e-02,
         6.55831546e-02,  2.16789311e-03,  7.74062146e-03,
        -3.20203453e-02, -1.55517906e-02,  2.65691858e-02,
        -4.07998040e-02, -4.88589145e-02, -2.10415684e-02,
         7.39015685e-03,  7.29323644e-03, -3.17402941e-04,
         3.73541526e-02, -3.06230411e-03,  3.18481959e-02,
        -3.48723633e-03,  2.66711507e-02,  8.36169720e-03,
        -2.64467280e-02,  7.47643858e-02,  6.07619667e-03,
        -1.95554853e-03,  1.88589785e-02, -1.14991292e-02,
         5.26461154e-02,  2.77377712e-03, -8.48379359e-02,
        -3.43121849e-02,  2.87849200e-03,  1.92922559e-02,
        -3.79422121e-02, -1.02564534e-02,  9.23118554e-03,
        -1.12668304e-02, -2.49166018e-03,  1.91124678e-02,
        -8.73936526e-03,  1.06825540e-03, -1.85277723e-02,
         4.43217643e-02, -4.24045697e-02,  2.31336560e-02,
         6.37327926e-03,  1.95515882e-02,  9.62802023e-03,
        -2.44353541e-05, -1.69121858e-03,  8.40809476e-03,
        -5.45327663e-02,  3.86376977e-02,  1.99375711e-02,
        -4.33459990e-02, -3.75482403e-02, -5.90684414e-02,
         2.29380112e-02,  7.30103329e-02,  3.32887433e-02,
         2.34842747e-02, -1.21961543e-02,  5.05770482e-02,
        -1.62064768e-02,  3.66322733e-02,  3.14158499e-02,
         1.23081245e-02,  1.99345890e-02, -2.52354182e-02,
        -8.34192557e-04,  3.32014263e-03,  5.03923856e-02,
         1.63954031e-02,  3.30715068e-02, -5.94822632e-04,
        -1.96662713e-02, -3.33402231e-02, -7.08259176e-03,
         2.62717418e-02,  2.43197065e-02,  9.84487403e-03,
        -3.07581574e-02, -2.72659101e-02, -3.37988064e-02,
         3.55147198e-02,  1.75979473e-02, -2.16313694e-02,
        -1.42059568e-02, -2.88604852e-03,  6.12075143e-02,
        -4.44472022e-02,  2.22716294e-02, -1.17561957e-02,
        -5.52226193e-02, -2.44987546e-04,  3.42447497e-02,
         2.34849732e-02, -8.17517713e-02, -3.69299315e-02,
         6.98191207e-03, -3.34262028e-02,  2.45410036e-02,
        -4.53176945e-02, -3.54625396e-02, -1.82455909e-02,
        -1.88214018e-03, -3.27440463e-02, -2.38651745e-02,
        -3.04186828e-02,  5.86961061e-02, -5.50556136e-03,
         5.47290919e-03,  1.56340469e-02, -1.92381032e-02,
         2.32409798e-02,  3.25273499e-02, -2.56525390e-02,
        -2.58209649e-02, -2.27118712e-02,  5.73695917e-03,
         3.76502573e-02, -2.67358567e-03, -1.57041699e-02,
         8.41820464e-02, -2.29739081e-02,  6.21370855e-04,
        -3.31416130e-02,  4.08989890e-03, -1.39750401e-02,
         1.13967955e-02, -5.10680899e-02,  6.98082894e-02,
         1.66602612e-01, -1.59319546e-02, -2.51904726e-02,
         2.27643810e-02,  5.45150600e-04, -5.38919540e-03,
         4.88043502e-02,  2.30157338e-02, -1.14835780e-02,
         5.79092605e-03, -3.51770446e-02,  6.06243834e-02,
        -1.46046113e-02,  2.08058003e-02,  3.32289445e-03,
         2.19976511e-02,  3.26987952e-02,  1.61487628e-02,
         2.01109480e-02, -1.57801826e-02, -2.50887517e-02,
         1.02960519e-01,  7.19850659e-02, -2.84285210e-02,
        -3.74479918e-03, -5.45713305e-03,  2.12708786e-02,
         1.60202768e-03, -3.59587371e-02,  8.09833705e-02,
        -2.99360491e-02, -1.72620770e-02, -1.49477115e-02,
        -2.26866733e-03, -3.02115474e-02, -6.43387297e-03,
        -8.21333528e-02, -2.16122288e-02, -5.01269149e-03,
         2.68607531e-02, -2.53640786e-02,  3.33894677e-02,
        -6.72375411e-02, -8.35031569e-02, -4.03345795e-03,
        -2.74301711e-02, -5.35319000e-02, -1.08167632e-02,
         6.05563968e-02,  2.25215945e-02, -5.19465841e-03,
         4.79796808e-03, -1.98654039e-03,  1.08817127e-02,
        -2.40191147e-02, -5.44929458e-03, -2.63175531e-03,
        -5.92972748e-02, -1.98239554e-02, -8.18736386e-03,
        -8.71591736e-03,  2.90918816e-02, -3.03282402e-02,
        -5.02401963e-03,  2.07677819e-02,  5.13379928e-03,
        -6.98379008e-03, -2.33464874e-03, -3.64763625e-02,
        -1.83187108e-02,  3.21226344e-02,  3.68096009e-02,
         1.35934819e-02, -6.13378547e-03, -3.02140471e-02,
        -1.41162388e-02,  9.53571405e-03, -9.13609192e-03,
        -1.53934443e-03, -5.14345150e-03, -1.05228703e-02,
         7.03178719e-02,  3.14321555e-03,  3.05737145e-02,
         1.73943210e-02,  4.18819934e-02, -2.18516309e-02,
         4.38922383e-02, -5.63142970e-02, -5.90561368e-02,
        -2.10356223e-03, -1.62569098e-02, -1.94875989e-02,
         1.29927145e-02, -1.91165097e-02,  1.66681241e-02,
         3.35149281e-02,  3.44567932e-03,  4.59314436e-02,
         2.20417939e-02, -4.80150841e-02, -1.29609481e-02,
         4.19958159e-02, -1.22272745e-02, -4.61383313e-02,
         3.95187933e-04, -4.34041396e-03, -1.80219598e-02,
         5.91635071e-02, -1.12723000e-02, -4.86652032e-02,
         4.84788530e-02, -1.89446972e-03, -3.29216793e-02,
        -4.61318195e-02, -1.31362034e-02, -1.11997838e-03,
        -2.13362481e-02, -1.68203954e-02,  1.15708038e-02,
         1.07572330e-02,  9.43224132e-03,  7.45864213e-03,
        -4.00509350e-02, -1.07695144e-02,  1.58151910e-02,
        -4.11399454e-03, -7.35724866e-02,  3.51809412e-02,
        -3.65123935e-02,  2.73047201e-02,  1.24026025e-02,
        -7.15064481e-02,  4.31255735e-02, -6.77111559e-03,
         3.61459777e-02,  5.26893325e-02,  1.56718977e-02,
         7.49972509e-03,  3.24627720e-02, -2.56167147e-02,
         2.43809111e-02, -5.10539375e-02, -1.69238076e-02,
        -1.35079538e-02, -4.79041524e-02, -7.25361193e-03,
        -1.63215250e-02, -1.99931720e-03, -5.14753386e-02,
         9.88780241e-03,  2.88347341e-02,  1.71524119e-02,
        -2.62912717e-02, -3.32252414e-04,  5.78152994e-03,
        -1.66958198e-02, -2.60828733e-02,  1.71529502e-02,
        -8.98505934e-03,  1.86396036e-02, -5.68361115e-03,
         3.98909561e-02,  6.46533491e-03, -1.74405668e-02,
        -1.47121428e-02, -1.97928268e-02, -1.49678700e-02,
        -1.03771491e-02, -2.03449186e-03, -9.40877944e-03,
         6.91902637e-03, -2.77980193e-02,  1.09854611e-02,
         5.16942330e-02,  1.86346397e-02,  2.63752565e-02,
        -2.38680234e-03, -1.07372804e-02,  9.47863888e-03,
         2.53287554e-02, -5.84981125e-03,  4.05998249e-03,
         3.32206190e-02, -4.08234596e-02, -2.44765244e-02,
         3.29700001e-02,  2.14042831e-02,  9.34377313e-02,
        -1.70020142e-03, -6.27755281e-03]], dtype=float32), of type: <class 'numpy.ndarray'>