<td>
   <a target="_blank" href="https://labelbox.com"><img src="https://labelbox.com/blog/content/images/2021/02/logo-v4.svg" width="256" alt="Labelbox"/></a>
</td>

<td>
<a href="https://colab.research.google.com/drive/1rF9inkLhxMNejXL8bW-ESX_cVAEct1-6" target="_blank"><img
src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>
</td>

<td>
<a href="https://github.com/Labelbox/labelpandas/blob/main/notebooks/blip-auto-captions.ipynb" target="_blank"><img
src="https://img.shields.io/badge/GitHub-100000?logo=github&logoColor=white" alt="GitHub"></a>
</td>

# _**Auto-Generating Captions for Images and Uploading to Labelbox**_

Install the Hugging Face `transformers` library (for BLIP) and the LabelPandas connector for Labelbox

In [None]:
!pip install transformers -q # Installing Hugging Face BLIP model libraries
!pip install labelpandas --upgrade -q # LabelPandas allows us to easily with DataFrames

In [None]:
!git clone https://github.com/Labelbox/labelpandas -q # For this demo, we will get images from the LabelPandas GitHub repo

Provide a Labelbox API key and source data to generate captions for

In [None]:
import os

# Labelbox API key: taken from the LABELBOX_API_KEY environment variable when it is set, so the key
# does not have to be saved inside the notebook. Otherwise it defaults to "" - paste your key in its place.
api_key = os.environ.get("LABELBOX_API_KEY", "")
# Path to your CSV file
csv_path = "https://raw.githubusercontent.com/Labelbox/labelpandas/main/datasets/blip-images.csv"

Read CSV data to Pandas DataFrame

In [None]:
import pandas as pd

In [None]:
# Load in your CSV (located at csv_path) as a Pandas DataFrame
df = pd.read_csv(csv_path)
# Trailing expression: displays the first rows as the cell output
df.head()

Load local files to Labelbox URLs

In [None]:
import labelpandas as lp

In [None]:
# Create the LabelPandas client from the API key; later cells pass it to
# load_local_files and use it to create the dataset and the data rows
client = lp.Client(api_key)

In [None]:
# Load local files as Labelbox URLs
# NOTE: this rebinds df to the table returned by load_local_files. Later cells read the
# "file_path", "row_data" and "global_key" columns from it - presumably the last two are
# provided by this call or by the CSV (TODO confirm against the LabelPandas docs).
df = lp.load_local_files(
    client=client,
    table=df, 
    file_path_column="file_path",  # name of the df column holding the local file paths
    verbose=True
)

The below code will add model-generated text captions as columns to a DataFrame of image URLs

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration

import requests
from PIL import Image
from tqdm import tqdm
from io import BytesIO

def process_batch(batch, processor, model):
    """ Given a batch of data rows, returns Pandas DataFrame-ready rows as dictionaries with global key, row data URL, and caption metadata values
    Args:
        batch       :   List of dictionaries with "file_path", "row_data" and "global_key" keys
        processor   :   BlipProcessor object
        model       :   BlipForConditionalGeneration object
    Returns:
        List of dictionaries (one per input row, in input order) with keys "global_key", "row_data" and two
        metadata columns for the unconditional and conditional caption. An empty batch returns an empty list.
    """
    # Nothing to caption: return early instead of calling the processor and model with no images
    if not batch:
        return []

    # Open images from local files
    images = []
    for row in batch:
        # The context manager closes the file handle once the pixels are read; convert() returns an
        # independent RGB copy, so grayscale / palette / RGBA files are normalised before encoding
        with Image.open(row["file_path"]) as image_file:
            images.append(image_file.convert("RGB"))

    # Conditional (prime the model with a text prefix)
    conditional_inputs = processor(
        images=images, text=["a picture of"] * len(images),
        add_special_tokens=True, truncation=True, padding=True, return_tensors="pt"
    ) # Encode the images together with the prompt
    conditional_outputs = model.generate(**conditional_inputs) # Run model
    conditional_captions = [
        processor.decode(output, skip_special_tokens=True) for output in conditional_outputs
    ] # Decode model output

    # Unconditional (no text prefix)
    unconditional_inputs = processor(
        images=images, add_special_tokens=True, truncation=True, padding=True, return_tensors="pt"
    ) # Encode the images
    unconditional_outputs = model.generate(**unconditional_inputs) # Run model
    unconditional_captions = [
        processor.decode(output, skip_special_tokens=True) for output in unconditional_outputs
    ] # Decode model output

    # Pair every input row with its two captions; the metadata keys become column names of the result table
    return [
        {
            "global_key": row["global_key"], "row_data": row["row_data"],
            "metadata///string///unconditional_caption": unconditional_caption,
            "metadata///string///conditional_caption": conditional_caption
        }
        for row, unconditional_caption, conditional_caption
        in zip(batch, unconditional_captions, conditional_captions)
    ]

def generate_captions_for_data_rows(dataframe, batch_size=16):
    """ Creates captions from data rows and returns a Pandas DataFrame
    Args:
        dataframe       : Required (pandas.core.frame.DataFrame) - Pandas DataFrame; every row is passed to process_batch as a dictionary
        batch_size      : Optional (int) - Number of images to process at a time, must be at least 1
    Returns:
        Pandas DataFrame built from the rows returned by process_batch (model-generated captions as metadata string values).
        An empty input returns an empty DataFrame without downloading the model.
    Raises:
        ValueError if batch_size is smaller than 1
    """
    # A zero batch size would divide by zero below and a negative one would silently produce no batches
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    table_dict = dataframe.to_dict('records') # Convert DataFrame to list of dictionary rows
    row_count = len(table_dict)

    # No rows means no batches: skip the model download entirely
    if row_count == 0:
        print('No data rows found: nothing to caption')
        return pd.DataFrame([])

    # Download blip model which generates text captions for images
    print('Downloading BLIP model')
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base", return_tensors="pt") # Get your processor object from hugging face
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") # Get your pretrained model object from hugging face

    batch_num = (row_count + batch_size - 1) // batch_size # Ceiling division: a partial last batch still counts
    print(f'Downloading, encoding and creating predictions for data rows in {batch_num} batches of batch size {batch_size}')

    # Loop through your data rows as batches to create a list of dictionaries where { key = column_name : value = row_value }
    data = []
    for start in tqdm(range(0, row_count, batch_size), total=batch_num):
        data.extend(process_batch(table_dict[start:start + batch_size], processor, model))

    captions_df = pd.DataFrame(data)
    print('Success: captions generated')

    return captions_df

In [None]:
# Run the above code, generating captions for images.
# NOTE: this rebinds df to the captions table, which only has the "global_key", "row_data" and the two
# caption metadata columns. Running this cell a second time on its own fails, because process_batch
# reads the "file_path" column - re-run the earlier cells first.
df = generate_captions_for_data_rows(df)

In [None]:
# Preview the first rows of the captions DataFrame as the cell output
df.head()

Upload your Data Row DataFrame to Labelbox, including the model-generated captions as metadata

In [None]:
# Create a Labelbox dataset to upload data rows to
# NOTE(review): create_dataset is called with the same name on every run of this cell -
# check whether re-running leaves duplicate datasets in your Labelbox workspace.
dataset = client.lb_client.create_dataset(name="blip-augment-data-search-demo")

# Create data rows using LabelPandas
results = client.create_data_rows_from_table(
    table = df,  # captions DataFrame returned by generate_captions_for_data_rows
    dataset_id = dataset.uid,  # upload into the dataset created just above
    skip_duplicates = False, 
    verbose = True
)