# Embed with Azure AI
After generating the `.json` files with `textOCR.ipynb`, use this notebook to embed their contents and save the results in `.parquet` format.

## Setup
Import the required packages and load the environment variables from your `.env` file.

In [None]:
import json
import os
import time
import traceback

import openai
import pandas as pd
from dotenv import load_dotenv
from openai import AzureOpenAI

# Read the .env file into the process environment (override=True is passed so
# values from the file are applied on top of the current environment), then
# pick up the Azure endpoint/key pair. Either value is None when the variable
# is not set.
load_dotenv(override=True)
endpoint = os.environ.get("AZURE_DI_ENDPOINT")
key = os.environ.get("AZURE_DI_KEY")

## Helper Functions
The following are helper functions for embedding.

In [None]:
def setup_embedding_client():
    """Build the Azure OpenAI client used for embedding requests.

    The API key, API version and endpoint are read from the
    ``EMBEDDING_OPENAI_API_KEY``, ``EMBEDDING_OPENAI_API_VERSION`` and
    ``EMBEDDING_OPENAI_API_ENDPOINT`` environment variables. A variable that
    is not set is passed through as ``None``.

    Returns:
        The ``AzureOpenAI`` instance constructed from those settings.
    """
    # Constructor keyword -> environment variable holding its value.
    env_by_keyword = {
        "api_key": "EMBEDDING_OPENAI_API_KEY",
        "api_version": "EMBEDDING_OPENAI_API_VERSION",
        "azure_endpoint": "EMBEDDING_OPENAI_API_ENDPOINT",
    }
    settings = {keyword: os.getenv(variable) for keyword, variable in env_by_keyword.items()}
    return AzureOpenAI(**settings)

In [None]:
def embed_text(embedding_client, paragraph_results):
    """Attach an embedding vector to every paragraph entry.

    Each entry's ``'content'`` is sent to the embeddings endpoint on its own
    and the returned vector is stored on the same entry under
    ``'contentVector'``. The deployment name comes from the
    ``EMBEDDING_DEPLOYMENT_NAME`` environment variable. Progress is printed
    after each entry.

    Args:
        embedding_client: Client exposing ``embeddings.create(input=..., model=...)``.
        paragraph_results: Sequence of dict-like entries with a ``'content'`` key.
            The entries are modified in place.

    Returns:
        The same ``paragraph_results`` object that was passed in.
    """
    deployment = os.getenv("EMBEDDING_DEPLOYMENT_NAME")
    total = len(paragraph_results)
    for position, entry in enumerate(paragraph_results, start=1):
        response = embedding_client.embeddings.create(input=[entry['content']], model=deployment)
        entry['contentVector'] = response.data[0].embedding
        print(f"Got {position} out of {total} embeddings")
    return paragraph_results

In [None]:
def embed_image(embedding_client, data, max_attempts=2, retry_delay=1):
    """Attach a caption embedding to every image entry, retrying on API errors.

    Each entry's ``'caption'`` is sent to the embeddings endpoint on its own
    and the returned vector is stored on the same entry under
    ``'captionVector'``. The deployment name comes from the
    ``EMBEDDING_DEPLOYMENT_NAME`` environment variable.

    This is best-effort: when a request raises ``openai.BadRequestError`` or
    ``openai.InternalServerError`` it is retried, and once the attempts are
    used up the traceback is printed and the entry is left WITHOUT a
    ``'captionVector'`` key. Callers must tolerate that key being absent.
    Any other exception propagates.

    Relies on the module-level ``openai`` and ``traceback`` imports.

    Args:
        embedding_client: Client exposing ``embeddings.create(input=..., model=...)``.
        data: Sequence of dicts with a ``'caption'`` key and, for log messages,
            an ``'image'`` key. The entries are modified in place.
        max_attempts: Total number of requests made per entry before giving up.
            A value below 1 means no request is made at all.
        retry_delay: Seconds to sleep between attempts for the same entry.

    Returns:
        The same ``data`` object that was passed in.
    """
    embedding_model = os.getenv("EMBEDDING_DEPLOYMENT_NAME")
    total = len(data)
    for count, item in enumerate(data):
        for attempt in range(1, max_attempts + 1):
            try:
                response = embedding_client.embeddings.create(input=[item['caption']], model=embedding_model)
                item['captionVector'] = response.data[0].embedding
                print(f"Got {count+1} out of {total} embeddings")
                break
            except (openai.BadRequestError, openai.InternalServerError):
                # Fall back to the position so a missing 'image' key cannot
                # raise from inside the error handler.
                label = item.get('image', f"item {count}")
                if attempt == max_attempts:
                    print(f"Error embedding {label} after retry\n")
                    traceback.print_exc()
                else:
                    print(f"Retrying to embed {label}\n")
                    time.sleep(retry_delay)  # Brief pause before the next attempt
    return data

## Embed and save to `.parquet` files
Now we can embed the text and images separately and save the results to separate parquet files.

In [None]:
# Change this to your own file name
# Change these to your own file names
text_json_file_name = "textOCR.json"
image_json_file_name = "imagecaption.json"

# Read both inputs explicitly as UTF-8: without `encoding`, open() falls back
# to the platform default, which can fail or mis-decode non-ASCII OCR text
# (notably on Windows).
with open(text_json_file_name, 'r', encoding='utf-8') as text_file:
    document_text = json.load(text_file)
with open(image_json_file_name, 'r', encoding='utf-8') as image_file:
    document_image = json.load(image_file)

# One client is shared by the text and image embedding steps below.
embedding_client = setup_embedding_client()

- Embed the text, remove any entries with missing or empty content or vectors, and save the result to "xxx_text.parquet"

In [None]:
start_time = time.time()

# embed_text stores a 'contentVector' on every entry and returns the same list.
paragraph_results = embed_text(embedding_client, document_text)
df = pd.DataFrame(paragraph_results)

# An empty input list produces a frame with no columns at all; create the
# expected ones so the filters below cannot raise KeyError.
for required_column in ('content', 'contentVector'):
    if required_column not in df.columns:
        df[required_column] = None

# Remove rows with empty or missing 'content'
df = df.dropna(subset=['content'])
df = df[df['content'] != '']

# Remove rows with empty or missing 'contentVector'. The NaN drop must come
# first because len() would fail on a missing vector. Comparing the mapped
# lengths keeps the mask boolean even when no rows are left.
df = df.dropna(subset=['contentVector'])
df = df[df['contentVector'].map(len) > 0]

# Reset the index so it is contiguous after filtering
df = df.reset_index(drop=True)

if df.empty:
    print("Warning: no text rows with both content and an embedding; the parquet file will be empty.")

print(df)
text_parquet_file_name = "demofile_text.parquet"
df.to_parquet(text_parquet_file_name, engine="pyarrow")
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")

- Embed images and save to "xxx_image.parquet"

In [None]:
start_time = time.time()

# embed_image is best-effort: an entry whose request keeps failing is returned
# without a 'captionVector' key.
image_results = embed_image(embedding_client, document_image)
df = pd.DataFrame(image_results)

# If every entry failed (or the input was empty) the frame has no
# 'captionVector' column at all; create the expected columns so the filters
# below cannot raise KeyError after the embedding work is already done.
for required_column in ('caption', 'captionVector'):
    if required_column not in df.columns:
        df[required_column] = None

# Remove rows with empty or missing 'caption'
df = df.dropna(subset=['caption'])
df = df[df['caption'] != '']

# Remove rows with empty or missing 'captionVector'. The NaN drop must come
# first because len() would fail on a missing vector. Comparing the mapped
# lengths keeps the mask boolean even when no rows are left.
df = df.dropna(subset=['captionVector'])
df = df[df['captionVector'].map(len) > 0]

# Reset the index so it is contiguous after filtering
df = df.reset_index(drop=True)

if df.empty:
    print("Warning: no image rows with both a caption and an embedding; the parquet file will be empty.")

print(df)
image_parquet_file_name = "demofile_image.parquet"
df.to_parquet(image_parquet_file_name, engine='pyarrow')
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")

After generating the parquet files, open `rag.ipynb` to run RAG on your own local dataset.