## Generate Owner's Manual Text Embeddings for Cloud SQL

Megan O'Keefe, 2024

In [None]:
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
import nltk
from google.cloud import aiplatform
import vertexai
from vertexai.language_models import TextEmbeddingModel

# Fetch NLTK's "punkt" resource at notebook start-up.
# NOTE(review): no other visible cell references nltk directly — confirm this
# download is still required (e.g. by a dependency) before keeping it.
nltk.download("punkt")

In [4]:
# Set to your project and location
# PROJECT_ID is a placeholder and must be replaced before running the notebook.
PROJECT_ID = "your-project-id"
REGION = "us-central1"  # change region as needed
# Name of the text embedding model loaded by text_embedding().
MODEL = "textembedding-gecko@003"  # change model as needed

In [None]:
# Initialise the Vertex AI SDK for the configured project. REGION is passed
# explicitly so that editing it in the settings cell actually takes effect
# instead of silently falling back to the SDK's default location.
vertexai.init(project=PROJECT_ID, location=REGION)

In [5]:
# Loaded embedding models, keyed by model name. Keeping them here means the
# model is looked up once per model name instead of once per embedded chunk.
_EMBEDDING_MODELS = {}


def text_embedding(doc) -> list:
    """Return the embedding vector for a single document.

    Args:
        doc: The text to embed; it is sent to the model as a one-item batch.

    Returns:
        The ``values`` of the single embedding returned by the model.

    Raises:
        ValueError: If the model returns no embedding, or more than one.
    """
    model = _EMBEDDING_MODELS.get(MODEL)
    if model is None:
        model = TextEmbeddingModel.from_pretrained(MODEL)
        _EMBEDDING_MODELS[MODEL] = model

    embeddings = model.get_embeddings([doc])
    if not embeddings:
        raise ValueError("No embedding returned.")
    if len(embeddings) > 1:
        raise ValueError("More than one embedding returned.")
    return embeddings[0].values

In [None]:
# Extract the text of every page of the manual PDF and flatten each page to a
# single line.
page_texts = []
with pdfplumber.open("../../manuals/cymbal-starlight-2024.pdf") as pdf:
    for page in pdf.pages:
        # extract_text() has returned None for pages without extractable text
        # in some pdfplumber releases; treat that as an empty page rather than
        # crashing on .replace() below.
        text = page.extract_text() or ""
        # Newlines become spaces. Dropping "- " then rejoins words that were
        # hyphenated across a line break (note: it also removes any other
        # "- " sequence in the page text).
        page_texts.append(text.replace("\n", " ").replace("- ", ""))
# Preview the cleaned text of the third page as a sanity check.
print(page_texts[2])

In [10]:
# Splitter producing pieces of at most 450 characters (length measured with
# len), with a 20-character overlap between neighbouring pieces.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450, chunk_overlap=20, length_function=len, is_separator_regex=False
)

# Every chunk is prefixed with the manual's name. fn already ends with a
# space, so the prefix is followed by two spaces in the resulting chunk text.
fn = "Cymbal Starlight 2024 "
chunks = []
for page_text in page_texts:
    chunks.extend(fn + " " + piece for piece in text_splitter.split_text(page_text))

In [None]:
# Show how many text chunks were produced (one embedding request is made per chunk).
len(chunks)

In [None]:
# Embed every chunk and keep the vectors in a dict keyed by the chunk text.
# Because the text is the key, identical chunks share a single entry.
embeddings_dict = {}
for index, chunk in enumerate(chunks):
    # Print the running index every tenth chunk as a progress indicator.
    if index % 10 == 0:
        print(index)
    embeddings_dict[chunk] = text_embedding(chunk)

In [None]:
# One row per chunk: the chunk text goes in "name", its vector in "embedding".
rows = [(chunk_text, vector) for chunk_text, vector in embeddings_dict.items()]
df = pd.DataFrame(rows, columns=["name", "embedding"])
# Display the first rows as the cell output.
df.head()

In [15]:
# Write the chunk/embedding table to a CSV file in the current working
# directory; index=False leaves the DataFrame index out of the file.
df.to_csv("cymbal-starlight-2024.csv", index=False)