# Generate Owner's Manual Text Embeddings for Cloud SQL

In [1]:
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
import nltk

nltk.download("punkt")

[nltk_data] Downloading package punkt to /Users/mokeefe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
print("hello world")

hello world


In [3]:
from google.cloud import aiplatform
import vertexai
from vertexai.language_models import TextEmbeddingModel

In [4]:
# set google cloud project ID. replace with your project ID!
PROJECT_ID = "PROJECT-ID"

In [5]:
def text_embedding(doc) -> list:
    model = TextEmbeddingModel.from_pretrained("textembedding-gecko@003")
    embeddings = model.get_embeddings([doc])
    if len(embeddings) > 1:
        raise ValueError("More than one embedding returned.")
    if len(embeddings) == 0:
        raise ValueError("No embedding returned.")
    return embeddings[0].values

In [6]:
# verify we can reach the Vertex AI API
vertexai.init(project=PROJECT_ID)
print(text_embedding("hello world!"))

[0.04643995687365532, -0.040134966373443604, -0.026658955961465836, -0.030130811035633087, 0.015209157951176167, -0.002992452820762992, 0.025391723960638046, -0.013013788498938084, 0.014430120587348938, 0.005930031184107065, 0.03672889620065689, 0.05480571836233139, -0.018730252981185913, -0.07940302044153214, 0.006020653061568737, -0.020779868587851524, 0.009315159171819687, -0.007861842401325703, 0.006406503729522228, -0.010877937078475952, -0.007521060295403004, 0.00758568849414587, -0.03905414789915085, -0.02675125002861023, 0.017720919102430344, 0.020115388557314873, -0.0010165664134547114, -0.06540928035974503, 0.00994604080915451, 0.06652805209159851, -0.03825473412871361, 0.014684486202895641, -0.05307602509856224, 0.015796786174178123, 0.038084737956523895, -0.04685615003108978, 0.03181130439043045, 0.022280927747488022, -0.005270056892186403, -0.004752118140459061, 0.018728133291006088, -0.09290434420108795, -0.03316306322813034, -0.03694036230444908, 0.057201724499464035, -0

In [8]:
# OCR and chunk PDF
page_texts = []
with pdfplumber.open("../../manuals/cymbal-starlight-2024.pdf") as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        page_texts.append(text)
for i in range(0, len(page_texts)):
    # remove newlines and empty lines
    p = page_texts[i]
    p = p.replace("\n", " ")
    p = p.replace("- ", "")
    page_texts[i] = p
print(page_texts[2])

manual.md 2024-03-23 should be used until your child is at least 4 feet 9 inches tall and weighs at least 80 pounds. Seat belt: All children over the age of 8 should wear a seat belt. Seat belts should be adjusted to fit snugly around your child's hips and across their chest. Installing Child Restraints Follow the manufacturer's instructions carefully when installing a child restraint. Use the LATCH (Lower Anchors and Tethers for Children) system or the vehicle's seat belt to secure the child restraint. Make sure that the child restraint is installed tightly and securely. Have the child restraint inspected by a qualified technician to ensure that it is installed correctly. Other Child Safety Tips Never leave a child unattended in the vehicle. Lock all doors and windows when leaving the vehicle parked. Keep valuables out of sight to avoid tempting thieves. Teach your children about car safety, including the importance of wearing seat belts and never playing in or around the vehicle. Set

In [10]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
chunks = []
fn = "Cymbal Starlight 2024 "
for p in page_texts:
    spl = text_splitter.split_text(p)
    for s in spl:
        s = fn + " " + s
        chunks.append(s)

In [11]:
len(chunks)

118

In [12]:
# Generate embeddings for each text chunk, store in dictionary
d = {}
for i in range(0, len(chunks)):
    c = chunks[i]
    # log progress
    if i % 10 == 0:
        print(i)
    e = text_embedding(c)
    d[c] = e

0
10
20
30
40
50
60
70
80
90
100
110


In [13]:
import pandas as pd

df = pd.DataFrame(list(d.items()), columns=["name", "embedding"])
df.head()

Unnamed: 0,name,embedding
0,Cymbal Starlight 2024 manual.md 2024-03-23 Cy...,"[0.018377870321273804, -0.042374949902296066, ..."
1,Cymbal Starlight 2024 snugly around your hips...,"[0.01160965021699667, -0.023575348779559135, -..."
2,Cymbal Starlight 2024 age of 12 should never ...,"[0.020014842972159386, -0.04700048640370369, -..."
3,Cymbal Starlight 2024 (VSC) VSC is an electro...,"[0.012970546260476112, -0.06115831062197685, -..."
4,Cymbal Starlight 2024 manual.md 2024-03-23 VS...,"[0.046555180102586746, -0.030612556263804436, ..."


In [14]:
# reorder columns to id, name, embedding
df = df[["name", "embedding"]]
df.head()

Unnamed: 0,name,embedding
0,Cymbal Starlight 2024 manual.md 2024-03-23 Cy...,"[0.018377870321273804, -0.042374949902296066, ..."
1,Cymbal Starlight 2024 snugly around your hips...,"[0.01160965021699667, -0.023575348779559135, -..."
2,Cymbal Starlight 2024 age of 12 should never ...,"[0.020014842972159386, -0.04700048640370369, -..."
3,Cymbal Starlight 2024 (VSC) VSC is an electro...,"[0.012970546260476112, -0.06115831062197685, -..."
4,Cymbal Starlight 2024 manual.md 2024-03-23 VS...,"[0.046555180102586746, -0.030612556263804436, ..."


In [15]:
df.to_csv("cymbal-starlight-2024.csv", index=False)