# Initialising Embeddings

In [1]:
import os
from supabase import create_client, Client
from dotenv import load_dotenv 
# Pull the development secrets into the process environment, then read the
# Supabase settings used by this notebook. A variable that is not set reads
# as None.
load_dotenv("../secrets/.env.dev")

url, key, password, ref = (
    os.getenv(name)
    for name in ("SUPABASE_URL", "SUPABASE_KEY", "SUPABASE_PASSWORD", "SUPABASE_REF")
)

# REST client used for the table reads/updates below; `password` and `ref`
# are consumed later when the Postgres connection string is built.
supabase: Client = create_client(url, key)

In [2]:
from google import genai
from google.genai import types
from dotenv import load_dotenv
import os

# Make sure the development secrets are loaded, then build the Gemini client
# from the GEMINI_API_KEY entry (None when the variable is not set).
load_dotenv("../secrets/.env.dev")
gemini_key = os.getenv("GEMINI_API_KEY")

client = genai.Client(api_key=gemini_key)

In [None]:

# Embed the search query. Queries use task_type RETRIEVAL_QUERY (documents use
# RETRIEVAL_DOCUMENT). The output size is pinned to 768 so the query vector
# has the same dimensionality as the document vectors and the 768-dimensional
# collection created further down; without it the sizes would not match.
query_embedding = client.models.embed_content(
    model="gemini-embedding-001",
    contents="Explain how AI works in a few words",
    config=types.EmbedContentConfig(
        task_type="RETRIEVAL_QUERY",
        output_dimensionality=768,
    ),
)

In [None]:
# Show the vector length and a short preview instead of dumping every component.
query_vector = query_embedding.embeddings[0].values
print(len(query_vector), query_vector[:5])

In [None]:
# Read all columns of every row in the Article_Entry table.
# NOTE(review): the API may cap the number of rows returned per request —
# confirm the whole table comes back in a single call.
article_entry_query = supabase.table("Article_Entry").select("*")
response = article_entry_query.execute()

In [None]:
# Preview the first few rows only; displaying the full table floods the output.
response.data[:3]

In [None]:
# Collect the "contents" text of every fetched row, keeping row order.
content_array = [row["contents"] for row in response.data]

# Echo the collected texts as a quick sanity check.
print(content_array)

In [None]:
# Number of texts sent to the embedding API per request.
chunk_size = 50

# Split the texts into consecutive batches of at most `chunk_size` items.
content_chunks = []
for start in range(0, len(content_array), chunk_size):
    content_chunks.append(content_array[start:start + chunk_size])

In [None]:
# Number of batches from index 4 onwards (i.e. how many are left after the first four).
len(content_chunks[4:])

In [None]:
import time

# Embed every batch of article texts as RETRIEVAL_DOCUMENT vectors (768 dims).
# The list is (re)initialised here and all batches are processed in order so
# that the cell works on a fresh kernel and embedding i stays aligned with
# response.data[i], which the record-building cell relies on.
document_embeddings = []
last_chunk_number = len(content_chunks) - 1
for chunk_number, chunk in enumerate(content_chunks):
    print(len(chunk))
    document_embedding = client.models.embed_content(
        model="gemini-embedding-001",
        contents=chunk,
        config=types.EmbedContentConfig(
            task_type="RETRIEVAL_DOCUMENT",  # RETRIEVAL_QUERY for query, RETRIEVAL_DOCUMENT for document
            output_dimensionality=768,
        ),
    )
    document_embeddings.extend(document_embedding.embeddings)
    # Pause between requests (presumably to respect the API rate limit);
    # there is nothing to wait for after the final batch.
    if chunk_number < last_chunk_number:
        time.sleep(70)

In [None]:
# Total number of embeddings collected so far.
len(document_embeddings)

In [None]:
# Inspect the raw values of the first document embedding.
document_embeddings[0].values

In [None]:
# Build (id, vector, metadata) tuples for the vector store.
# Embedding i is paired with response.data[i], so both sequences must have the
# same length; a mismatch would silently attach vectors to the wrong rows.
if len(document_embeddings) != len(response.data):
    raise ValueError(
        f"{len(document_embeddings)} embeddings for {len(response.data)} rows; "
        "re-run the embedding cell over all chunks before building records"
    )

metadata_keys = ("art_num", "type", "belongs_to", "ent_id", "contents", "word")
records = []
for i, emb in enumerate(document_embeddings):
    row = response.data[i]
    # Record ids are 1-based positions in the fetched row list.
    index = i + 1
    records.append((index, emb.values, {field: row[field] for field in metadata_keys}))

In [None]:
# Dimensionality of the last document embedding. Read from the list itself
# rather than from a loop variable left behind by another cell.
print(len(document_embeddings[-1].values))

In [None]:
import vecs
from urllib.parse import quote

# Credentials come from the environment only. Never paste a literal connection
# string into the notebook: one that has been committed must be treated as
# leaked and the password rotated.
# The password is percent-encoded because characters such as '#', '?' or '@'
# would otherwise be parsed as URL delimiters.
# NOTE(review): assumes SUPABASE_PASSWORD is stored raw (not already
# percent-encoded) in the .env file — confirm.
DB_CONNECTION = (
    f"postgresql://postgres.{ref}:{quote(password, safe='')}"
    "@aws-1-ap-southeast-1.pooler.supabase.com:6543/postgres"
)
# create vector store client
vx = vecs.create_client(DB_CONNECTION)

In [None]:
# Open (or create on first use) the vecs collection named "Article_Entry" with
# 768-dimensional vectors, matching the output_dimensionality used when the
# documents were embedded.
# NOTE(review): vecs appears to manage collections in its own schema rather
# than `public` — confirm this is not expected to be the Supabase table of the
# same name.
docs = vx.get_or_create_collection(name="Article_Entry", dimension=768)

In [None]:
# Write (insert or overwrite by id) all prepared records into the collection.
docs.upsert(records=records)

# Initialise Embeddings for chunks

In [4]:
# Read all columns of every row in the case_chunks table.
case_chunks_query = supabase.table("case_chunks").select("*")
response = case_chunks_query.execute()

In [11]:
# Collect the "text" field of every case_chunks row, keeping row order.
content_array = [row["text"] for row in response.data]

# Echo the collected chunk texts as a quick sanity check.
print(content_array)

['Snapchat settled FTC charges that it deceived consumers by promising that messages would “disappear forever” when in fact recipients could save messages using third‑party applications or by taking screenshots without notification. The 2014 FTC press release explains that Snapchat also collected user location data and stored video content unencrypted on its servers. It misrepresented its data collection practices and failed to secure its Find Friends feature, leading to the theft of 4.6 million user names and phone numbers. Under the settlement, Snapchat agreed not to misrepresent how it maintains the privacy, security or confidentiality of users’ information, implement a comprehensive privacy program, and obtain biennial independent assessments for 20 yearshttps://www.ftc.gov/news-events/news/press-releases/2014/05/snapchat-settles-ftc-charges-promises-disappearing-messages-were-false#:~:text=Snapchat%20Settles%20FTC%20Charges%20That,of%20Disappearing%20Messages%20Were%20False.', 'Fa

In [12]:
# Number of texts sent to the embedding API per request.
chunk_size = 50

# Split the chunk texts into consecutive batches of at most `chunk_size` items.
content_chunks = []
for start in range(0, len(content_array), chunk_size):
    content_chunks.append(content_array[start:start + chunk_size])

In [15]:
# Number of batches that will be sent to the embedding API.
print(len(content_chunks))

2


In [18]:
# Inspect the second batch of texts.
content_chunks[1]

['In November\xa02024, the European Commission fined Meta €797.72\xa0million for tying Facebook Marketplace to the social network and imposing unfair trading conditions on competing classified advertising services. According to the Commission’s press release, Meta embedded the Marketplace as a feature of Facebook, automatically giving it access to the social network’s user base and data. This practice constituted an abusive tying and self‑preferencing under Article\xa0102 of the Treaty on the Functioning of the European Union (TFEU). Meta also imposed unfair trading conditions on competitors that use Facebook for business, which hindered competition. The Commission ordered Meta to stop the conduct and align its practices with EU antitrust rules.',
 'A June\xa02023 Electronic Frontier Foundation article summarised the FTC’s settlement with Ring, a home‑security camera company owned by Amazon. The FTC alleged that Ring gave employees and contractors unrestricted access to customers’ priv

In [19]:
import time

# Embed every batch of case-chunk texts as RETRIEVAL_DOCUMENT vectors (768 dims).
# Results are appended in batch order, so embedding i corresponds to
# content_array[i] (and therefore to response.data[i]).
document_embeddings = []
last_chunk_number = len(content_chunks) - 1
for chunk_number, chunk in enumerate(content_chunks):
    print(len(chunk))
    document_embedding = client.models.embed_content(
        model="gemini-embedding-001",
        contents=chunk,
        config=types.EmbedContentConfig(
            task_type="RETRIEVAL_DOCUMENT",  # RETRIEVAL_QUERY for query, RETRIEVAL_DOCUMENT for document
            output_dimensionality=768,
        ),
    )
    document_embeddings.extend(document_embedding.embeddings)
    # Pause between requests (presumably to respect the API rate limit);
    # there is nothing to wait for after the final batch.
    if chunk_number < last_chunk_number:
        time.sleep(70)

50
2


In [None]:
# Build (id, vector, metadata) tuples for the case_chunks embeddings.
# The metadata keys are the case_chunks columns visible in `response.data`
# (id, law, company, link, doc_id, chunk_id, text). The Article_Entry keys
# used previously (art_num, type, ...) do not exist on these rows and raised
# KeyError.
# NOTE(review): confirm this is the full set of columns wanted as metadata.
metadata_keys = ("id", "law", "company", "link", "doc_id", "chunk_id", "text")
records = []
for i, emb in enumerate(document_embeddings):
    row = response.data[i]
    # Record ids are 1-based positions in the fetched row list.
    index = i + 1
    records.append((index, emb.values, {field: row[field] for field in metadata_keys}))

In [25]:
# Display the fetched case_chunks rows (the full list; can be large).
response.data

[{'id': '0f2179e9-72d8-4bf2-9650-3adbd4dbe089',
  'law': 'FTC Act – deceptive practices and privacy misrepresentations',
  'company': 'Snapchat',
  'link': 'https://www.ftc.gov/news-events/press-releases/2014/05/snapchat-settles-ftc-charges-promised-dissapearing-messages-deceived-users',
  'doc_id': '5',
  'chunk_id': 0,
  'text': 'Snapchat settled FTC charges that it deceived consumers by promising that messages would “disappear forever” when in fact recipients could save messages using third‑party applications or by taking screenshots without notification. The 2014 FTC press release explains that Snapchat also collected user location data and stored video content unencrypted on its servers. It misrepresented its data collection practices and failed to secure its Find Friends feature, leading to the theft of 4.6 million user names and phone numbers. Under the settlement, Snapchat agreed not to misrepresent how it maintains the privacy, security or confidentiality of users’ information

In [27]:
# Store each embedding on its own case_chunks row, matched on the row's "id".
# The per-row result goes into a separate name so that `response` (the fetched
# rows this loop iterates and other cells read) is not overwritten, which
# keeps the cell safe to re-run.
for embedding, item in zip(document_embeddings, response.data):
    update_response = (
        supabase.table("case_chunks")
        .update({"embedding": embedding.values})
        .eq("id", item["id"])
        .execute()
    )

# Old Draft Code

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
# Load the tokenizer and the encoder weights from the same Legal-BERT checkpoint.
legal_bert_checkpoint = "nlpaueb/legal-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(legal_bert_checkpoint)
model = AutoModel.from_pretrained(legal_bert_checkpoint)


In [None]:
# Tokenize a sample query into PyTorch tensors (truncation and padding enabled).
encoded_input = tokenizer("Hello", truncation=True, padding=True, return_tensors='pt')

# Run the model without gradient tracking; the raw model output is kept in
# `query_output` and indexed as query_output[1][0] by the query cell below.
with torch.no_grad():
    query_output = model(**encoded_input)

'art_num': '13-63-101(2)',
  'type': 'Definition',
  'belongs_to': 'S.B. 152 (2023)',
  'ent_id': 2,
  'contents': '"Director" – the director of the Division of Consumer Protection.',
  'word': 'Director',
  'embedding'

In [None]:
# Look at the "art_num" field of the first fetched row.
# NOTE(review): only valid while `response` holds Article_Entry rows — other
# cells reassign `response` to different tables.
response.data[0]["art_num"]

In [None]:
# Old draft: build (id, vector, metadata) tuples from `output[1]`.
# NOTE(review): `output` is not defined anywhere in the visible notebook —
# presumably the model output for the document texts; this cell raises
# NameError on a fresh kernel until that is restored.
records=[]
for i, emb in enumerate(output[1]):
    record = ()
    # Record ids are 1-based positions in the fetched row list.
    index = i+1
    
    # Convert the per-row vector to a plain Python list via .tolist()
    embedding_list = emb.tolist()
    # Metadata is read from response.data[i], so `response` must hold the
    # Article_Entry rows in the same order as the vectors.
    record = (index, embedding_list, {"art_num": response.data[i]["art_num"],
                                      "type": response.data[i]["type"],
                                      "belongs_to": response.data[i]["belongs_to"],
                                      "ent_id": response.data[i]["ent_id"],
                                      "contents": response.data[i]["contents"],
                                      "word": response.data[i]["word"],})

    records.append(record)

In [None]:
import vecs
from urllib.parse import quote

# Credentials come from the environment only. Never paste a literal connection
# string into the notebook: one that has been committed must be treated as
# leaked and the password rotated.
# The password is percent-encoded because characters such as '#', '?' or '@'
# would otherwise be parsed as URL delimiters.
# NOTE(review): assumes SUPABASE_PASSWORD is stored raw (not already
# percent-encoded) in the .env file — confirm.
DB_CONNECTION = (
    f"postgresql://postgres.{ref}:{quote(password, safe='')}"
    "@aws-1-ap-southeast-1.pooler.supabase.com:6543/postgres"
)
# create vector store client
vx = vecs.create_client(DB_CONNECTION)

In [None]:
# Open (or create on first use) the vecs collection named "Article_Entry" with
# 768-dimensional vectors.
# NOTE(review): vecs appears to manage collections in its own schema rather
# than `public` — confirm this is not expected to be the Supabase table of the
# same name.
docs = vx.get_or_create_collection(name="Article_Entry", dimension=768)

## Do not run the two cells below :)

In [None]:
# Write all prepared records into the collection (modifies the stored data).
docs.upsert(
    records=records
)

In [None]:
# Build an index on the collection using the library defaults.
docs.create_index()

In [None]:
# Query the collection with the sample query vector and keep the top 3 matches.
# query_output[1][0] is the first row of the model's second output.
# NOTE(review): presumably the pooled sentence vector — confirm.
index = docs.query(
    data=query_output[1][0].tolist(),              # required
    limit=3,                         # number of records to return
    # filters={"year": {"$eq": 2012}}, # metadata filters
)

In [None]:
# Fetch the stored record whose id is '1'.
docs['1']

# Add span texts to document

In [4]:
# Read all columns of every row in the Document table.
document_query = supabase.table("Document").select("*")
response = document_query.execute()

In [12]:
# For every document, wrap each line of its content in numbered
# <spanN>...</spanN> tags and store the result in the "content_span" column of
# the same row (matched on doc_id).
# The update result goes into its own name so that `response` (the fetched
# documents this loop iterates and later cells read) is not overwritten.
for doc in response.data:
    content_lines = doc["content"].strip().split("\n")
    content_span = "".join(
        f"<span{i}>{line}</span{i}>" for i, line in enumerate(content_lines)
    )
    update_response = (
        supabase.table("Document")
        .update({"content_span": content_span})
        .eq("doc_id", doc["doc_id"])
        .execute()
    )

In [None]:
# Earlier variant of the span-tagging cell above; it performs the same update.
# Each row is matched on its own doc_id instead of its position in the list
# (index + 1), which only worked when doc_ids were exactly 1..N in fetch order.
# The update result is kept out of `response` so the fetched rows stay intact.
for dict_content in response.data:
    split_content = dict_content["content"].strip().split("\n")
    clean_split_content = "".join(
        f"<span{i}>{line}</span{i}>" for i, line in enumerate(split_content)
    )
    update_response = (
        supabase.table("Document")
        .update({"content_span": clean_split_content})
        .eq("doc_id", dict_content["doc_id"])
        .execute()
    )

In [None]:
# Inspect the raw "content" of the first row in `response`.
# NOTE(review): expects `response` to still hold the Document rows — confirm
# no earlier cell has reassigned it.
response.data[0]["content"]