In [129]:
import os
from supabase import create_client, Client
from dotenv import load_dotenv 
# Load the development credentials from the local (untracked) env file.
load_dotenv("../secrets/.env.dev")

url = os.environ.get("SUPABASE_URL")
key = os.environ.get("SUPABASE_KEY")
password = os.environ.get("SUPABASE_PASSWORD")
ref = os.environ.get("SUPABASE_REF")

# Fail fast with the names of any missing settings. Without this check a
# missing variable is silently None and only surfaces later as an obscure
# client error or as a connection string containing the text "None".
_missing_env = [
    env_name
    for env_name, env_value in (
        ("SUPABASE_URL", url),
        ("SUPABASE_KEY", key),
        ("SUPABASE_PASSWORD", password),
        ("SUPABASE_REF", ref),
    )
    if not env_value
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env)
        + " (expected in ../secrets/.env.dev or the process environment)"
    )

# Supabase REST client used for the table reads below.
supabase: Client = create_client(url, key)

In [79]:
# Fetch every column of every row in the "Article_Entry" table.
article_query = supabase.table("Article_Entry").select("*")
response = article_query.execute()

In [96]:
# Preview only the first record: each row carries a 768-value embedding string,
# so displaying the whole table bloats the saved notebook output.
response.data[:1]

[{'art_num': '13-63-101(2)',
  'type': 'Definition',
  'belongs_to': 'S.B. 152 (2023)',
  'ent_id': 2,
  'contents': '"Director" – the director of the Division of Consumer Protection.',
  'word': 'Director',
  'embedding': '[-0.7561798,0.2259241,1,0.009751376,0.1181319,0.18376109,-0.98773104,-0.41002706,0.99999994,0.9998763,0.99997824,-0.025147261,0.012507055,-0.9999997,0.17363448,0.36658904,0.48006225,0.32093075,0.06852302,0.9597816,0.3120043,-0.99348855,0.9999988,0.45591366,0.07001267,-0.9995557,-0.18305331,-0.99722403,-0.052726958,-0.911632,-0.2353898,-0.14394692,0.9999997,0.41381943,1,-0.26383173,-0.029413708,0.30407923,0.35471505,0.14395203,-0.04176424,-0.51676023,-0.9999855,-0.99999964,-0.050014593,0.1328071,0.99999994,-0.2580873,0.22280258,-0.99949497,-0.99999976,-1,-0.19648968,-0.17136113,-0.24099737,0.36081004,-0.37977937,-0.054962095,0.16303673,0.3249905,0.29386827,0.028442234,0.3698646,-0.5563766,-0.9999787,-1,0.40763244,0.2243373,0.44187853,-0.33842626,-0.41653803,-0.363264

In [80]:
# Collect the "contents" text of each fetched record, in row order.
content_array = [row["contents"] for row in response.data]

# Show the collected texts.
print(content_array)



In [115]:
from transformers import AutoTokenizer, AutoModel
import torch
# Checkpoint shared by the tokenizer and the encoder so the two cannot drift apart.
MODEL_NAME = "nlpaueb/legal-bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)


In [117]:
# Tokenize the search text into PyTorch tensors.
query_text = "Hello"
encoded_input = tokenizer(
    query_text,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

# Forward pass without gradient tracking; only the outputs are needed.
with torch.no_grad():
    query_output = model(**encoded_input)

'art_num': '13-63-101(2)',
  'type': 'Definition',
  'belongs_to': 'S.B. 152 (2023)',
  'ent_id': 2,
  'contents': '"Director" – the director of the Division of Consumer Protection.',
  'word': 'Director',
  'embedding'

In [98]:
# Spot-check the article number of the first fetched record.
first_row = response.data[0]
first_row["art_num"]

'13-63-101(2)'

In [100]:
# Embed every article text. `output` was not defined by any cell in this
# notebook (it only existed as leftover kernel state), so a fresh
# Restart & Run All failed here with a NameError.
# NOTE(review): assumes the document embeddings are meant to be produced the
# same way as the query embedding (same tokenizer arguments, element 1 of the
# model output) -- confirm. For a large table, embed in batches instead.
encoded_docs = tokenizer(content_array, truncation=True, padding=True, return_tensors="pt")
with torch.no_grad():
    output = model(**encoded_docs)

doc_embeddings = output[1]
if len(doc_embeddings) != len(response.data):
    raise ValueError(
        f"Got {len(doc_embeddings)} embeddings for {len(response.data)} records; "
        "re-run the fetch and content_array cells so they are in sync."
    )

# Columns copied from each table row into the vector record's metadata.
METADATA_KEYS = ("art_num", "type", "belongs_to", "ent_id", "contents", "word")

# One (id, vector, metadata) tuple per row. Ids are the 1-based row positions,
# passed as strings because vecs record ids are strings.
records = [
    (
        str(position),
        embedding.tolist(),  # torch tensor -> plain list of floats
        {field: row[field] for field in METADATA_KEYS},
    )
    for position, (row, embedding) in enumerate(zip(response.data, doc_embeddings), start=1)
]

In [130]:
from urllib.parse import quote

import vecs

# SECURITY: a hard-coded connection string containing a plaintext database
# password used to live in a comment here. It has been removed; that password
# must be considered leaked and should be rotated.

if not ref or not password:
    raise RuntimeError(
        "SUPABASE_REF and SUPABASE_PASSWORD must be set before connecting to the database"
    )

# Percent-encode the password: characters such as '#', '?', '@' or '/' would
# otherwise be parsed as URL delimiters and corrupt the connection string.
# NOTE(review): this expects the raw (un-encoded) password in the env file.
encoded_password = quote(password, safe="")
DB_CONNECTION = (
    f"postgresql://postgres.{ref}:{encoded_password}"
    "@aws-1-ap-southeast-1.pooler.supabase.com:6543/postgres"
)
# create vector store client
vx = vecs.create_client(DB_CONNECTION)

In [131]:
# Open (or create on first run) the vecs collection holding the embeddings.
# vecs keeps collections in its own `vecs` schema, so this is a separate table
# from the "Article_Entry" table read through the Supabase client.
COLLECTION_NAME = "Article_Entry"
EMBEDDING_DIM = 768  # must equal the length of the vectors being upserted
docs = vx.get_or_create_collection(name=COLLECTION_NAME, dimension=EMBEDDING_DIM)

In [113]:
# Insert the prepared records, overwriting any existing record with the same id.
docs.upsert(records=records)

In [114]:
# Build a vector index on the collection. Called with no arguments, so the
# distance measure and index method are whatever vecs uses by default.
docs.create_index()

In [132]:
# Query vector: element 1 of the model output, first (only) item in the batch.
query_vector = query_output[1][0].tolist()

# Return the single closest record. A metadata filter can be supplied through
# the optional `filters` argument, e.g. {"year": {"$eq": 2012}}.
docs.query(data=query_vector, limit=1)

['60']