## Setup: load environment variables and create the Supabase and Gemini clients

In [3]:
import os
from supabase import create_client, Client
from dotenv import load_dotenv 
from llama_index.llms.gemini import Gemini
# Read the development secrets file into the process environment.
load_dotenv("../../secrets/.env.dev")

# Connection settings; each lookup yields None when the variable is unset.
url = os.getenv("SUPABASE_URL")
key = os.getenv("SUPABASE_KEY")
password = os.getenv("SUPABASE_PASSWORD")
ref = os.getenv("SUPABASE_REF")
gemKey = os.getenv("GEMINI_API_KEY")

# Shared clients used by the cells below.
supabase: Client = create_client(url, key)
# NOTE(review): the keyword here is spelled `models=`; if the Gemini
# constructor expects `model=`, this value is not selecting the model.
# Confirm against the installed llama-index Gemini signature before changing.
gemini = Gemini(models="models/gemini-pro", api_key=gemKey)

  gemini = Gemini(models="models/gemini-pro", api_key=gemKey)


In [4]:
#Fetching functions
from llama_index.core import Document

def fetch_articleDocs():
    """Load every row of the ``Article_Entry`` table as a ``Document``.

    Each row's ``contents`` field becomes the document text; its ``ent_id``,
    ``art_num`` and ``belongs_to`` fields are carried along as metadata under
    the keys ``id``, ``art_num`` and ``belongs_to``.

    Returns:
        A list with one ``Document`` per row returned by the query, in the
        order the rows were returned.
    """
    rows = supabase.table("Article_Entry").select("*").execute().data

    documents = []
    for row in rows:
        body = row["contents"]
        meta = {
            "id": row["ent_id"],
            "art_num": row["art_num"],
            "belongs_to": row["belongs_to"],
        }
        documents.append(Document(text=body, metadata=meta))
    return documents

def fetch_document(doc_id):
    """Return the ``content`` field of the ``Document`` row matching ``doc_id``.

    Args:
        doc_id: Value compared for equality against the ``doc_id`` column.

    Returns:
        The ``content`` value of the first row returned by the query.

    Raises:
        IndexError: If the query returns no rows for ``doc_id``.
    """
    response = supabase.table("Document").select("*").eq("doc_id", doc_id).execute()
    rows = response.data
    if not rows:
        # Same exception type the bare ``response.data[0]`` lookup raised on
        # an empty result, but with a message that names the missing id.
        raise IndexError(f"No row in 'Document' with doc_id={doc_id!r}")
    return rows[0]["content"]

# Load every article once up front and print how many were fetched as a quick sanity check.
articles = fetch_articleDocs()
print(len(articles))

385


In [5]:
# Create embeddings
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.google_genai import GoogleGenAIEmbedding

# Embedding model used to vectorise the article documents.
# NOTE(review): llama-index documents this constructor's model keyword as
# `model_name`, and Google's published id is "text-embedding-004"; confirm
# that `model="models/embedding-004"` is actually honoured by the installed
# version before relying on it.
emb_model = GoogleGenAIEmbedding(
    model="models/embedding-004",
    api_key=os.environ["GEMINI_API_KEY"]
)

# Reuse the documents already loaded into `articles` by the fetching cell
# instead of issuing the same full-table query a second time.
docs = articles
index = VectorStoreIndex.from_documents(docs, embed_model=emb_model)

In [None]:
# Set up HyDE querying
import json
from llama_index.core.indices.query.query_transform import HyDEQueryTransform
from llama_index.core.query_engine import TransformQueryEngine

# Base query engine over the article index: Gemini as the LLM, similarity_top_k set to 3.
query_engine = index.as_query_engine(llm=gemini, similarity_top_k=3)

# Wrap the base engine so each query first passes through a HyDE query
# transform driven by the same Gemini LLM (include_original=True).
hyde = HyDEQueryTransform(llm=gemini, include_original=True)
hyde_query_engine = TransformQueryEngine(query_engine, hyde)

# Function for finding laws
def find_articles(document):
    """Retrieve the articles that best match a feature document, as JSON.

    The document text is embedded in a fixed question and passed to
    ``hyde_query_engine.retrieve``; every returned node is summarised from its
    metadata and score.

    Args:
        document: Text describing the feature; interpolated into the query.

    Returns:
        A JSON string encoding a list of objects with the keys
        ``article_id``, ``art_num``, ``belongs_to`` and ``score``. Metadata
        keys missing from a node are emitted as ``null``.
    """
    # Adjacent string literals are joined with no separator, so the trailing
    # space after ``{document}`` keeps the document text from running straight
    # into the follow-up sentence.
    query = (
        f"Are any of the legal articles relevant to the feature described in the following document. {document} "
        "If so, which articles."
    )

    hits = hyde_query_engine.retrieve(query)
    # Named ``matches`` (not ``articles``) to avoid shadowing the notebook-level
    # ``articles`` list of documents.
    matches = [
        {
            "article_id": hit.node.metadata.get("id"),
            "art_num": hit.node.metadata.get("art_num"),
            "belongs_to": hit.node.metadata.get("belongs_to"),
            "score": hit.score,
        }
        for hit in hits
    ]

    return json.dumps(matches)

In [14]:
## Testing
import pandas as pd

# Smoke test: retrieve articles for the stored Document row with doc_id 4 and print the JSON result.
relevant_articles = find_articles(fetch_document(4))
print(relevant_articles)

def test_csv(num):
    df = pd.read_csv("synth_data.csv", on_bad_lines="skip") 

    # Example: combine first two columns into a single string per row
    df["document_text"] = df[df.columns[0]].astype(str) + " " + df[df.columns[1]].astype(str)

    # Pick one row to test
    test_document = df.iloc[num]["document_text"]

    # Run your RAG + HyDE function
    print(test_document)
    json_result = find_articles(test_document)
    print(json_result)

# Run the CSV check on the row at positional index 0.
test_csv(0)

[{"article_id": 318, "art_num": "Article 27", "belongs_to": "EU Digital Service Act", "score": 0.6854756893658467}, {"article_id": 325, "art_num": "Article 34", "belongs_to": "EU Digital Service Act", "score": 0.6852535334107005}]
AI-Generated Recommendations Suggests posts based on engagement history and location.
[{"article_id": 318, "art_num": "Article 27", "belongs_to": "EU Digital Service Act", "score": 0.7006508623712223}, {"article_id": 194, "art_num": "Recital 70", "belongs_to": "EU Digital Service Act", "score": 0.6853889034352313}]
