### PGVector RAG Tool for Question Answering - RAGwithAutogenTools

https://microsoft.github.io/autogen/0.2/docs/notebooks/agentchat_RetrieveChat_pgvector/

In [1]:
from dotenv import load_dotenv
load_dotenv() 
import numpy as np
import pandas as pd
import json
import os
import openai
import time
from openai import AzureOpenAI
import chromadb
import psycopg
from sentence_transformers import SentenceTransformer
import autogen
from autogen import AssistantAgent
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
from autogen import ConversableAgent, UserProxyAgent, config_list_from_json
from autogen.retrieve_utils import TEXT_FORMATS
import psycopg2
from psycopg2.extras import execute_values
from pgvector.psycopg2 import register_vector
from autogen import AssistantAgent, ConversableAgent, UserProxyAgent

  from .autonotebook import tqdm as notebook_tqdm
flaml.automl is not available. Please install flaml[automl] to enable AutoML functionalities.


Reference: https://microsoft.github.io/autogen/0.2/docs/reference/agentchat/contrib/vectordb/pgvectordb

In [2]:
# Shared LLM settings for every agent in this notebook: one Azure-hosted
# gpt-4o deployment whose key, endpoint and API version are read from the
# environment, with deterministic sampling and a 300 second request timeout.
llm_config = dict(
    config_list=[
        dict(
            model="gpt-4o",
            api_key=os.getenv("OPENAI_API_KEY"),
            api_type="azure",
            base_url=os.getenv("OPENAI_API_BASE"),
            api_version=os.getenv("API_VERSION"),
        ),
    ],
    temperature=0.0,
    timeout=300,
)

In [3]:
def embed_openai(text: str):
    """Embed ``text`` with the Azure OpenAI ``text-embedding-ada-002`` model.

    A new ``AzureOpenAI`` client is built on every call from the
    ``OPENAI_API_KEY`` and ``OPENAI_API_BASE`` environment variables.

    Returns:
        The single embedding as a NumPy array when the response holds exactly
        one item; otherwise a NumPy array built from all returned embeddings.
    """
    azure_client = AzureOpenAI(
        azure_endpoint=os.getenv("OPENAI_API_BASE"),
        api_version="2024-02-01",
        api_key=os.getenv("OPENAI_API_KEY"),
    )
    result = azure_client.embeddings.create(
        model="text-embedding-ada-002",
        input=text,
    )

    vectors = []
    for item in result.data:
        vectors.append(np.array(item.embedding))

    # A lone embedding is handed back unwrapped rather than as a 1-row matrix.
    return vectors[0] if len(vectors) == 1 else np.array(vectors)

In [4]:
# Reset the demo database: list every table in the `public` schema and drop it
# so the rest of the notebook starts from a clean slate.
# WARNING: this is destructive -- only run it against a throwaway database.
from psycopg2 import sql  # cell-local import: safe quoting of table identifiers

conn = psycopg2.connect(
    dbname='test',
    user='postgres',
    password='test_password',
    host='localhost',
    port='5433'
)

try:
    # The cursor context manager closes the cursor even if a statement fails.
    with conn.cursor() as cur:
        # information_schema.tables also lists views; DROP TABLE on a view
        # raises an error, so restrict the listing to real tables.
        cur.execute(
            "SELECT table_name FROM information_schema.tables "
            "WHERE table_schema = 'public' AND table_type = 'BASE TABLE';"
        )
        tables = cur.fetchall()

        for table in tables:
            print(table)

        for table in tables:
            table_name = table[0]
            print(f"Dropping table: {table_name}")

            # Compose the identifier with psycopg2.sql instead of an f-string so
            # mixed-case or special-character names are quoted correctly and
            # cannot inject SQL.
            drop_statement = sql.SQL("DROP TABLE IF EXISTS {}.{} CASCADE;").format(
                sql.Identifier("public"),
                sql.Identifier(table_name),
            )
            cur.execute(drop_statement)
            conn.commit()
finally:
    # Always release the connection, including when a statement above raised.
    conn.close()

print("All tables in 'public' schema have been dropped.")

('policies',)
Dropping table: policies
All tables in 'public' schema have been dropped.


### Data

In [5]:
# Five sample driver-training policies: an id, the policy text, and a short
# title (stored under "metadatas") for each.
data = dict(
    id=['442515', '752565', '742142', '741265', '147785'],
    documents=[
        'All new drivers must complete a state-approved education course that covers traffic laws, signs, and safe driving practices.',
        'Driver must complete a minimum of 10 hours of behind-the-wheel driving instruction with a licensed instructor.',
        'Drivers over 65 must complete a refresher course to renew their driver license, focusing on changes in road safety and laws.',
        'Drivers seeking an advanced license must complete a defensive driving course to improve skills in handling challenging road conditions.',
        'Teenagers between the ages of 16-18 are required to complete a state-certified driver safety course before obtaining a license.',
    ],
    metadatas=[
        'Driver Education Course Requirement',
        'Behind-the-Wheel Training',
        'License Renewal Training',
        'Advanced Driving Course',
        'Teen Driver Safety Training',
    ],
)

# One embedding request per document; the vectors land in a new "embedding" column.
df = pd.DataFrame(data).assign(
    embedding=lambda frame: frame["documents"].apply(embed_openai)
)

display(df.head())

Unnamed: 0,id,documents,metadatas,embedding
0,442515,All new drivers must complete a state-approved...,Driver Education Course Requirement,"[0.009563765488564968, -0.0038687754422426224,..."
1,752565,Driver must complete a minimum of 10 hours of ...,Behind-the-Wheel Training,"[-0.001858723466284573, -0.0006039661238901317..."
2,742142,Drivers over 65 must complete a refresher cour...,License Renewal Training,"[0.005323335062712431, -0.007420213893055916, ..."
3,741265,Drivers seeking an advanced license must compl...,Advanced Driving Course,"[-0.0024872892536222935, 0.004641841631382704,..."
4,147785,Teenagers between the ages of 16-18 are requir...,Teen Driver Safety Training,"[0.013322370126843452, -0.003426025155931711, ..."


In [6]:
# Persist the embedded policies into a pgvector-backed `policies` table.
# Re-running the cell is safe: rows are upserted on their primary key.

# Build the rows before opening the connection so a failure in this purely
# local step cannot leave a database connection dangling.
data_list = [
    (
        doc_id,
        document,
        metadata,
        # NumPy vectors are converted to plain lists before being sent to the driver.
        embedding.tolist() if isinstance(embedding, np.ndarray) else embedding,
    )
    for doc_id, document, metadata, embedding in zip(
        df['id'], df['documents'], df['metadatas'], df['embedding']
    )
]

create_table_query = """
CREATE TABLE IF NOT EXISTS policies (
    id TEXT PRIMARY KEY,
    documents TEXT,
    metadatas TEXT,
    embedding vector(1536)  -- Adjust the size to match your model's embedding dimensions
);
"""

insert_query = """
INSERT INTO policies (id, documents, metadatas, embedding)
VALUES %s
ON CONFLICT (id) DO UPDATE
SET documents = EXCLUDED.documents,
    metadatas = EXCLUDED.metadatas,
    embedding = EXCLUDED.embedding;
"""

conn = psycopg2.connect(
    dbname='test',
    user='postgres',
    password='test_password',
    host='localhost',
    port='5433'
)

try:
    # The cursor context manager closes the cursor even if a statement fails.
    with conn.cursor() as cur:
        # The vector type must exist before a table can declare a vector column.
        cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
        conn.commit()

        cur.execute(create_table_query)
        conn.commit()

        # execute_values expands the single VALUES %s placeholder into all rows.
        execute_values(cur, insert_query, data_list)
        conn.commit()
finally:
    # Always release the connection, including when a statement above raised.
    conn.close()

print(f"{len(data_list)} rows inserted or updated successfully.")

5 rows inserted or updated successfully.


In [7]:
def get_closest_doc(text: str) -> str:
    """Return the stored policies most similar to ``text`` as a JSON string.

    The query is embedded with the Azure OpenAI ``text-embedding-ada-002``
    model, then the ``policies`` table is ordered by pgvector's ``<=>``
    distance to that embedding and the first three rows are kept.

    Args:
        text: Natural-language query to search for.

    Returns:
        A JSON object (indented by 2) keyed ``"1"``, ``"2"``, ... in ranking
        order. Each value has a ``"title"`` (the row's ``metadatas`` column)
        and a ``"content"`` (the row's ``documents`` column). Fewer than three
        entries are returned when the table holds fewer rows.

    Raises:
        Any exception raised by the embedding request or by the database
        driver is propagated; the cursor and connection are closed first.
    """
    client = AzureOpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        api_version="2024-02-01",
        azure_endpoint=os.getenv("OPENAI_API_BASE")
    )

    embedding_response = client.embeddings.create(
        input=text, model="text-embedding-ada-002"
    )
    # Read the vector straight off the response object (as embed_openai does)
    # instead of serialising the whole response to JSON and parsing it back.
    embedding = embedding_response.data[0].embedding

    conditional_query = """
        SELECT documents, metadatas
        FROM policies
        ORDER BY embedding <=> %s
        LIMIT 3;
    """

    psql = psycopg2.connect(
        dbname=os.getenv("MYPOSTGRES_DATABASE"),
        user=os.getenv("MYPOSTGRES_USER"),
        password="test_password",
        host=os.getenv("MYPOSTGRES_HOST"),
        port=os.getenv("MYPOSTGRES_PORT")
    )
    try:
        # Teach this connection to adapt NumPy arrays to the pgvector type.
        register_vector(psql)

        cursor = psql.cursor()
        try:
            cursor.execute(conditional_query, (np.array(embedding),))
            top3_docs = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Release the connection even when the query fails, so repeated tool
        # calls from the agent cannot pile up open connections.
        psql.close()

    # Each row is (documents, metadatas); number the hits from 1 in ranking order.
    numbered_docs = {
        str(rank): {'title': title, 'content': content}
        for rank, (content, title) in enumerate(top3_docs, start=1)
    }

    return json.dumps(numbered_docs, indent=2)

In [8]:
# Smoke-test the retrieval tool directly before handing it to the agents.
sample_question = "Who should complete a state-certified driver safety course"
print(get_closest_doc(sample_question))

{
  "1": {
    "title": "Teen Driver Safety Training",
    "content": "Teenagers between the ages of 16-18 are required to complete a state-certified driver safety course before obtaining a license."
  },
  "2": {
    "title": "Driver Education Course Requirement",
    "content": "All new drivers must complete a state-approved education course that covers traffic laws, signs, and safe driving practices."
  },
  "3": {
    "title": "Advanced Driving Course",
    "content": "Drivers seeking an advanced license must complete a defensive driving course to improve skills in handling challenging road conditions."
  }
}


### PGVector RAG Agent

In [9]:
# Prompt for the retrieval agent, written as an indented block for readability.
_raw_policy_expert_prompt = """
    You are a regulatory expert with access to a database of the policies.
    You are going to take a query, and use the tool get_closest_doc`
    which take a string argument, and return a list of strings with 3 documents
    of policies on the database. You are meant to format it as follows
    numbered_docs = {str(i + 1): {'title': doc[1], 'content': doc[0]} for i, doc in enumerate(top3_docs)}
    return json.dumps(numbered_docs, indent=2)
"""

# Collapse every run of whitespace (newlines, indentation) into single spaces.
policy_expert_message = " ".join(_raw_policy_expert_prompt.split())

# Agent that proposes get_closest_doc tool calls; it never asks a human for input.
policy_expert_agent = ConversableAgent(
    llm_config=llm_config,
    human_input_mode="NEVER",
    name="PolicyExpert",
    system_message=policy_expert_message,
)

In [10]:
# Agent that stands in for the person asking questions; it runs without human input.
user_system_message = "You are asking questions to your assistants and reporting back in a detailed tone."

user = ConversableAgent(
    llm_config=llm_config,
    human_input_mode="NEVER",
    name="user",
    system_message=user_system_message,
)

### Registering Tools

In [11]:
# Advertise get_closest_doc to the PolicyExpert's LLM so it can propose calls to it.
advertise_tool = policy_expert_agent.register_for_llm(
    description="Gets a list of documents of policies related to the user query.",
    name="get_closest_doc",
)
advertise_tool(get_closest_doc)

<function __main__.get_closest_doc(text: str) -> str>

In [12]:
# Let the `user` agent execute get_closest_doc when a tool call for it is suggested.
run_tool = user.register_for_execution(name="get_closest_doc")
run_tool(get_closest_doc)

<function __main__.get_closest_doc(text: str) -> str>

### User Query

In [13]:
# Question used as the opening message of the chat with the PolicyExpert agent.
user_query = "What policies apply to 16-year-olds?"

In [14]:
# A single chat: `user` sends the query to the PolicyExpert, the exchange is
# capped at 3 turns, and the final message is used as the chat summary.
policy_chat = dict(
    recipient=policy_expert_agent,
    message=user_query,
    max_turns=3,
    summary_method="last_msg",
)

chat_results = user.initiate_chats([policy_chat])

[34m
********************************************************************************[0m
[34mStarting a new chat....[0m
[34m
********************************************************************************[0m
[33muser[0m (to PolicyExpert):

What policies apply to 16-year-olds?

--------------------------------------------------------------------------------
[33mPolicyExpert[0m (to user):

[32m***** Suggested tool call (call_TbLOBYsB2VDPWAtW6zDVjoYp): get_closest_doc *****[0m
Arguments: 
{"text":"policies for 16-year-olds"}
[32m********************************************************************************[0m

--------------------------------------------------------------------------------
[35m
>>>>>>>> EXECUTING FUNCTION get_closest_doc...[0m
[33muser[0m (to PolicyExpert):

[33muser[0m (to PolicyExpert):

[32m***** Response from calling tool (call_TbLOBYsB2VDPWAtW6zDVjoYp) *****[0m
{
  "1": {
    "title": "Teen Driver Safety Training",
    "content": "Teenagers b