### Using RetrieveChat Powered by PGVector for Question Answering - ragproxyagent

https://microsoft.github.io/autogen/0.2/docs/notebooks/agentchat_RetrieveChat_pgvector/

In [20]:
from dotenv import load_dotenv
load_dotenv() 
import numpy as np
import pandas as pd
import json
import os
import openai
import time
from openai import AzureOpenAI
import chromadb
import psycopg
from sentence_transformers import SentenceTransformer
import autogen
from autogen import AssistantAgent
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
from autogen.retrieve_utils import TEXT_FORMATS
import psycopg2
from psycopg2.extras import execute_values
from pgvector.psycopg2 import register_vector

Reference: https://microsoft.github.io/autogen/0.2/docs/reference/agentchat/contrib/vectordb/pgvectordb

In [21]:
llm_config = {
    "config_list": [
        {
            "model": "gpt-4o",
            "api_key": os.getenv("OPENAI_API_KEY"),
            "api_type": "azure",
            "base_url": os.getenv("OPENAI_API_BASE"),
            "api_version": os.getenv("API_VERSION"),
        },
    ],
    "temperature": 0,
    "timeout": 300,
}

In [22]:
embed = SentenceTransformer("all-MiniLM-L6-v2").encode

In [4]:
conn = psycopg2.connect(
    dbname='test',
    user='postgres',
    password='test_password',
    host='localhost',
    port='5433'
)

cur = conn.cursor()

cur.execute("SELECT table_name FROM information_schema.tables WHERE table_schema = 'public';")
tables = cur.fetchall()

for table in tables:
    print(table)

for table in tables:
    table_name = table[0]  
    print(f"Dropping table: {table_name}") 
    
    cur.execute(f"DROP TABLE IF EXISTS {table_name} CASCADE;")
    conn.commit() 

cur.close()
conn.close()

print("All tables in 'public' schema have been dropped.")

('policies',)
Dropping table: policies
All tables in 'public' schema have been dropped.


### data

In [5]:
data = {
    "id": ['442515', '752565', '742142', '741265', '147785'],
    "documents": [
        'All new drivers must complete a state-approved education course that covers traffic laws, signs, and safe driving practices.',
        'Driver must complete a minimum of 10 hours of behind-the-wheel driving instruction with a licensed instructor.',
        'Drivers over 65 must complete a refresher course to renew their driver license, focusing on changes in road safety and laws.',
        'Drivers seeking an advanced license must complete a defensive driving course to improve skills in handling challenging road conditions.',
        'Teenagers between the ages of 16-18 are required to complete a state-certified driver safety course before obtaining a license.'
    ],
    "metadatas": [
        'Driver Education Course Requirement', 
        'Behind-the-Wheel Training', 
        'License Renewal Training', 
        'Advanced Driving Course', 
        'Teen Driver Safety Training'
    ]
}

df = pd.DataFrame(data)

df['embedding'] = df["documents"].apply(lambda x: embed(x)) 

display(df.head())

Unnamed: 0,id,documents,metadatas,embedding
0,442515,All new drivers must complete a state-approved...,Driver Education Course Requirement,"[0.05125157, -0.03314669, 0.06757845, 0.006306..."
1,752565,Driver must complete a minimum of 10 hours of ...,Behind-the-Wheel Training,"[0.047425624, 0.016006604, -0.014129242, 0.027..."
2,742142,Drivers over 65 must complete a refresher cour...,License Renewal Training,"[-0.01847234, -0.026594533, 0.026502257, 0.020..."
3,741265,Drivers seeking an advanced license must compl...,Advanced Driving Course,"[-0.0005271046, 0.0036089937, 0.0009753169, 0...."
4,147785,Teenagers between the ages of 16-18 are requir...,Teen Driver Safety Training,"[0.02511883, 0.06068096, -0.018803362, -0.0313..."


In [6]:
len(df["embedding"][0])

384

In [7]:
conn = psycopg2.connect(
    dbname='test',
    user='postgres',
    password='test_password',
    host='localhost',
    port='5433'
)

cur = conn.cursor()

cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
conn.commit()

data_list = [
    (
        row['id'], 
        row['documents'], 
        row['metadatas'], 
        row['embedding'].tolist() if isinstance(row['embedding'], np.ndarray) else row['embedding']
    )
    for index, row in df.iterrows()
]

create_table_query = """
CREATE TABLE IF NOT EXISTS policies (
    id TEXT PRIMARY KEY, 
    documents TEXT,
    metadatas TEXT,  
    embedding vector(384)  -- Adjust the size to match your model's embedding dimensions
);
"""

cur.execute(create_table_query)
conn.commit()

insert_query = """
INSERT INTO policies (id, documents, metadatas, embedding) 
VALUES %s
ON CONFLICT (id) DO UPDATE 
SET documents = EXCLUDED.documents,
    metadatas = EXCLUDED.metadatas,
    embedding = EXCLUDED.embedding;
"""

execute_values(cur, insert_query, data_list)

conn.commit()

cur.close()
conn.close()

print(f"{len(data_list)} rows inserted or updated successfully.")

5 rows inserted or updated successfully.


In [8]:
assistant = AssistantAgent(
    name="assistant",
    system_message="""You are a helpful assistant. You must always reply with some form of text.""",
    llm_config=llm_config,
)

conn = psycopg.connect(conninfo="postgresql://postgres:test_password@localhost:5433/test", autocommit=True)

ragproxyagent = RetrieveUserProxyAgent(
    name="ragproxyagent",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=1,
    retrieve_config={
        "task": "qa",
        "docs_path": None,
        "chunk_token_size": 100,
        "model": 'gpt-4o',
        "vector_db": "pgvector",  
        "collection_name": "policies",
        "db_config": {
            "conn": conn, 
        },
        "get_or_create": True,  # set to False if you don't want to reuse an existing collection
        "overwrite": False,  # set to True if you want to overwrite an existing collection
        "distance_threshold": 0.9,
        "embedding_function": embed, 
    },
    code_execution_config=False,  # set to False if you don't want to execute the code
)

In [9]:
qa_problem = "Who should complete a state-certified driver safety course"
chat_result = ragproxyagent.initiate_chat(assistant, message=ragproxyagent.message_generator, problem=qa_problem)

VectorDB returns doc_ids:  [['442515', '741265', '147785']]
[32mAdding content of doc 442515 to context.[0m
[32mAdding content of doc 741265 to context.[0m
[32mAdding content of doc 147785 to context.[0m
[33mragproxyagent[0m (to assistant):

You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the
context provided by the user.
If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.
You must give as short an answer as possible.

User's question is: Who should complete a state-certified driver safety course

Context is: All new drivers must complete a state-approved education course that covers traffic laws, signs, and safe driving practices.
Drivers seeking an advanced license must complete a defensive driving course to improve skills in handling challenging road conditions.
Teenagers between the ages of 16-18 are required to complete a state-certified driver safety course before

### Retrieve Augmented Code Generation

In [10]:
assistant.reset()

# given a problem, we use the ragproxyagent to generate a prompt to be sent to the assistant as the initial message.
# the assistant receives the message and generates a response. The response will be sent back to the ragproxyagent for processing.
# The conversation continues until the termination condition is met, in RetrieveChat, the termination condition when no human-in-loop is no code block detected.
# With human-in-loop, the conversation will continue until the user says "exit".
code_problem = "Any rules for Teenagers?"
chat_result = ragproxyagent.initiate_chat(
    assistant, message=ragproxyagent.message_generator, problem=code_problem, search_string="spark"
)

VectorDB returns doc_ids:  [[]]
[32mNo more context, will terminate.[0m
[33mragproxyagent[0m (to assistant):

TERMINATE

--------------------------------------------------------------------------------


### OpenAI Embedding

In [11]:
embed = SentenceTransformer("all-MiniLM-L6-v2").encode
print(type(embed))

embeddings = embed(["Bye","Hi"])
print(type(embeddings))
print(f"Embeddings shape: {embeddings.shape}")

<class 'method'>
<class 'numpy.ndarray'>
Embeddings shape: (2, 384)


In [12]:
# class EmbeddingModel:
#     def __init__(self):
#         self.client = AzureOpenAI(
#             api_key=os.getenv("OPENAI_API_KEY"),
#             api_version="2024-02-01",
#             azure_endpoint=os.getenv("OPENAI_API_BASE")
#         )

#     def embed_openai(self, text: str):
#         response = self.client.embeddings.create(
#             input=text,
#             model="text-embedding-ada-002"
#         )
        
#         embeddings = np.array([np.array(embedding.embedding) for embedding in response.data])
        
#         return embeddings


# def embed_openai(text):
#     model = EmbeddingModel() 
#     return model.embed_openai(text)

In [13]:
def my_embedding_factory():
    client = AzureOpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        api_version="2024-02-01",
        azure_endpoint=os.getenv("OPENAI_API_BASE")
    )

    def my_embedding(text):
        response = client.embeddings.create(
            input=text,
            model="text-embedding-ada-002"  # Ensure this is the correct model
        )

        embeddings = [np.array(embedding.embedding) for embedding in response.data]
        
        for i, embedding in enumerate(embeddings):
            if embedding.shape[0] != 1536:
                raise ValueError(f"Embedding {i} has an incorrect dimension: {embedding.shape[0]} (expected 1536).")

        if len(embeddings) == 1:
            return embeddings[0]

        return np.array(embeddings)

    return my_embedding


my_embedding = my_embedding_factory()

In [14]:
def embed_openai(text: str):
    client = AzureOpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        api_version="2024-02-01",
        azure_endpoint=os.getenv("OPENAI_API_BASE")
    )

    response = client.embeddings.create(
        input=text,
        model="text-embedding-ada-002"
    )

    embeddings = [np.array(embedding.embedding) for embedding in response.data]

    if len(embeddings) == 1:
        return embeddings[0]

    return np.array(embeddings)

# def embed_openai(text):
#     client = AzureOpenAI(
#         api_key=os.getenv("OPENAI_API_KEY"),  
#         api_version="2024-02-01",
#         azure_endpoint=os.getenv("OPENAI_API_BASE")
#     )

#     response = client.embeddings.create(
#         input=text, 
#         model="text-embedding-ada-002"
#     )

#     embeddings = [np.array(embedding.embedding) for embedding in response.data]

#     if len(embeddings) == 1:
#         embeddings = np.expand_dims(embeddings[0], axis=0)  # Convert single embedding to 2D

#     return np.array(embeddings)

print(type(embed_openai))
embeddings = embed_openai(["Bye","Hi"])
print(type(embeddings))
print(f"Embeddings shape: {embeddings.shape}")

<class 'function'>
<class 'numpy.ndarray'>
Embeddings shape: (2, 1536)


In [15]:
conn = psycopg2.connect(
    dbname='test',
    user='postgres',
    password='test_password',
    host='localhost',
    port='5433'
)

cur = conn.cursor()

cur.execute("SELECT table_name FROM information_schema.tables WHERE table_schema = 'public';")
tables = cur.fetchall()

for table in tables:
    print(table)

('policies',)


In [23]:
new_data = {
    "id": ['442515', '752565', '742142', '741265', '147785'],
    "documents": [
        'All new drivers must complete a state-approved education course that covers traffic laws, signs, and safe driving practices.',
        'Driver must complete a minimum of 10 hours of behind-the-wheel driving instruction with a licensed instructor.',
        'Drivers over 65 must complete a refresher course to renew their driver license, focusing on changes in road safety and laws.',
        'Drivers seeking an advanced license must complete a defensive driving course to improve skills in handling challenging road conditions.',
        'Teenagers between the ages of 16-18 are required to complete a state-certified driver safety course before obtaining a license.'
    ],
    "metadatas": [
        'Driver Education Course Requirement', 
        'Behind-the-Wheel Training', 
        'License Renewal Training', 
        'Advanced Driving Course', 
        'Teen Driver Safety Training'
    ]
}

df = pd.DataFrame(new_data)

df['embedding'] = df["documents"].apply(lambda x: embed_openai(x)) 

display(df.head())

Unnamed: 0,id,documents,metadatas,embedding
0,442515,All new drivers must complete a state-approved...,Driver Education Course Requirement,"[0.009563765488564968, -0.0038687754422426224,..."
1,752565,Driver must complete a minimum of 10 hours of ...,Behind-the-Wheel Training,"[-0.001858723466284573, -0.0006039661238901317..."
2,742142,Drivers over 65 must complete a refresher cour...,License Renewal Training,"[0.005323335062712431, -0.007420213893055916, ..."
3,741265,Drivers seeking an advanced license must compl...,Advanced Driving Course,"[-0.0024872892536222935, 0.004641841631382704,..."
4,147785,Teenagers between the ages of 16-18 are requir...,Teen Driver Safety Training,"[0.013322370126843452, -0.003426025155931711, ..."


In [24]:
conn = psycopg2.connect(
    dbname='test',
    user='postgres',
    password='test_password',
    host='localhost',
    port='5433'
)

cur = conn.cursor()

cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
conn.commit()

create_table_query = """
CREATE TABLE IF NOT EXISTS policies_new (
    id TEXT PRIMARY KEY, 
    documents TEXT,
    metadatas TEXT,  
    embedding vector(1536)
);
"""

cur.execute(create_table_query)
conn.commit()

data_list = [
    (
        row['id'],
        row['documents'],
        row['metadatas'],
        row['embedding'].tolist() if isinstance(row['embedding'], np.ndarray) else row['embedding']
    )
    for index, row in df.iterrows()
]

execute_values(cur, "INSERT INTO policies_new (id, documents, metadatas, embedding) VALUES %s", data_list)

conn.commit()

cur.close()
conn.close()

print(f"{len(data_list)} rows inserted or updated successfully.")

5 rows inserted or updated successfully.


In [18]:
assistant = AssistantAgent(
    name="assistant",
    system_message="""You are a helpful assistant. You must always reply with some form of text.""",
    llm_config=llm_config,
)

conn = psycopg.connect(conninfo="postgresql://postgres:test_password@localhost:5433/test", autocommit=True)

ragproxyagent = RetrieveUserProxyAgent(
    name="ragproxyagent",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=1,
    retrieve_config={
        "task": "qa",
        "docs_path": None,
        "chunk_token_size": 100,
        "model": 'gpt-4o',
        "vector_db": "pgvector",
        "collection_name": "policies_new",
        "db_config": {
            "conn": conn, 
        },
        "get_or_create": True,  # set to False if you don't want to reuse an existing collection
        "overwrite": True,  # set to True if you want to overwrite an existing collection
        "distance_threshold": 0.9,
        "embedding_function": my_embedding, 
    },
    code_execution_config=False,  # set to False if you don't want to execute the code
)

In [19]:
qa_problem = "Who should complete a state-certified driver safety course"
chat_result = ragproxyagent.initiate_chat(assistant, message=ragproxyagent.message_generator, problem=qa_problem)

TypeError: object of type 'numpy.float64' has no len()

Check Gerarfo's work to fix the error with PGVectorDB