# testing RAG with sql databases

In [1]:
import os 
from pyprojroot import here
import chromadb
from openai import AzureOpenAI
import warnings
import pandas as pd

warnings.filterwarnings('ignore')
from dotenv import load_dotenv

print(f'Load environment variables:{load_dotenv()}')


Load environment variables:True


In [2]:
azure_open_api_key=os.getenv('AZURE_OPENAI_API_KEY')
azure_openai_endpoint=os.getenv('AZURE_OpenAI_ENDPOINT')
azure_openai_api_version=os.getenv('AZURE_OpenAI_API_VERSION')

In [3]:
azure_client=AzureOpenAI(
    api_key=azure_open_api_key,
    api_version=azure_openai_api_version,
    azure_endpoint=azure_openai_endpoint
)

chroma_client=chromadb.PersistentClient(path=str(here('data/chroma')))

# create a collection of data injection

In [4]:
# collection = chroma_client.create_collection(name='titanic_small')
collection=chroma_client.create_collection(name='titanic_small2')


UniqueConstraintError: Collection titanic_small2 already exists

In [23]:
file_dir=here('data/for_upload/titanic_small.csv')
df=pd.read_csv(file_dir)

In [25]:
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35,0,0,8.05


In [17]:
df.loc[df['Pclass']==3]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22,1,0,7.25
2,1,3,Miss. Laina Heikkinen,female,26,0,0,7.925
4,0,3,Mr. William Henry Allen,male,35,0,0,8.05
5,0,3,Mr. James Moran,male,27,0,0,8.4583
7,0,3,Master. Gosta Leonard Palsson,male,2,3,1,21.075
8,1,3,Mrs. Oscar W (Elisabeth Vilhelmina Berg) Johnson,female,27,0,2,11.1333


# convert the SQL data to embeddings

In [10]:
docs=[]
metadatas=[]
ids=[]
embeddings=[]

azure_client=AzureOpenAI(
    api_key=azure_open_api_key,
    api_version=azure_openai_api_version,
    azure_endpoint=os.getenv('AZURE_OPENAI_EMBEDDINGS_ENDPOINT')
)

for index,row in df.iterrows():
    output_str=''
    # treat each row as a seperate chunk
    for col in df.columns:
        output_str+=f'{col}:{row[col]},\n'
    response=azure_client.embeddings.create(
        input=output_str,
        model='text-embedding-ada-002'
    )
    embeddings.append(response.data[0].embedding)
    docs.append(output_str)
    metadatas.append({'source':'titanic_small2'})
    ids.append(f'id{index}')


NameError: name 'df' is not defined

In [54]:
docs

['Survived:0,\nPclass:3,\nName:Mr. Owen Harris Braund,\nSex:male,\nAge:22,\nSiblings/Spouses Aboard:1,\nParents/Children Aboard:0,\nFare:7.25,\n',
 'Survived:1,\nPclass:1,\nName:Mrs. John Bradley (Florence Briggs Thayer) Cumings,\nSex:female,\nAge:38,\nSiblings/Spouses Aboard:1,\nParents/Children Aboard:0,\nFare:71.2833,\n',
 'Survived:1,\nPclass:3,\nName:Miss. Laina Heikkinen,\nSex:female,\nAge:26,\nSiblings/Spouses Aboard:0,\nParents/Children Aboard:0,\nFare:7.925,\n',
 'Survived:1,\nPclass:1,\nName:Mrs. Jacques Heath (Lily May Peel) Futrelle,\nSex:female,\nAge:35,\nSiblings/Spouses Aboard:1,\nParents/Children Aboard:0,\nFare:53.1,\n',
 'Survived:0,\nPclass:3,\nName:Mr. William Henry Allen,\nSex:male,\nAge:35,\nSiblings/Spouses Aboard:0,\nParents/Children Aboard:0,\nFare:8.05,\n',
 'Survived:0,\nPclass:3,\nName:Mr. James Moran,\nSex:male,\nAge:27,\nSiblings/Spouses Aboard:0,\nParents/Children Aboard:0,\nFare:8.4583,\n',
 'Survived:0,\nPclass:1,\nName:Mr. Timothy J McCarthy,\nSex:male

In [55]:
print(metadatas)
print(ids)

[{'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}, {'source': 'titanic_small2'}]
['id0', 'id1', 'id2', 'id3', 'id4', 'id5', 'id6', 'id7', 'id8', 'id9', 'id10', 'id11', 'id12', 'id1

In [56]:
embeddings[0][:10]

[-0.005806858651340008,
 -0.022392014041543007,
 -0.017091887071728706,
 -0.025856956839561462,
 0.004077811725437641,
 0.036484602838754654,
 -0.014421279542148113,
 -0.0008816428016871214,
 -0.02277548611164093,
 -0.018639469519257545]

In [57]:
collection.add(
    documents=docs,
    metadatas=metadatas,
    embeddings=embeddings,
    ids=ids
)

In [58]:
print(collection.count())

30


# RAG operations

In [5]:
from openai import AzureOpenAI


In [11]:
model_name=os.getenv('AZURE_OPENAI_DEPLOYMENT_MODEL')
azure_openai_api_key=os.getenv('AZURE_OPENAI_API_KEY')
azure_openai_endpoint=os.getenv('AZURE_OpenAI_ENDPOINT')
azure_client=AzureOpenAI(
    api_key=azure_open_api_key,
    api_version=azure_openai_api_version,
    azure_endpoint=os.getenv('AZURE_OPENAI_EMBEDDINGS_ENDPOINT')
)
llm=AzureOpenAI(
    api_version=os.getenv('AZURE_OpenAI_API_VERSION'),
    azure_endpoint=azure_openai_endpoint,
    api_key=os.getenv('AZURE_OPENAI_API_KEY')
)

In [12]:
query_texts='what is the maximum age of a male survivor'
response=azure_client.embeddings.create(
    input=query_texts,
    model='text-embedding-ada-002'
)
query_embeddings=response.data[0].embedding


# load the chromaDB collection for vector search

In [13]:
vectordb=chroma_client.get_collection(name='titanic_small2')

In [14]:
result=vectordb.query(
    query_embeddings=query_embeddings,
    n_results=1
)
result

{'ids': [['id23']],
 'distances': [[0.47334641130996247]],
 'metadatas': [[{'source': 'titanic_small2'}]],
 'embeddings': None,
 'documents': [['Survived:1,\nPclass:1,\nName:Mr. William Thompson Sloper,\nSex:male,\nAge:28,\nSiblings/Spouses Aboard:0,\nParents/Children Aboard:0,\nFare:35.5,\n']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [15]:
system_role = "You will recieve the user's question along with the search results of that question over a database. Give the user the proper answer."
prompt = f"User's question: {query_texts} \n\n Search results:\n {result}"

message=[
    {
        'role':'system','content':str(system_role)
    },
    {
        'role':'user','content':prompt
    }
]

In [16]:
response=llm.chat.completions.create(
    model=os.getenv('AZURE_OPENAI_DEPLOYMENT_MODEL'),
    messages=message
)


In [17]:
response.choices[0].message.content

'The maximum age of a male survivor is not specified in the provided search results.'

# fact check

- below response is totally incorrect
- therefore we choose SQL agent for this scenario

In [89]:
print(df.loc[(df['Survived']==1) & (df['Age']==max(df['Age']))])

    Survived  Pclass                     Name     Sex  Age  \
11         1       1  Miss. Elizabeth Bonnell  female   58   

    Siblings/Spouses Aboard  Parents/Children Aboard   Fare  
11                        0                        0  26.55  
