In [1]:
# Installing all the required packages
!pip install llama-index llama-index-llms-bedrock llama-index-embeddings-bedrock langfuse



In [2]:
# Importing the required packages
import pandas as pd
from langfuse.llama_index import LlamaIndexCallbackHandler
from llama_index.core import Settings, SQLDatabase
from llama_index.core.callbacks import CallbackManager
from llama_index.core.query_engine import NLSQLTableQueryEngine
from llama_index.core.retrievers import NLSQLRetriever
from llama_index.llms.bedrock import Bedrock
from llama_index.embeddings.bedrock import BedrockEmbedding
from sqlalchemy import Boolean, Column, Float, Integer, String, create_engine
from sqlalchemy.orm import declarative_base

In [3]:
# Defining the configuration
REGION_NAME = "us-east-1"
CREDENTIALS_PROFILE_NAME = "MLEngineers"

PUBLIC_KEY = "pk-lf-c18a4846-e103-4db9-8739-328f70bb3b42" # e.g. "pk-1234567890abcdef"
SECRET_KEY = "sk-lf-a0258b40-6177-4c1e-93dd-243224f1b25d" # e.g. "sk-1234567890abcdef"
HOST = "https://cloud.langfuse.com"

EMBEDDER_MODEL_ID = "amazon.titan-embed-text-v2:0"
EMBEDDER_MODEL_KWARGS = {
    "dimensions": 512,
    "normalize": True
}

LLM_MODEL_ID = "anthropic.claude-3-sonnet-20240229-v1:0" # anthropic.claude-3-haiku-20240307-v1:0 or anthropic.claude-3-sonnet-20240229-v1:0 or anthropic.claude-v2:1
LLM_MODEL_KWARGS = {
    "max_tokens": 4096,
    "temperature": 0.1,
    "top_p": 1,
    "top_k": 250,
    "stop_sequences": ["\n\nHuman"]
}

In [4]:
# Defining the ORM class for the database
Base = declarative_base()
class Population(Base):
    __tablename__ = 'transcriptions'
    
    studentID = Column(Integer, primary_key=True)
    age = Column(Integer)
    gender = Column(String)
    height = Column(Float)
    weight = Column(Float)
    bloodType = Column(String)
    bmi = Column(Float)
    temperature = Column(Float)
    heartRate = Column(Integer)
    bloodPressure = Column(Integer)
    cholesterol = Column(Integer)
    diabetes = Column(String)
    smoking = Column(String)

In [5]:
# Creating the database from .csv and storing it in memory
dataframe = pd.read_csv("hf://datasets/ayaalhawat/medical/Medical-Students-Performance-Dataset.csv")
dataframe = dataframe.dropna()
engine = create_engine('sqlite:///:memory:')
Base.metadata.create_all(engine)
dataframe.to_sql('transcriptions', con=engine, if_exists='replace', index=False)
sql_database = SQLDatabase(engine)

In [6]:
# Creating the embedder
embedder = BedrockEmbedding(
    model=EMBEDDER_MODEL_ID,
    model_kwargs=EMBEDDER_MODEL_KWARGS,
    region_name=REGION_NAME,
    #credentials_profile_name=CREDENTIALS_PROFILE_NAME
)

In [7]:
# Creating the LLM and Embedder models
llm = Bedrock(
    region_name=REGION_NAME,
    model=LLM_MODEL_ID,
    model_kwargs=LLM_MODEL_KWARGS)
    #credentials_profile_name=CREDENTIALS_PROFILE_NAME,)

In [8]:
# Creating the callback handler
langfuse_callback = LlamaIndexCallbackHandler(
    public_key=PUBLIC_KEY,
    secret_key=SECRET_KEY,
    host="https://cloud.langfuse.com"
)
Settings.callback_manager = CallbackManager([langfuse_callback])

In [9]:
# Creating the Natural language SQL Table query engine.
query_engine = NLSQLTableQueryEngine(
    sql_database=sql_database, tables=["transcriptions"], llm=llm, embed_model=embedder
)

In [10]:
langfuse_callback.auth_check()

True

In [11]:
# Testing the query engine - 1
query_str = "What is the age of the person with the highest cholesterol levels?"
response = query_engine.query(query_str)
print(response)

Based on the SQL query and result, the response to "What is the age of the person with the highest cholesterol levels?" is:

The age of the person with the highest cholesterol levels is 30.


In [12]:
# Testing the query engine - 2
query_str = "What is the gender that smokes more?"
response = query_engine.query(query_str)
print(response)

Based on the SQL query and its response, the gender that smokes more is Male. The query groups the records from the 'transcriptions' table by 'Gender' where 'Smoking' is 'Yes', counts the number of records for each gender, orders the results by the count in descending order, and takes the first (top) result. The result shows that there are 5085 records where 'Gender' is 'Male' and 'Smoking' is 'Yes', which is higher than the count for any other gender value.


In [13]:
# Testing the query engine - 2
query_str = "What is the average blood pressure for people with blood type AB?"
response = query_engine.query(query_str)
print(response)

Unfortunately, I cannot provide the average blood pressure for people with blood type AB because the SQL query you provided resulted in an error. The error message suggests that the statement is invalid SQL, likely due to an issue with the table or column names used in the query. Without access to the database schema or sample data, I cannot determine the correct way to query for the requested information. Please double-check the table and column names in your query and try again.


In [14]:
# Testing the query engine - 2
query_str = "What is the ID of the student with lowest temperature and has diabetes."
response = query_engine.query(query_str)
print(response)

The student with the lowest temperature who has diabetes has the ID 81091.


In [15]:
# Testing the query engine - 2
query_str = "What is the average BMI for women?"
response = query_engine.query(query_str)
print(response)

Based on the SQL query and result, the response to "What is the average BMI for women?" would be:

The average BMI for women is 23.34.


In [16]:
# Creating the Natural language SQL retriever
nl_sql_retriever = NLSQLRetriever(
    sql_database, tables=["transcriptions"], llm=llm, embed_model=embedder, return_raw=True
)

In [17]:
# Testing the retriever
results = nl_sql_retriever.retrieve(
    "Return me the top 5 ages of the people with the highest weight."
)
print(results)

[NodeWithScore(node=TextNode(id_='f24f31e7-cd29-4193-ad6a-4af89e48bcdb', embedding=None, metadata={'sql_query': 'SELECT Age\nFROM transcriptions\nORDER BY Weight DESC\nLIMIT 5;', 'result': [(31.0,), (26.0,), (26.0,), (19.0,), (18.0,)], 'col_keys': ['Age']}, excluded_embed_metadata_keys=['sql_query', 'result', 'col_keys'], excluded_llm_metadata_keys=['sql_query', 'result', 'col_keys'], relationships={}, text='[(31.0,), (26.0,), (26.0,), (19.0,), (18.0,)]', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=None)]


In [18]:
# Testing the retriever
results = nl_sql_retriever.retrieve(
    "Return me the ID and gender of the student with highest temperature, that smokes and has diabetes."
)
print(results)

[NodeWithScore(node=TextNode(id_='70c21e21-e231-46f7-a61c-1178d4abe077', embedding=None, metadata={'sql_query': 'SELECT "Student ID", Gender\nFROM transcriptions\nWHERE Smoking = \'Yes\' AND Diabetes = \'Yes\'\nORDER BY Temperature DESC\nLIMIT 1;', 'result': [(67082.0, 'Female')], 'col_keys': ['Student ID', 'Gender']}, excluded_embed_metadata_keys=['sql_query', 'result', 'col_keys'], excluded_llm_metadata_keys=['sql_query', 'result', 'col_keys'], relationships={}, text="[(67082.0, 'Female')]", mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=None)]
