### Load libraries

In [19]:
from dotenv import load_dotenv
import os

from langchain_cohere.chat_models import ChatCohere
from langchain.tools import Tool
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_cohere.react_multi_hop.agent import create_cohere_react_agent
from langchain_core.prompts import ChatPromptTemplate

import boto3
import pandas as pd

### Load environment variables
I created .env file where I stored different api keys (AWS and openai).

In [2]:
load_dotenv()

AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_REGION = os.getenv("AWS_REGION")

# Set the cohere API key
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
os.environ["COHERE_API_KEY"] = COHERE_API_KEY


### LLM
I will use chatgpt 3.5 turbo as llm for this tutorial, You can use another llm models.\
[List of llm models that you can use with langchain](https://python.langchain.com/docs/integrations/chat/#all-chat-models)

In [3]:
# Create the OpenAI chat model
chat = ChatCohere(model="command-r-plus", temperature=0.2)

### Data
I will use an S3 cluster data for this tutorial, I will upload data and then push it to postgres database.

##### Read S3

In [4]:
os.environ["AWS_ACCESS_KEY_ID"] = AWS_ACCESS_KEY_ID
os.environ["AWS_SECRET_ACCESS_KEY"]=AWS_SECRET_ACCESS_KEY
# os.environ["AWS_REGION"]= AWS_REGION

# Initialize S3 client
s3_client = boto3.client("s3")

# Function to list files in an S3 bucket
def list_s3_objects(bucket_name):
    response = s3_client.list_objects_v2(Bucket=bucket_name)
    if "Contents" in response:
        return [obj["Key"] for obj in response["Contents"]]
    return "No files found in the bucket."

# Function to read a file from S3
def read_s3_object(bucket_name, file_key):
    response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
    # content = response["Body"].read().decode("utf-8")
    content = pd.read_csv(response["Body"])
    return content


In [5]:
list_s3_objects("earthquakedb")

['data_etl.csv']

In [6]:
df = read_s3_object("earthquakedb", "data_etl.csv")
df.head(10)

Unnamed: 0.1,Unnamed: 0,Year,Local_Time,UTC_Time,Magnitude,Location,Depth,Latitude,Longitude,Event_type,Assessment,Agency
0,0,2025,Jan 26 14:48:47,Jan 26 13:48:47,1.3,Martigny VS,4.9,46.16,7.06,,automatic,SED
1,1,2025,Jan 26 09:36:00,Jan 26 08:36:00,1.2,Liestal BL,10.6,47.48,7.74,,automatic,SED
2,2,2025,Jan 26 05:53:30,Jan 26 04:53:30,1.3,Bosco/Gurin TI,5.0,46.3,8.34,,automatic,SED
3,3,2025,Jan 26 05:40:34,Jan 26 04:40:34,1.7,CHATEL-ST-DENIS FR,2.2,46.49,6.89,,automatic,SED
4,4,2025,Jan 25 23:06:29,Jan 25 22:06:29,1.6,Unterschaechen UR,8.5,46.9,8.8,,automatic,SED
5,5,2025,Jan 25 20:50:13,Jan 25 19:50:13,1.2,Aosta I,5.0,45.8,7.43,,automatic,SED
6,6,2025,Jan 25 20:47:20,Jan 25 19:47:20,1.1,Laufenburg,7.9,47.7,7.96,,automatic,SED
7,7,2025,Jan 25 15:23:28,Jan 25 14:23:28,0.3,Eglisau ZH,1.7,47.57,8.54,earthquake,manual,SED
8,8,2025,Jan 25 15:23:22,Jan 25 14:23:22,0.2,Eglisau ZH,1.7,47.57,8.54,earthquake,manual,SED
9,9,2025,Jan 24 16:09:32,Jan 24 15:09:32,1.3,Bourg-Saint-Pierre VS,6.9,45.91,7.04,earthquake,manual,SED


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8747 entries, 0 to 8746
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  8747 non-null   int64  
 1   Year        8747 non-null   int64  
 2   Local_Time  8747 non-null   object 
 3   UTC_Time    8747 non-null   object 
 4   Magnitude   8747 non-null   float64
 5   Location    8747 non-null   object 
 6   Depth       8747 non-null   float64
 7   Latitude    8747 non-null   float64
 8   Longitude   8747 non-null   float64
 9   Event_type  8740 non-null   object 
 10  Assessment  8747 non-null   object 
 11  Agency      8747 non-null   object 
dtypes: float64(4), int64(2), object(6)
memory usage: 820.2+ KB


##### Load S3 data into postgres table

#### Create database

In [8]:
import pandas as pd
from sqlalchemy import create_engine
import psycopg2

# PostgreSQL credentials
db_host = "localhost"
db_user = "postgres"
db_pass = "postgres"
db_port = "5432"  # Default PostgreSQL port
db_name = "llm_sql"

In [9]:
# Create database
try:
    conn = psycopg2.connect(
        dbname="postgres", user=db_user, password=db_pass, host=db_host, port=db_port
    )
    conn.autocommit = True
    cursor = conn.cursor()
    cursor.execute(f"CREATE DATABASE {db_name};")
    cursor.close()
    conn.close()
    print(f"Database '{db_name}' created successfully.")
except psycopg2.errors.DuplicateDatabase:
    print(f"Database '{db_name}' already exists.")
except Exception as e:
    print(f"Error creating database: {e}")


Database 'llm_sql' already exists.


##### Create table and push data

In [10]:
from sqlalchemy import create_engine, Column, Integer, String, Float, MetaData, Table, DateTime
from sqlalchemy.exc import SQLAlchemyError

# Create a PostgreSQL engine
engine = create_engine(f'postgresql://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}')

In [14]:
from sqlalchemy.exc import NoSuchTableError

metadata_obj = MetaData()
table_name = "earthquakes"

try:
    table = Table(table_name, metadata_obj, autoload_with=engine)
    table.drop(engine)
    print(f"Table '{table_name}' deleted successfully.")
except NoSuchTableError:
    print(f"Table '{table_name}' does not exist.")

Table 'earthquakes' does not exist.


In [15]:

# Sample DataFrame
data = read_s3_object("earthquakedb", "data_etl.csv")
data = data.rename(columns={"Unnamed: 0": "Index",
                            "Location": "location",
                            "Magnitude": "magnitude",
                            "Latitude": "latitude",
                            "Longitude": "longitude"})
data = data[["Index", "Year", "Local_Time", "magnitude", "location", "latitude", "longitude"]]
data = data[(data["magnitude"]>=2) & (data["Year"]>=2020)]

data["local_Time"] = pd.to_datetime(data["Year"].astype(str) + " " + data["Local_Time"],
                                    format="%Y %b %d %H:%M:%S")
data.drop(columns=["Year", "Local_Time"], inplace=True)

# Define your table schema
earthquake_table = Table(
    "earthquakes", metadata_obj,
    Column("Index", Integer, primary_key=True, autoincrement=True),
    Column("local_time", DateTime),
    Column("magnitude", Float),
    Column("location", String(255)),
    Column("latitude", Float),
    Column("longitude", Float)
)

# Create the table if it does not exist
try:
    metadata_obj.create_all(engine)
    print("Table created (if not exists).")
except SQLAlchemyError as e:
    print(f"Error creating table: {e}")

# Push the DataFrame into PostgreSQL (replace table if it exists)
data.to_sql('earthquakes', engine, if_exists='replace', index=False)

print("Data successfully pushed to PostgreSQL!")

Table created (if not exists).
Data successfully pushed to PostgreSQL!


##### Test
To test, I am retrieving first 10 records.

In [16]:
query = "SELECT * FROM earthquakes LIMIT 10;"
pd.read_sql(query, engine)

Unnamed: 0,Index,magnitude,location,latitude,longitude,local_Time
0,34,2.1,Sargans SG,47.08,9.51,2025-01-18 16:47:11
1,52,2.1,Ravensburg D,47.82,9.43,2025-01-15 05:24:59
2,53,2.3,Lausanne VD,46.51,6.74,2025-01-15 05:11:13
3,77,2.2,Santa Maria GR,46.66,10.52,2025-01-10 23:58:50
4,86,2.0,Chamonix F,45.9,7.01,2025-01-10 11:09:35
5,95,2.4,Freiburg im Breisgau D,48.0,7.91,2025-01-09 03:33:35
6,125,2.1,Samnaun GR,46.9,10.31,2025-01-06 15:23:19
7,127,2.3,Cevio TI,46.34,8.55,2025-01-06 09:56:49
8,131,2.2,Kreuzlingen TG,47.63,9.21,2025-01-05 16:54:55
9,136,2.0,Savognin GR,46.59,9.57,2025-01-03 15:29:42


### SQL Agent

In [17]:
from langchain_community.agent_toolkits import SQLDatabaseToolkit
from langchain_community.utilities import SQLDatabase
from langchain.agents import create_sql_agent, AgentExecutor
from langchain.agents.agent_types import AgentType

db = SQLDatabase.from_uri(f'postgresql://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}')

# Create a SQLDatabase Toolkit (this contains the necessary tools)
toolkit = SQLDatabaseToolkit(db=db, llm=chat)


In [20]:
tools = toolkit.get_tools()
print([tool.name for tool in tools])

['sql_db_query', 'sql_db_schema', 'sql_db_list_tables', 'sql_db_query_checker']


In [22]:
# Define prompt
# prompt = ChatPromptTemplate.from_template("{input}")
prompt = ChatPromptTemplate.from_template(
    "{input}\n\nPrevious Actions:\n{agent_scratchpad}"
)

# Create LangChain SQL agent
sql_agent = create_tool_calling_agent(
    llm=chat,
    tools=tools,
    prompt=prompt,
)

# Create agent executor
agent_executor = AgentExecutor(agent=sql_agent,
                               tools=tools,
                               verbose=False)

### Simple questions

- 10 first records

In [23]:
response1 = agent_executor.invoke({"input": "Display the first 10 records from the earthquakes table."})
print(response1['output'])

TooManyRequestsError: status_code: 429, body: {'message': "You are using a Trial key, which is limited to 10 API calls / minute. You can continue to use the Trial key for free or upgrade to a Production key with higher rate limits at 'https://dashboard.cohere.com/api-keys'. Contact us on 'https://discord.gg/XW44jPfYJu' or email us at support@cohere.com with any questions"}

- strongest earthquake recorded

In [50]:
response2 = agent_executor.invoke({"input": "Retrieve the record of the earthquake of the most magnitude"})
print(response2['output'])

The earthquake with the highest magnitude recorded in the database occurred in Mulhouse F with a magnitude of 4.7. The coordinates for the earthquake are 47.68 latitude and 7.47 longitude. The earthquake occurred on September 10, 2022, at 17:58:13.


In [52]:
response3 = agent_executor.invoke({"input": "when and where occurs the strongest earthquake's magnitude."})
print(response3['output'])

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

- Oldest record

In [21]:
response4 = agent_executor.invoke({"input": "when was the oldest record in earthquake table."})
print(response4['output'])

The oldest record in the "earthquakes" table is from May 28, 2019 at 10:48:05.


- Region with most earthquakes

In [22]:
response5 = agent_executor.invoke({"input": "In which swiss Location occurs the most earthquakes."})
print(response5['output'])

The Swiss location where the most earthquakes occur is Elm GL with a count of 42 earthquakes.


In [23]:
response6 = agent_executor.invoke({"input": "In which swiss canton occurs the most earthquakes"})
print(response6['output'])

The Swiss canton that experiences the most earthquakes is Elm GL with a total of 42 earthquakes.


Add new tool

In [38]:
import geopandas as gpd
from shapely.geometry import Point
from langchain.tools import Tool, StructuredTool
from langchain.agents import initialize_agent, AgentType
from pydantic import BaseModel

# Load the Swiss cantons Shapefile
# Ensure you provide the correct path to your Shapefile
shapefile_path = "shp/swissBOUNDARIES3D_1_5_TLM_KANTONSGEBIET.shp"
swiss_cantons = gpd.read_file(shapefile_path)

# Define input schema for tool
class GetCanton(BaseModel):
    Latitude: list[float]  # List of latitude values
    Longitude: list[float]  # List of longitude values

def assign_canton(data: GetCanton):
    """Assigns Swiss cantons to earthquake locations based on latitude & longitude."""
    df = pd.DataFrame({"Latitude": data.Latitude, "Longitude": data.Longitude})  # Ensure DataFrame format

    # Convert earthquake data into GeoDataFrame
    geometry = [Point(lon, lat) for lat, lon in zip(df["Longitude"], df["Latitude"])]
    earthquakes_gdf = gpd.GeoDataFrame(df, geometry=geometry, crs=swiss_cantons.crs)

    # Perform spatial join with canton boundaries
    merged = gpd.sjoin(earthquakes_gdf, swiss_cantons, how="left", predicate="intersects")

    # Count earthquakes per canton
    canton_counts = merged["canton_name"].value_counts()

    if canton_counts.empty:
        return "No earthquakes found in any canton."

    return f"The Swiss canton with the most earthquakes is {canton_counts.idxmax()} with {canton_counts.max()} occurrences."

# Define the tool for LangChain
canton_tool = StructuredTool.from_function(
    name="CantonLocator",
    func=assign_canton,
    description="Finds the Swiss canton where most earthquakes occur. Takes lists of latitude and longitude.",
    args_schema=GetCanton
)

In [39]:
tools.append(canton_tool)

In [40]:
tools

[QuerySQLDatabaseTool(description="Input to this tool is a detailed and correct SQL query, output is a result from the database. If the query is not correct, an error message will be returned. If an error is returned, rewrite the query, check the query, and try again. If you encounter an issue with Unknown column 'xxxx' in 'field list', use sql_db_schema to query the correct table fields.", db=<langchain_community.utilities.sql_database.SQLDatabase object at 0x14a25f880>),
 InfoSQLDatabaseTool(description='Input to this tool is a comma-separated list of tables, output is the schema and sample rows for those tables. Be sure that the tables actually exist by calling sql_db_list_tables first! Example Input: table1, table2, table3', db=<langchain_community.utilities.sql_database.SQLDatabase object at 0x14a25f880>),
 ListSQLDatabaseTool(db=<langchain_community.utilities.sql_database.SQLDatabase object at 0x14a25f880>),
 QuerySQLCheckerTool(description='Use this tool to double check if your 

In [41]:
# Create LangChain SQL agent
sql_agent = create_openai_tools_agent(
    llm=chat,
    tools=tools,
    prompt=prompt,
)

# Create agent executor
agent_executor = AgentExecutor(agent=sql_agent,
                               tools=tools,
                               verbose=True)

# --- RUN THE LLM AGENT ---
response7 = agent_executor.invoke({"input": "Which Swiss canton has the most earthquakes?"})
print(response7['output'])



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `sql_db_list_tables` with `{}`
responded: To answer this question, we first need to access the database that contains the earthquake data. We need to find the table that contains the earthquake data, then we can query the data to find out which Swiss canton has the most earthquakes. Let's start by listing all the tables in the database.

[0m[38;5;200m[1;3mearthquakes[0m[32;1m[1;3m
Invoking: `sql_db_schema` with `{'table_names': 'earthquakes'}`
responded: Great, we have found the 'earthquakes' table which likely contains the data we need. Now, let's check the schema of this table to understand its structure and the data it contains.

[0m[33;1m[1;3m
CREATE TABLE earthquakes (
	"Index" BIGINT, 
	"Local_Time" TIMESTAMP WITHOUT TIME ZONE, 
	"Magnitude" DOUBLE PRECISION, 
	"Location" TEXT, 
	"Latitude" DOUBLE PRECISION, 
	"Longitude" DOUBLE PRECISION
)

/*
3 rows from earthquakes table:
Index	Local_Time	Magnitud

RemoteProtocolError: peer closed connection without sending complete message body (incomplete chunked read)

In [61]:
from typing import Any

from langchain_core.messages import ToolMessage
from langchain_core.runnables import RunnableLambda, RunnableWithFallbacks
from langgraph.prebuilt import ToolNode


def create_tool_node_with_fallback(tools: list) -> RunnableWithFallbacks[Any, dict]:
    """
    Create a ToolNode with a fallback to handle errors and surface them to the agent.
    """
    return ToolNode(tools).with_fallbacks(
        [RunnableLambda(handle_tool_error)], exception_key="error"
    )


def handle_tool_error(state) -> dict:
    error = state.get("error")
    tool_calls = state["messages"][-1].tool_calls
    return {
        "messages": [
            ToolMessage(
                content=f"Error: {repr(error)}\n please fix your mistakes.",
                tool_call_id=tc["id"],
            )
            for tc in tool_calls
        ]
    }

ModuleNotFoundError: No module named 'langgraph'