In [27]:
import os
import getpass
from sqlalchemy import create_engine, text
from langchain.chat_models import init_chat_model
from langchain.agents import Tool, AgentExecutor
from langchain.prompts import PromptTemplate

In [28]:
linkml_schema= """id: chinook_schema
name: Chinook
description: "LinkML schema for the Chinook database, extracted from ChinookData.json"
default_prefix: chinook
prefixes:
  chinook: "http://example.org/chinook/"
  linkml: "https://w3id.org/linkml/"

types:
  string:
    base: str
  integer:
    base: int
  number:
    base: float

enums:
  # No enums identified from the data provided

classes:
  Genre:
    description: "Extrahiert aus: ChinookData.json"
    slots:
      - genre_id
      - name
  MediaType:
    description: "Extrahiert aus: ChinookData.json"
    slots:
      - media_type_id
      - name
  Artist:
    description: "Extrahiert aus: ChinookData.json"
    slots:
      - artist_id
      - name
  Album:
    description: "Extrahiert aus: ChinookData.json"
    slots:
      - album_id
      - title
      - artist_id
  Track:
    description: "Extrahiert aus: ChinookData.json"
    slots:
      - track_id
      - name
      - composer
      - milliseconds
      - bytes
      - unit_price
      - album_id
      - media_type_id
      - genre_id
  Employee:
    description: "Extrahiert aus: ChinookData.json"
    slots:
      - employee_id
      - last_name
      - first_name
      - title
      - birth_date
      - hire_date
      - address
      - city
      - state
      - country
      - postal_code
      - phone
      - fax
      - email
      - reports_to
  Customer:
    description: "Extrahiert aus: ChinookData.json"
    slots:
      - customer_id
      - company
      - last_name
      - first_name
      - email
      - phone
      - address
      - city
      - state
      - country
      - postal_code
      - fax
      - support_rep_id
  Invoice:
    description: "Extrahiert aus: ChinookData.json"
    slots:
      - invoice_id
      - customer_id
      - invoice_date
      - billing_address
      - billing_city
      - billing_state
      - billing_country
      - billing_postal_code
      - total
  InvoiceLine:
    description: "Extrahiert aus: ChinookData.json"
    slots:
      - invoice_line_id
      - invoice_id
      - track_id
      - unit_price
      - quantity
  Playlist:
    description: "Extrahiert aus: ChinookData.json"
    slots:
      - playlist_id
      - name
  PlaylistTrack:
    description: "Extrahiert aus: ChinookData.json"
    slots:
      - playlist_id
      - track_id

slots:
  genre_id:
    description: "Extrahiert aus: ChinookData.json"
    identifier: true
    range: integer
  name:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  media_type_id:
    description: "Extrahiert aus: ChinookData.json"
    identifier: true
    range: integer
  artist_id:
    description: "Extrahiert aus: ChinookData.json"
    identifier: true
    range: integer
  album_id:
    description: "Extrahiert aus: ChinookData.json"
    identifier: true
    range: integer
  title:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  track_id:
    description: "Extrahiert aus: ChinookData.json"
    identifier: true
    range: integer
  composer:
    description: "Extrahiert aus: ChinookData.json"
    range: string
    required: false
  milliseconds:
    description: "Extrahiert aus: ChinookData.json"
    range: integer
  bytes:
    description: "Extrahiert aus: ChinookData.json"
    range: integer
  unit_price:
    description: "Extrahiert aus: ChinookData.json"
    range: number
  employee_id:
    description: "Extrahiert aus: ChinookData.json"
    identifier: true
    range: integer
  last_name:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  first_name:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  title:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  birth_date:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  hire_date:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  address:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  city:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  state:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  country:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  postal_code:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  phone:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  fax:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  email:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  reports_to:
    description: "Extrahiert aus: ChinookData.json"
    range: integer
    required: false
  customer_id:
    description: "Extrahiert aus: ChinookData.json"
    identifier: true
    range: integer
  company:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  support_rep_id:
    description: "Extrahiert aus: ChinookData.json"
    range: integer
  invoice_id:
    description: "Extrahiert aus: ChinookData.json"
    identifier: true
    range: integer
  invoice_date:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  billing_address:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  billing_city:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  billing_state:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  billing_country:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  billing_postal_code:
    description: "Extrahiert aus: ChinookData.json"
    range: string
  total:
    description: "Extrahiert aus: ChinookData.json"
    range: number
  invoice_line_id:
    description: "Extrahiert aus: ChinookData.json"
    identifier: true
    range: integer
  track_id:
    description: "Extrahiert aus: ChinookData.json"
    range: integer
  quantity:
    description: "Extrahiert aus: ChinookData.json"
    range: integer
  playlist_id:
    description: "Extrahiert aus: ChinookData.json"
    identifier: true
    range: integer"""



In [29]:
system_prompt = PromptTemplate.from_template("""
You are an experienced Data Engineer who exclusively uses SQL to load CSV files into a PostgreSQL database.

You are only allowed to use SQL statements executed through the SQL tool. The SQL tool can run SQL commands in the database and return their results.

**Behave according to the following rules:**

1. Start the entire process with `BEGIN;` to open a transaction.
2. Use PostgreSQL’s `COPY FROM` command to load CSV files from the `temp/` directory.
3. Use `SELECT` statements to check if tables already exist.
4. If a table does not exist, create it using `CREATE TABLE`, based on the LinkML schema, including all columns, data types, and foreign key relationships.
5. Strictly maintain referential integrity:
   - Load parent tables first, then child tables.
   - Avoid inserting rows with invalid foreign key references.
6. Avoid inserting duplicate records:
   - Use techniques like `INSERT INTO ... SELECT ... WHERE NOT EXISTS (...)` or temporary tables.
7. Update the `domain_knowledge` table with the name and a meaningful description of each table you create or modify.
8. Once **all steps are successfully completed and the data integrity has been validated**, execute `COMMIT;`.
9. If an error occurs during the process or you detect inconsistencies, execute `ROLLBACK;` instead.
10. Never return full tables as output – show a maximum of 5 rows per query as a preview.
11. **After each table is successfully loaded and validated**, insert a record into the `domain_knowledge` table using the following format:
```sql
INSERT INTO domain_knowledge (Table_Name, Description)
VALUES (
  'name_of_table',
  'This table stores ... (a detailed description including the meaning of columns, primary/foreign keys, the role of this table in the data model, relationships to other tables, cardinalities, etc.).'
);


**Important:** Use only PostGRESQL and the SQL tool – no other tools or programming languages are allowed.

Here is the LinkML schema of your data model, describing all tables, columns, and relationships:

{linkml_schema}

Now begin planning and executing your steps.
""")


In [30]:
import psycopg2
from sqlalchemy.pool import StaticPool
from langchain_community.agent_toolkits.sql.toolkit import SQLDatabaseToolkit
from langchain_community.utilities.sql_database import SQLDatabase
from langgraph.prebuilt import create_react_agent

# OpenAI API Key abfragen falls nicht gesetzt
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

# DB-Verbindung
db_uri = "postgresql+psycopg2://langchain_user:supersecurepassword@localhost:5432/langchain_db"
engine = create_engine(db_uri)

def get_engine_for_postgres_db():
    # Connection URL for SQLAlchemy using the psycopg2 PostgreSQL driver
    url = "postgresql+psycopg2://langchain_user:supersecurepassword@localhost:5432/langchain_db"

    # Function that creates a new connection to the PostgreSQL database
    def connect():
        return psycopg2.connect(
            dbname="langchain_db",   
            user="langchain_user",            
            password="supersecurepassword",        
            host="localhost",        
            port=5432               
        )

    # Creates a SQLAlchemy engine with:
    # - the connection URL
    # - a creator function that opens new psycopg2 connections as needed (instead of using connection pooling)
    # - StaticPool, which keeps the same connection open and does not manage a real pool
    engine = create_engine(
        url,
        creator=connect,
        poolclass=StaticPool
    )
    return engine

engine = get_engine_for_postgres_db()

tool_description = "- SQL: Führt SQL Statements in der Datenbank aus und gibt das Ergebnis zurück."

db = SQLDatabase(engine)


llm = init_chat_model("gpt-4o", model_provider="openai")
toolkit = SQLDatabaseToolkit(db=db, llm=llm)

from langgraph.prebuilt import create_react_agent


# LLM initialisieren

system_prompt = system_prompt.format(linkml_schema=linkml_schema)
agent_executor = create_react_agent(llm, toolkit.get_tools(), prompt=system_prompt)

user_message = (
    "Lade mir alle CSV-Dateien aus dem Verzeichnis 'temp' in die Datenbank. "
    "Beachte die Relationen im LinkML-Schema. Prüfe Tabellen, wahre referenzielle Integrität, "
    "vermeide Duplikate, und aktualisiere 'domain_knowledge'."
)

# 5. Agent mit Streaming verwenden
events = agent_executor.stream(
    {"messages": [("user", user_message)]},
    stream_mode="values",
)

# 6. Ausgabe iterieren
for event in events:
    print(event)
    event["messages"][-1].pretty_print()

{'messages': [HumanMessage(content="Lade mir alle CSV-Dateien aus dem Verzeichnis 'temp' in die Datenbank. Beachte die Relationen im LinkML-Schema. Prüfe Tabellen, wahre referenzielle Integrität, vermeide Duplikate, und aktualisiere 'domain_knowledge'.", additional_kwargs={}, response_metadata={}, id='f61d1b21-2da9-4065-a9b0-bab8460f3613')]}

Lade mir alle CSV-Dateien aus dem Verzeichnis 'temp' in die Datenbank. Beachte die Relationen im LinkML-Schema. Prüfe Tabellen, wahre referenzielle Integrität, vermeide Duplikate, und aktualisiere 'domain_knowledge'.
{'messages': [HumanMessage(content="Lade mir alle CSV-Dateien aus dem Verzeichnis 'temp' in die Datenbank. Beachte die Relationen im LinkML-Schema. Prüfe Tabellen, wahre referenzielle Integrität, vermeide Duplikate, und aktualisiere 'domain_knowledge'.", additional_kwargs={}, response_metadata={}, id='f61d1b21-2da9-4065-a9b0-bab8460f3613'), AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_etJytPdcksyNt8V5p9x5BKxh',

KeyboardInterrupt: 