In [1]:
# !pip install -Uq "google-genai==1.7.0"

import duckdb
import pandas as pd
from google import genai
from google.genai import types
from google.api_core import retry
from IPython.display import Markdown, display
print(f"{genai.__version__=}")

# For use on Kaggle
# from kaggle_secrets import UserSecretsClient
# GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")

# For use locally
import os
from dotenv import load_dotenv
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
print(len(GOOGLE_API_KEY))

client = genai.Client(api_key=GOOGLE_API_KEY)

is_retriable = lambda e: (isinstance(e, genai.errors.APIError) and e.code in {429, 503})

if not hasattr(genai.models.Models.generate_content, '__wrapped__'):
  genai.models.Models.generate_content = retry.Retry(    
      predicate=is_retriable)(genai.models.Models.generate_content)

genai.__version__='1.11.0'
39


In [2]:
conn = duckdb.connect(database='../SEC/kaggle.db')


# Get all table names
tables = conn.execute("SELECT table_name FROM information_schema.tables WHERE table_schema = 'main'").fetchall()

# Initialize schema output
schema_output = ['DuckDB Database Schema:']

for (table_name,) in tables:
    schema_output.append(f"\nTable Name: '{table_name}'")
    schema = conn.execute(f"PRAGMA table_info('{table_name}')").fetchall()
    schema_output.append("Columns:")
    for column in schema:
        cid, name, dtype, notnull, pk, default = column
        schema_output.append(f"  {name} {dtype}{" NOTNULL," if notnull else ","}")

# Close the connection
# conn.close()

# Return the schema as a string
print("\n".join(schema_output))



DuckDB Database Schema:

Table Name: 'cal'
Columns:
  adsh VARCHAR NOTNULL,
  grp BIGINT NOTNULL,
  arc BIGINT NOTNULL,
  negative BIGINT,
  ptag VARCHAR,
  pversion VARCHAR,
  ctag VARCHAR,
  cversion VARCHAR,

Table Name: 'dim'
Columns:
  dimhash VARCHAR NOTNULL,
  segments VARCHAR,
  segt BIGINT,

Table Name: 'num'
Columns:
  adsh VARCHAR,
  tag VARCHAR,
  version VARCHAR,
  ddate DATE,
  qtrs BIGINT,
  uom VARCHAR,
  dimh VARCHAR,
  iprx BIGINT,
  value DOUBLE,
  footnote VARCHAR,
  footlen BIGINT,
  dimn BIGINT,
  coreg VARCHAR,
  durp DOUBLE,
  datp DOUBLE,
  dcml BIGINT,

Table Name: 'pre'
Columns:
  adsh VARCHAR NOTNULL,
  report BIGINT NOTNULL,
  line BIGINT NOTNULL,
  stmt VARCHAR,
  inpth BIGINT,
  tag VARCHAR,
  version VARCHAR,
  prole VARCHAR,
  plabel VARCHAR,
  negating BIGINT,

Table Name: 'ren'
Columns:
  adsh VARCHAR NOTNULL,
  report BIGINT NOTNULL,
  rfile VARCHAR,
  menucat VARCHAR,
  shortname VARCHAR,
  longname VARCHAR,
  roleuri VARCHAR,
  parentroleuri VARCHA

In [3]:
document_file = client.files.upload(file='fsnds.pdf')
request = 'Which table(s) have an adsh column?'

def summarise_doc(request: str) -> str:
  """Execute the request on the uploaded document."""
  # Set the temperature low to stabilise the output.
  config = types.GenerateContentConfig(temperature=0.0)
  response = client.models.generate_content(
      model='gemini-2.0-flash',
      config=config,
      contents=[request, schema_output],
  )

  return response.text

summary = summarise_doc(request)
Markdown(summary)

Based on the provided schema, the following tables have an `adsh` column:

*   `cal`
*   `num`
*   `pre`
*   `ren`
*   `sub`
*   `txt`

In [None]:
def execute_query(sql: str) -> list[list[str]]:
    """Execute an SQL statement, returning the results."""
    print(f' - DB CALL: execute_query({sql})')

    cursor = conn.cursor()

    cursor.execute(sql)
    return cursor.fetchall()


execute_query("select * from sub")

In [5]:
# These are the Python functions defined above.
db_tools = [execute_query]

instruction = """You are a helpful chatbot that can interact with a DuckDB SQL database. 
You will take the users questions and turn them into SQL queries using the tools available. 
Once you have the information you need, you will answer the user's question using the data returned.

Use execute_query to issue an SQL SELECT query."""


# Start a chat with automatic function calling enabled.
chat = client.chats.create(
    model="gemini-2.0-flash",
    config=types.GenerateContentConfig(
        system_instruction=instruction,
        tools=db_tools,
        contents=[schema_output],
    ),
)

ValidationError: 1 validation error for GenerateContentConfig
contents
  Extra inputs are not permitted [type=extra_forbidden, input_value=[['DuckDB Database Schema...,', '  value VARCHAR,']], input_type=list]
    For further information visit https://errors.pydantic.dev/2.10/v/extra_forbidden

In [None]:
resp = chat.send_message("What is the cheapest product?")
print(f"\n{resp.text}")

In [None]:
document_file = client.files.upload(file='fsnds.pdf')
request = 'How is the sub table connected to the cal table?'

def summarise_doc(request: str) -> str:
  """Execute the request on the uploaded document."""
  # Set the temperature low to stabilise the output.
  config = types.GenerateContentConfig(temperature=0.0)
  response = client.models.generate_content(
      model='gemini-2.0-flash',
      config=config,
      contents=[request, document_file],
  )

  return response.text

summary = summarise_doc(request)
Markdown(summary)

In this example, the model generated a textual justification that was set up in a chat context. This full text response is useful both for human interpretation and for giving the model a place to "collect notes" while it assesses the text and produces a final score. This "note taking" or "thinking" strategy typically works well with auto-regressive models, where the generated text is passed back into the model at each generation step. This means the working "notes" are used when generating final result output.

In the next turn, the model converts the text output into a structured response. If you want to aggregate scores or use them programatically then you want to avoid parsing the unstructured text output. Here the `SummaryRating` schema is passed, so the model converts the chat history into an instance of the `SummaryRating` enum.