In [3]:
# !pip install -Uq "google-genai==1.7.0"

import duckdb
import pandas as pd
from google import genai
from google.genai import types
from google.api_core import retry
from IPython.display import Markdown, display
print(f"{genai.__version__=}")

# For use on Kaggle
# from kaggle_secrets import UserSecretsClient
# GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")

# For use locally
import os
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    # os.getenv returns None when the variable is missing; fail here with an
    # actionable message instead of an opaque TypeError from len(None) below.
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; add it to your .env file or environment."
    )
# Sanity check: show only the key's length, never the key itself.
print(len(GOOGLE_API_KEY))

client = genai.Client(api_key=GOOGLE_API_KEY)


def is_retriable(e):
    """Return True when `e` is a genai APIError with status code 429 or 503."""
    return isinstance(e, genai.errors.APIError) and e.code in {429, 503}


# Only wrap when generate_content has no `__wrapped__` attribute, i.e.
# (presumably) it has not already been wrapped by an earlier run of this cell.
if not hasattr(genai.models.Models.generate_content, '__wrapped__'):
  genai.models.Models.generate_content = retry.Retry(
      predicate=is_retriable)(genai.models.Models.generate_content)

genai.__version__='1.11.0'
39


In [None]:
# Print every table in the 'main' schema together with its columns.
conn = duckdb.connect(database='../SEC/db_all.duckdb')
try:
    # Get all table names
    tables = conn.execute(
        "SELECT table_name FROM information_schema.tables WHERE table_schema = 'main'"
    ).fetchall()

    # Initialize schema output
    schema_output = ['Database Schema:']

    # NOTE: `table_name` stays bound after this loop; the name is kept
    # unchanged in case later cells read it.
    for (table_name,) in tables:
        # Quote the identifier (doubling embedded quotes) so table names that
        # are reserved words or contain special characters still work.
        quoted_name = '"' + table_name.replace('"', '""') + '"'
        # Get the schema for each table
        schema = conn.execute(f"DESCRIBE {quoted_name}").fetchall()
        schema_output.append(f"\nTable: {table_name}")
        schema_output.append("Columns:")
        # Only the first three fields of each DESCRIBE row are used here.
        for col_name, col_type, null, *_ in schema:
            schema_output.append(f"  {col_name} {col_type} {'NOT NULL' if null == 'NO' else 'NULL'}")
finally:
    # Close the connection even when one of the queries above raises.
    conn.close()

# Show the collected schema as a single string
print("\n".join(schema_output))




Table: cal
Columns:
  adsh VARCHAR NULL
  grp BIGINT NULL
  arc BIGINT NULL
  negative BIGINT NULL
  ptag VARCHAR NULL
  pversion VARCHAR NULL
  ctag VARCHAR NULL
  cversion VARCHAR NULL

Table: dim
Columns:
  dimhash VARCHAR NULL
  segments VARCHAR NULL
  segt BIGINT NULL

Table: num
Columns:
  adsh VARCHAR NULL
  tag VARCHAR NULL
  version VARCHAR NULL
  ddate BIGINT NULL
  qtrs BIGINT NULL
  uom VARCHAR NULL
  dimh VARCHAR NULL
  iprx BIGINT NULL
  value DOUBLE NULL
  footnote VARCHAR NULL
  footlen BIGINT NULL
  dimn BIGINT NULL
  coreg VARCHAR NULL
  durp DOUBLE NULL
  datp DOUBLE NULL
  dcml BIGINT NULL

Table: pre
Columns:
  adsh VARCHAR NULL
  report BIGINT NULL
  line BIGINT NULL
  stmt VARCHAR NULL
  inpth BIGINT NULL
  tag VARCHAR NULL
  version VARCHAR NULL
  prole VARCHAR NULL
  plabel VARCHAR NULL
  negating BIGINT NULL

Table: ren
Columns:
  adsh VARCHAR NULL
  report BIGINT NULL
  rfile VARCHAR NULL
  menucat VARCHAR NULL
  shortname VARCHAR NULL
  longname VARCHAR NUL

In [11]:
# Table to inspect. Set explicitly so this cell does not depend on a loop
# variable left over from an earlier cell.
table_name = 'txt'

conn = duckdb.connect(database='../SEC/db_all.duckdb')
try:
  schema_info = conn.execute(f"PRAGMA table_info('{table_name}')").fetchall()
finally:
  # Release the connection whether or not the query succeeded.
  conn.close()

if not schema_info:
  print(f"Table '{table_name}' not found.")
else:
  print(f"Schema for table: {table_name}")
  print("-" * (16 + len(table_name)))
  print(f"{'Column ID':<10} {'Name':<20} {'Type':<15} {'NotNull':<8} {'PrimaryKey':<12}")
  print("-" * 70)
  for column in schema_info:
    # table_info rows are (cid, name, type, notnull, dflt_value, pk):
    # the default value comes before the primary-key flag.
    cid, name, dtype, notnull, _default, pk = column
    print(f"{cid:<10} {name:<20} {dtype:<15} {bool(notnull):<8} {bool(pk):<12}")
  # Closing rule, same width as the header rule above.
  print("-" * 70)

Schema for table: txt
-------------------
Column ID  Name                 Type            NotNull  PrimaryKey  
----------------------------------------------------------------------
0          adsh                 VARCHAR         0        0           
1          tag                  VARCHAR         0        0           
2          version              VARCHAR         0        0           
3          ddate                BIGINT          0        0           
4          qtrs                 BIGINT          0        0           
5          iprx                 BIGINT          0        0           
6          lang                 VARCHAR         0        0           
7          dcml                 BIGINT          0        0           
8          durp                 DOUBLE          0        0           
9          datp                 DOUBLE          0        0           
10         dimh                 VARCHAR         0        0           
11         dimn                 BIGINT         

In [None]:
# Upload the local PDF through the Gemini client; the returned file handle is
# passed to the model together with the question in summarise_doc below.
document_file = client.files.upload(file='fsnds.pdf')
# Question to ask about the uploaded document.
request = 'How is the sub table connected to the cal table?'

def summarise_doc(request: str) -> str:
  """Answer `request` using the uploaded document.

  Sends the request text together with the module-level `document_file`
  to the 'gemini-2.0-flash' model through the module-level `client`, and
  returns the `text` attribute of the response.
  """
  answer = client.models.generate_content(
      model='gemini-2.0-flash',
      # A temperature of 0.0 is used to keep the output stable.
      config=types.GenerateContentConfig(temperature=0.0),
      contents=[request, document_file],
  )
  return answer.text

# Run the question defined above against the uploaded document.
summary = summarise_doc(request)
# Last expression of the cell, so the notebook displays it; wrapping the
# text in IPython's Markdown object gives it a rich Markdown display.
Markdown(summary)

In this example, the model generated a textual justification that was set up in a chat context. This full text response is useful both for human interpretation and for giving the model a place to "collect notes" while it assesses the text and produces a final score. This "note taking" or "thinking" strategy typically works well with auto-regressive models, where the generated text is passed back into the model at each generation step. This means the working "notes" are used when generating the final result.

In the next turn, the model converts the text output into a structured response. If you want to aggregate scores or use them programmatically, you should avoid parsing the unstructured text output. Here the `SummaryRating` schema is passed, so the model converts the chat history into an instance of the `SummaryRating` enum.