In [11]:
import os
import json
import tqdm
import pandas as pd
from typing import List
from dotenv import load_dotenv
from sqlalchemy import create_engine
from pydantic import BaseModel, ValidationError
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import PydanticOutputParser
import time

# -------------------------
# 1️⃣ Define Pydantic schema
# -------------------------
class ColumnSchema(BaseModel):
    column_name: str
    description: str
    datatype: str
    sample_values: List[str]

class TableSchema(BaseModel):
    table_description: str
    columns: List[ColumnSchema]

# -------------------------
# 2️⃣ Datatype normalization
# -------------------------
def normalize_dtype(dtype: str) -> str:
    dtype = dtype.lower()
    if "int" in dtype:
        return "INTEGER"
    if "float" in dtype or "double" in dtype:
        return "FLOAT"
    if "datetime" in dtype or "date" in dtype:
        return "TIMESTAMP"
    return "STRING"

# -------------------------
# 3️⃣ DB connection
# -------------------------
load_dotenv()
engine = create_engine(
    f"mysql+mysqlconnector://{os.getenv('DB_USER')}:{os.getenv('DB_PASSWORD')}"
    f"@{os.getenv('DB_HOST')}:{os.getenv('DB_PORT')}/{os.getenv('DB_NAME')}"
)

try:
    with engine.connect() as conn:
        print("MySQL connection successful!")
except Exception as e:
    print("Connection failed:", e)

# -------------------------
# 4️⃣ Read SQL sample safely
# -------------------------
def read_sql_sample(table, n=5):
    query = f"SELECT * FROM {table} LIMIT {n};"  # safe and fast
    df = pd.read_sql(query, con=engine)
    sample_json = df.to_dict(orient="records")
    column_types = {col: normalize_dtype(str(dtype)) for col, dtype in zip(df.columns, df.dtypes)}
    return df, sample_json, column_types

# -------------------------
# 5️⃣ LLM + parser setup
# -------------------------
model = ChatOpenAI(
    model="gpt-5.2",
    temperature=0.2,
    max_tokens=1000,
    timeout=30
)

parser = PydanticOutputParser(pydantic_object=TableSchema)
format_instructions = parser.get_format_instructions()

template = ChatPromptTemplate.from_messages([
    ("system", """
You are an intelligent data annotator. Annotate SQL tables with detailed column-level descriptions.

Rules:
- Output MUST be valid JSON and follow this schema EXACTLY:
{{format_instructions}}

- Do NOT use 'name', 'type', or 'examples'
- All values in 'sample_values' must be strings
- Do NOT add markdown, commentary, or extra text
- Use the exact keys: table_description, columns, column_name, datatype, sample_values
"""),
    ("human", """
SQL table description:
{description}

Columns in the table:
{column_names}

Column data types:
{column_types}

Sample rows:
{data_sample}
""")
])

chain = template | model | parser

# -------------------------
# 6️⃣ Retry logic
# -------------------------
def invoke_with_retry(payload, retries=2, delay=1):
    for attempt in range(retries):
        try:
            result = chain.invoke(payload)
            return result  # ✅ success
        except (ValidationError, Exception) as e:
            print(f"Attempt {attempt+1} failed: {e}")
            if attempt < retries - 1:
                time.sleep(delay)
            else:
                raise e

# -------------------------
# 7️⃣ Knowledge base loop
# -------------------------
kb_final = {}

# Example table_description dict (replace with your actual table descriptions)
table_description = {
    'customer': "Contains customer information.",
    'orders': "Contains order information.",
    'order_items': "Contains item-level order information.",
    'order_payments': "Contains payment details.",
    'order_reviews': "Contains customer reviews.",
    'products': "Contains product information.",
    'product_category_translation': "Maps product categories from Portuguese to English.",
    'sellers': "Contains seller information."
}

for table_name, description in tqdm.tqdm(table_description.items()):
    # skip cached tables
    if table_name in kb_final:
        continue

    df, sample_json, column_types = read_sql_sample(table_name, n=5)

    # Ensure all sample values are strings
    for row in sample_json[:2]:  # only first 1-2 rows
        for k, v in row.items():
            row[k] = str(v)

    payload = {
        "description": description,
        "column_names": list(column_types.keys()),
        "column_types": json.dumps(column_types, ensure_ascii=False),
        "data_sample": json.dumps(sample_json[:2], ensure_ascii=False)  # 1-2 rows
    }

    try:
        result = invoke_with_retry(payload)
        kb_final[table_name] = result.dict()  # store as dict
    except Exception as e:
        print(f"Failed for table: {table_name}")
        kb_final[table_name] = None

# -------------------------
# 8️⃣ Save KB to disk
# -------------------------
with open("kb_final.json", "w", encoding="utf-8") as f:
    json.dump(kb_final, f, indent=2, ensure_ascii=False)

print("Knowledge base generation complete!")


MySQL connection successful!


  0%|                                                     | 0/8 [00:00<?, ?it/s]

Attempt 1 failed: Failed to parse TableSchema from completion {"table_description": "Contains customer identity and location reference data, including internal and unique customer identifiers plus basic geographic attributes such as ZIP code prefix, city, and state.", "columns": [{"column_name": "customer_id", "datatype": "STRING", "sample_values": ["06b8999e2fba1a1fbc88172c00ba8bc7", "18955e83d337fd6b2def6b18a428ac77"]}, {"column_name": "customer_unique_id", "datatype": "STRING", "sample_values": ["861eff4711a542e4b93843c6dd7febb0", "290c77bc529b7ac935b93aa66c333dc3"]}, {"column_name": "customer_zip_code_prefix", "datatype": "INTEGER", "sample_values": ["14409", "9790"]}, {"column_name": "customer_city", "datatype": "STRING", "sample_values": ["franca", "sao bernardo do campo"]}, {"column_name": "customer_state", "datatype": "STRING", "sample_values": ["SP", "SP"]}]}. Got: 5 validation errors for TableSchema
columns.0.description
  Field required [type=missing, input_value={'column_na

 12%|█████▋                                       | 1/8 [00:08<01:00,  8.68s/it]

Attempt 2 failed: Failed to parse TableSchema from completion {"table_description": "Stores customer identity and location details, including identifiers and address-related fields used for shipping, segmentation, and geographic analysis.", "columns": [{"column_name": "customer_id", "datatype": "STRING", "sample_values": ["06b8999e2fba1a1fbc88172c00ba8bc7", "18955e83d337fd6b2def6b18a428ac77"]}, {"column_name": "customer_unique_id", "datatype": "STRING", "sample_values": ["861eff4711a542e4b93843c6dd7febb0", "290c77bc529b7ac935b93aa66c333dc3"]}, {"column_name": "customer_zip_code_prefix", "datatype": "INTEGER", "sample_values": ["14409", "9790"]}, {"column_name": "customer_city", "datatype": "STRING", "sample_values": ["franca", "sao bernardo do campo"]}, {"column_name": "customer_state", "datatype": "STRING", "sample_values": ["SP", "SP"]}]}. Got: 5 validation errors for TableSchema
columns.0.description
  Field required [type=missing, input_value={'column_name': 'customer...7fd6b2def6b

 25%|███████████▎                                 | 2/8 [00:17<00:53,  8.96s/it]

Attempt 2 failed: Failed to parse TableSchema from completion {"table_description": "Stores line-item level details for customer orders, including the product and seller for each item, the deadline for the seller to ship, and the item price and freight (shipping) cost amounts.", "columns": [{"column_name": "order_id", "datatype": "STRING", "sample_values": ["00010242fe8c5a6d1ba2dd792cb16214", "00018f77f2f0320c557190d7a144bdd3"]}, {"column_name": "order_item_id", "datatype": "INTEGER", "sample_values": ["1", "1"]}, {"column_name": "product_id", "datatype": "STRING", "sample_values": ["4244733e06e7ecb4970a6e2683c13e61", "e5f2d52b802189ee658865ca93d83a8f"]}, {"column_name": "seller_id", "datatype": "STRING", "sample_values": ["48436dade18ac8b2bce089ec2a041202", "dd7ddc04e1b6c2c614352b383efe2d36"]}, {"column_name": "shipping_limit_date", "datatype": "STRING", "sample_values": ["2017-09-19 09:45:35", "2017-05-03 11:05:13"]}, {"column_name": "price", "datatype": "FLOAT", "sample_values": ["5

 38%|████████████████▉                            | 3/8 [00:28<00:48,  9.76s/it]

Attempt 2 failed: Failed to parse TableSchema from completion {"table_description": "Item-level details for customer orders, where each row represents a specific product line within an order, including the seller fulfilling the item, the deadline for shipping, and the item price and shipping (freight) cost.", "columns": [{"column_name": "order_id", "datatype": "STRING", "sample_values": ["00010242fe8c5a6d1ba2dd792cb16214", "00018f77f2f0320c557190d7a144bdd3"]}, {"column_name": "order_item_id", "datatype": "INTEGER", "sample_values": ["1", "1"]}, {"column_name": "product_id", "datatype": "STRING", "sample_values": ["4244733e06e7ecb4970a6e2683c13e61", "e5f2d52b802189ee658865ca93d83a8f"]}, {"column_name": "seller_id", "datatype": "STRING", "sample_values": ["48436dade18ac8b2bce089ec2a041202", "dd7ddc04e1b6c2c614352b383efe2d36"]}, {"column_name": "shipping_limit_date", "datatype": "STRING", "sample_values": ["2017-09-19 09:45:35", "2017-05-03 11:05:13"]}, {"column_name": "price", "datatype"

 50%|██████████████████████▌                      | 4/8 [00:36<00:35,  8.94s/it]

Attempt 2 failed: Failed to parse TableSchema from completion {"table_description": "Stores payment records associated with customer orders, including the payment method, sequence of payments for an order, installment count, and the monetary amount paid.", "columns": [{"column_name": "order_id", "datatype": "STRING", "sample_values": ["b81ef226f3fe1789b1e8b2acac839d17", "a9810da82917af2d9aefd1278f1dcfa0"]}, {"column_name": "payment_sequential", "datatype": "INTEGER", "sample_values": ["1", "1"]}, {"column_name": "payment_type", "datatype": "STRING", "sample_values": ["credit_card", "credit_card"]}, {"column_name": "payment_installments", "datatype": "INTEGER", "sample_values": ["8", "1"]}, {"column_name": "payment_value", "datatype": "FLOAT", "sample_values": ["99.33", "24.39"]}]}. Got: 5 validation errors for TableSchema
columns.0.description
  Field required [type=missing, input_value={'column_name': 'order_id...7af2d9aefd1278f1dcfa0']}, input_type=dict]
    For further information v

/var/folders/nk/f8v02jbn1899p80jhdggf92m0000gq/T/ipykernel_7176/2056146298.py:161: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  kb_final[table_name] = result.dict()  # store as dict
 62%|████████████████████████████▏                | 5/8 [00:46<00:28,  9.45s/it]

Attempt 1 failed: Failed to parse TableSchema from completion {"table_description": "Contains product-level master data, including category, text length metrics for the product name and description, photo count, and physical package measurements (weight and dimensions).", "columns": [{"column_name": "product_id", "datatype": "STRING", "sample_values": ["1e9e8ef04dbcff4541ed26657ea517e5", "3aa071139cb16b67ca9e5dea641aaa2f"], "column_description": "Unique identifier for a product, typically represented as a hashed or UUID-like string used to join with other product-related tables."}, {"column_name": "product_category_name", "datatype": "STRING", "sample_values": ["perfumaria", "artes"], "column_description": "Product category label describing the type of product (e.g., perfumery, arts), used for grouping, filtering, and category-level analysis."}, {"column_name": "product_name_lenght", "datatype": "FLOAT", "sample_values": ["40.0", "44.0"], "column_description": "Length of the product na

 75%|█████████████████████████████████▊           | 6/8 [01:03<00:24, 12.07s/it]

Attempt 2 failed: Failed to parse TableSchema from completion {"table_description": "Contains core product catalog attributes, including category, text-length metadata for name and description, number of product photos, and physical dimensions/weight used for logistics and shipping calculations.", "columns": [{"column_name": "product_id", "datatype": "STRING", "sample_values": ["1e9e8ef04dbcff4541ed26657ea517e5", "3aa071139cb16b67ca9e5dea641aaa2f"], "column_description": "Unique identifier for a product in the catalog, typically a hashed/string key used to join with orders, listings, and inventory tables."}, {"column_name": "product_category_name", "datatype": "STRING", "sample_values": ["perfumaria", "artes"], "column_description": "Product category label (often in Portuguese) indicating the catalog grouping for the item (e.g., perfume, arts), used for navigation, reporting, and category-based analysis."}, {"column_name": "product_name_lenght", "datatype": "FLOAT", "sample_values": ["

 88%|███████████████████████████████████████▍     | 7/8 [01:09<00:10, 10.07s/it]

Attempt 2 failed: Failed to parse TableSchema from completion {"table_description": "Lookup table that maps product category identifiers in Portuguese to their corresponding standardized category identifiers in English for consistent reporting, analysis, and localization.", "columns": [{"column_name": "product_category_name", "datatype": "STRING", "sample_values": ["beleza_saude", "informatica_acessorios"]}, {"column_name": "product_category_name_english", "datatype": "STRING", "sample_values": ["health_beauty", "computers_accessories"]}]}. Got: 2 validation errors for TableSchema
columns.0.description
  Field required [type=missing, input_value={'column_name': 'product_...nformatica_acessorios']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
columns.1.description
  Field required [type=missing, input_value={'column_name': 'product_...computers_accessories']}, input_type=dict]
    For further information visit https://errors.pydantic.dev

100%|█████████████████████████████████████████████| 8/8 [01:17<00:00,  9.72s/it]

Attempt 2 failed: Failed to parse TableSchema from completion {"table_description": "Contains seller information, including a unique seller identifier and the seller's location details such as ZIP code prefix, city, and state.", "columns": [{"column_name": "seller_id", "datatype": "STRING", "sample_values": ["3442f8959a84dea7ee197c632cb2df15", "d1b65fc7debc3361ea86b5f14c68d2e2"]}, {"column_name": "seller_zip_code_prefix", "datatype": "INTEGER", "sample_values": ["13023", "13844"]}, {"column_name": "seller_city", "datatype": "STRING", "sample_values": ["campinas", "mogi guacu"]}, {"column_name": "seller_state", "datatype": "STRING", "sample_values": ["SP", "SP"]}]}. Got: 4 validation errors for TableSchema
columns.0.description
  Field required [type=missing, input_value={'column_name': 'seller_i...c3361ea86b5f14c68d2e2']}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.12/v/missing
columns.1.description
  Field required [type=missing, input_value={'col




In [2]:
class ColumnSchema(BaseModel):
    column_name: str
    description: str
    datatype: str
    sample_values: List[str]

class TableSchema(BaseModel):
    table_description: str
    columns: List[ColumnSchema]

In [3]:
table_description = {

'customer' : '''Contains customer information.
Columns:
- customer_id: unique identifier for each customer
- customer_unique_id: another unique identifier for each customer
- customer_zip_code_prefix: postal code prefix of the customer
- customer_city: city of the customer
- customer_state: state of the customer''',

'orders'  : '''Contains order information.
Columns:
- order_id: unique identifier for each order
- order_item_id: sequential number of items in the order
- product_id: foreign key referencing products.product_id
- seller_id: foreign key referencing sellers.seller_id
- shipping_limit_date: latest date the seller can hand over the item to the logistics partner
- price: price of the item in the order
- freight_value: freight cost for the item in the order''',

'order_items'  : '''Contains item-level information for each order.
Columns:
- order_id: foreign key referencing orders.order_id
- order_item_id: sequential number of the item in the order
- product_id: foreign key referencing products.product_id
- seller_id: foreign key referencing sellers.seller_id
- shipping_limit_date: latest date for the seller to hand over the item to logistics
- price: price of the item
- freight_value: freight cost of the item''',

'order_payments'  : '''Contains payment details for each order.
Columns:
- order_id: foreign key referencing orders.order_id
- payment_sequential: sequence number of the payment for an order, used if the customer pays with multiple methods
- payment_type: method of payment used
- payment_installments: number of installments chosen for this payment
- payment_value: value of this specific payment transaction''',

'order_reviews'  : '''Contains customer reviews of orders.
Columns:
- review_id: unique identifier for each review
- order_id: foreign key referencing orders.order_id
- review_score: numeric score given by the customer
- review_comment_title: short title or summary of the review provided by the customer
- review_comment_message: full text comment provided by the customer
- review_creation_date: date when the review was submitted by the customer
- review_answer_timestamp: date when the seller or platform responded to the review (if any)''',

'products'  : '''Contains product information.
Columns:
- product_id: unique identifier for each product
- product_category_name: category of the product in Brazilian Portuguese
- product_name_lenght: length of the product name in characters
- product_description_lenght: length of the product description in characters
- product_photos_qty: number of photos available for the product
- product_weight_g: weight of the product in grams
- product_length_cm: product length in centimeters
- product_height_cm: product height in centimeters
- product_width_cm: product width in centimeters''',

'product_category_translation' : '''Maps product categories from Brazilian Portuguese to English.
Columns:
- product_category_name: category name in Portuguese
- product_category_name_english: translated category name''',

'sellers'  : '''Contains seller information.
Columns:
- seller_id: unique identifier for each seller
- seller_zip_code_prefix: postal code prefix of the seller's location
- seller_city: city of the seller
- seller_state: state of the seller''',

}

In [4]:
# Load environment variables from .env file
load_dotenv()

# Read credentials from environment variables
host = os.getenv("DB_HOST")
port = os.getenv("DB_PORT")
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")
database = os.getenv("DB_NAME")

# Create SQLAlchemy engine
engine = create_engine(
    f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{database}"
)

try:
    with engine.connect() as conn:
        print("MySQL connection successful!")
except Exception as e:
    print("Connection failed:", e)


def read_sql_sample(table, n=5):
    """
    Returns:
    - df_sample: Pandas DataFrame with n random rows
    - sample_json: list of dicts (JSON style)
    - column_types: dict {column_name: datatype}
    """
    query = f"SELECT * FROM {table} LIMIT {n};"
    df = pd.read_sql(query, con=engine)

    sample_json = df.to_dict(orient="records")
    column_types = {
        col: str(dtype)
        for col, dtype in zip(df.columns, df.dtypes)
    }

    return df, sample_json, column_types

MySQL connection successful!


In [5]:
model = ChatOpenAI(
    model="gpt-5.2",
    temperature=0.2,
    max_tokens=1000,
    timeout=30
)

parser = PydanticOutputParser(pydantic_object=TableSchema)
format_instructions = parser.get_format_instructions()

template = ChatPromptTemplate.from_messages([
("system", """
You are an intelligent data annotator. Annotate SQL tables with detailed column-level descriptions.
Use the sample data and column types provided to generate JSON output.
Rules:
- Do NOT invent columns
- Use ONLY the columns provided
- Output MUST be valid JSON
- No markdown, no explanations

{{format_instructions}}
"""),
("human", """
SQL table description:
{description}

Sample rows from the table:
{data_sample}

Column data types:
{column_types}
""")
])


# chain = (
#     RunnableMap({
#         "description": lambda x: x["description"],
#         "data_sample": lambda x: x["data_sample"],
#         "column_types": lambda x: x["column_types"]
#     })
#     | template
#     | model
#     | StrOutputParser()
# )

chain = template | model | parser

In [6]:
def invoke_with_retry(payload, retries=2):
    for attempt in range(retries):
        try:
            return chain.invoke(payload)
        except (ValidationError, Exception) as e:
            if attempt == retries - 1:
                raise e


In [7]:
kb_final = {}

for table_name, description in tqdm.tqdm(table_description.items()):
    if table_name in kb_final:
        continue

    df, sample_json, column_types = read_sql_sample(table_name, n=5)

    payload = {
        "description": description,
        "column_names": list(column_types.keys()),
        "column_types": json.dumps(column_types, ensure_ascii=False),
        "data_sample": json.dumps(sample_json[:2], ensure_ascii=False)
    }

    try:
        kb_final[table_name] = invoke_with_retry(payload).dict()
    except Exception as e:
        print(f"Failed for table: {table_name}")
        print(e)
        kb_final[table_name] = None


 12%|█████▋                                       | 1/8 [00:10<01:13, 10.48s/it]

Failed for table: customer
Failed to parse TableSchema from completion {"table_name": "customers", "table_description": "Contains customer information.", "columns": [{"column_name": "customer_id", "data_type": "object", "description": "Primary customer identifier used to represent a customer record in the dataset; appears to be a hashed/opaque string and is unique per row.", "sample_values": ["06b8999e2fba1a1fbc88172c00ba8bc7", "18955e83d337fd6b2def6b18a428ac77"]}, {"column_name": "customer_unique_id", "data_type": "object", "description": "Alternate unique identifier for the customer, likely representing the underlying person/entity across records; stored as an opaque string.", "sample_values": ["861eff4711a542e4b93843c6dd7febb0", "290c77bc529b7ac935b93aa66c333dc3"]}, {"column_name": "customer_zip_code_prefix", "data_type": "int64", "description": "Numeric postal code prefix for the customer's address location (e.g., first digits of the ZIP/postal code) used for coarse geographic grou

 25%|███████████▎                                 | 2/8 [00:28<01:30, 15.12s/it]

Failed for table: orders
Failed to parse TableSchema from completion {"table_name": "order_items", "table_description": "Line-item level order information, with one row per product sold within an order, including seller, shipping deadline, item price, and freight cost.", "columns": [{"name": "order_id", "type": "object", "description": "Unique identifier for the order that this line item belongs to; joins to the orders table primary key.", "semantic_type": "identifier", "examples": ["00010242fe8c5a6d1ba2dd792cb16214", "00018f77f2f0320c557190d7a144bdd3"], "notes": "Typically a non-human-readable string/UUID-like key. Not necessarily unique in this table because an order can have multiple items."}, {"name": "order_item_id", "type": "int64", "description": "Sequential line-item number within an order (e.g., 1 for the first item, 2 for the second).", "semantic_type": "sequence_number", "examples": [1], "notes": "Uniqueness is expected when combined with order_id (composite key order_id + o

 38%|████████████████▉                            | 3/8 [00:45<01:19, 15.94s/it]

Failed for table: order_items
Failed to parse TableSchema from completion {"table_name": "order_items", "table_description": "Contains item-level information for each order, including product, seller, shipping deadline, item price, and freight cost.", "columns": [{"name": "order_id", "type": "object", "description": "Unique identifier of the order this item belongs to; foreign key referencing orders.order_id.", "semantic_type": "identifier", "is_nullable": false, "is_primary_key": false, "is_foreign_key": true, "references": {"table": "orders", "column": "order_id"}, "examples": ["00010242fe8c5a6d1ba2dd792cb16214", "00018f77f2f0320c557190d7a144bdd3"]}, {"name": "order_item_id", "type": "int64", "description": "Sequential number of the item within the order (e.g., 1 for the first item in the order).", "semantic_type": "sequence_number", "is_nullable": false, "is_primary_key": false, "is_foreign_key": false, "examples": [1]}, {"name": "product_id", "type": "object", "description": "Uniqu

 50%|██████████████████████▌                      | 4/8 [01:00<01:02, 15.55s/it]

Failed for table: order_payments
Failed to parse TableSchema from completion {"table_name": "order_payments", "table_description": "Contains payment details for each order, including potential multiple payment records per order when a customer uses more than one payment method or transaction.", "columns": [{"name": "order_id", "type": "object", "description": "Unique identifier of the order this payment record belongs to; foreign key referencing orders.order_id. Multiple rows may share the same order_id when an order is paid via multiple payments.", "constraints_and_notes": {"key_relationships": "Foreign key to orders.order_id", "nullability": "Expected non-null", "example_values": ["b81ef226f3fe1789b1e8b2acac839d17", "a9810da82917af2d9aefd1278f1dcfa0"]}}, {"name": "payment_sequential", "type": "int64", "description": "Sequence number of the payment within an order, starting at 1, used to order multiple payment records for the same order_id (e.g., split payments across methods or trans

 62%|████████████████████████████▏                | 5/8 [01:17<00:48, 16.06s/it]

Failed for table: order_reviews
Failed to parse TableSchema from completion {"table_name": "order_reviews", "table_description": "Contains customer reviews of orders, including rating score, optional textual feedback, submission date, and any response timestamp.", "columns": [{"column_name": "review_id", "data_type": "object", "description": "Unique identifier for each review record (typically a hashed/string ID). Primary key for the reviews table.", "sample_values": ["7bc2406110b926393aa56f80a40eba40", "80e641a11e56f04c1ad469d5645fdfde"], "nullable": false}, {"column_name": "order_id", "data_type": "object", "description": "Identifier of the order being reviewed; foreign key referencing orders.order_id. Links each review to the corresponding purchase/order.", "sample_values": ["73fc7af87114b39712e6da79b0a377eb", "a548910a1c6147796b98fdf73dbeba33"], "nullable": false}, {"column_name": "review_score", "data_type": "int64", "description": "Numeric rating score provided by the customer fo

 75%|█████████████████████████████████▊           | 6/8 [01:33<00:32, 16.14s/it]

Failed for table: products
Failed to parse TableSchema from completion {"table_name": "products", "table_description": "Contains product information, including category, text field lengths, photo count, weight, and physical dimensions.", "columns": [{"name": "product_id", "type": "object", "description": "Unique identifier for each product record (alphanumeric string, typically a hashed/UUID-like value).", "semantic_type": "identifier", "nullable": false, "examples": ["1e9e8ef04dbcff4541ed26657ea517e5", "3aa071139cb16b67ca9e5dea641aaa2f"]}, {"name": "product_category_name", "type": "object", "description": "Product category name in Brazilian Portuguese.", "semantic_type": "category", "nullable": true, "examples": ["perfumaria", "artes"]}, {"name": "product_name_lenght", "type": "float64", "description": "Length of the product name text in characters (stored as a numeric value).", "semantic_type": "text_length", "nullable": true, "unit": "characters", "examples": [40.0, 44.0]}, {"name":

 88%|███████████████████████████████████████▍     | 7/8 [01:40<00:12, 12.88s/it]

Failed for table: product_category_translation
Failed to parse TableSchema from completion {"table": "product_category_name_translation", "description": "Lookup table that maps product category names from Brazilian Portuguese to their English equivalents for normalization, reporting, and cross-language analytics.", "columns": [{"name": "product_category_name", "type": "object", "description": "Product category name in Brazilian Portuguese, typically formatted as lowercase tokens separated by underscores (e.g., \"beleza_saude\"). Acts as the source-language key for the translation mapping.", "examples": ["beleza_saude", "informatica_acessorios"]}, {"name": "product_category_name_english", "type": "object", "description": "English translation of the corresponding Brazilian Portuguese product category name, typically formatted as lowercase tokens separated by underscores (e.g., \"health_beauty\"). Used as the target-language label for the category.", "examples": ["health_beauty", "compute

100%|█████████████████████████████████████████████| 8/8 [01:51<00:00, 13.95s/it]

Failed for table: sellers
Failed to parse TableSchema from completion {"table_name": "sellers", "table_description": "Contains seller information.", "columns": [{"column_name": "seller_id", "data_type": "object", "description": "Unique identifier for each seller; appears to be a hashed/string ID used to reference the seller across datasets.", "examples": ["3442f8959a84dea7ee197c632cb2df15", "d1b65fc7debc3361ea86b5f14c68d2e2"]}, {"column_name": "seller_zip_code_prefix", "data_type": "int64", "description": "Numeric postal code prefix indicating the seller's location (e.g., leading digits of the full ZIP/postal code).", "examples": [13023, 13844]}, {"column_name": "seller_city", "data_type": "object", "description": "City name where the seller is located, stored as lowercase text in the sample.", "examples": ["campinas", "mogi guacu"]}, {"column_name": "seller_state", "data_type": "object", "description": "State/region abbreviation for the seller's location (e.g., Brazilian state code)."




In [8]:
kb_final

{'customer': None,
 'orders': None,
 'order_items': None,
 'order_payments': None,
 'order_reviews': None,
 'products': None,
 'product_category_translation': None,
 'sellers': None}

In [9]:
with open('kb.pkl', 'wb') as f:
    pickle.dump(kb_final, f)

NameError: name 'pickle' is not defined