In [1]:
import os
import re
from typing import Dict, List
from model.load_model import description_agent
import importlib


async def call_llm_agent(prompt: str) -> str:
    """
    Use the pydantic_ai agent to process the prompt and return a description.
    """
    try:
        # Run the agent with the prompt
        result = await description_agent.run(prompt)
        return result.data
    except Exception as e:
        print(f"Error calling LLM agent: {str(e)}")
        # Fallback to mock implementation
        table_name = prompt.split("Table: ")[1].split(",")[0]
        return f"This table stores and manages {table_name} related data with associated metadata and relationships."

async def generate_table_description(table_name: str, columns: List[Dict], sample_data: List[Dict]) -> str:
    """
    Generate a single description for the entire table using LLM agent.
    """
    # Create column info string for context
    column_info = []
    for column in columns:
        column_name = list(column.keys())[0]
        column_type = column[column_name]
        column_info.append(f"{column_name} ({column_type})")
    
    columns_str = ", ".join(column_info)
    
    # Create sample data string for context
    sample_titles = []
    for sample in sample_data[:3]:  # Use first 3 samples for context
        # Try to find a descriptive field (title, name, etc.)
        for field in ['title', 'name', 'description', 'label']:
            if field in sample and sample[field]:
                sample_titles.append(str(sample[field]))
                break
    
    sample_str = ", ".join(sample_titles) if sample_titles else "No sample data available"
    
    prompt = f"Table: {table_name}, Columns: {columns_str}, Sample Data: {sample_str}"
    
    try:
        description = await call_llm_agent(prompt)
    except Exception as e:
        print(f"Error generating description for {table_name}: {str(e)}")
        description = f"This table stores and manages {table_name} related data with associated metadata and relationships."
    
    return description

def read_schema_from_file(file_path: str) -> Dict:
    """
    Load schema from a Python file.
    
    Args:
        file_path (str): Path to the schema Python file
    
    Returns:
        dict: Schema dictionary or None if error
    """
    try:
        spec = importlib.util.spec_from_file_location("schema_module", file_path)
        schema_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(schema_module)
        return schema_module.schema
    except Exception as e:
        print(f"Error loading schema from {file_path}: {e}")
        return None
    
def write_schema_to_file(file_path: str, schema: Dict):
    """
    Write the updated schema back to the Python file
    """
    try:
        # Read the original file content
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        
        # Convert schema to a properly formatted string
        schema_str = format_schema_for_file(schema)
        
        # Replace the schema definition
        updated_content = re.sub(
            r'schema\s*=\s*{.*?}',
            f'schema = {schema_str}',
            content,
            flags=re.DOTALL
        )
        
        # Write back to file
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(updated_content)
            
        print(f"Updated schema saved to {file_path}")
        
    except Exception as e:
        print(f"Error writing schema to {file_path}: {str(e)}")

def format_schema_for_file(schema: Dict) -> str:
    """
    Format schema dictionary as a properly indented Python dictionary string
    """
    def format_value(value, indent=0):
        spaces = "    " * indent
        if isinstance(value, dict):
            if not value:
                return "{}"
            lines = ["{"]
            for k, v in value.items():
                formatted_v = format_value(v, indent + 1)
                if isinstance(v, str):
                    lines.append(f'{spaces}    "{k}": "{formatted_v}",')
                else:
                    lines.append(f'{spaces}    "{k}": {formatted_v},')
            lines.append(f"{spaces}}}")
            return "\n".join(lines)
        elif isinstance(value, list):
            if not value:
                return "[]"
            lines = ["["]
            for item in value:
                formatted_item = format_value(item, indent + 1)
                lines.append(f"{spaces}    {formatted_item},")
            lines.append(f"{spaces}]")
            return "\n".join(lines)
        elif isinstance(value, str):
            return value
        else:
            return repr(value)
    
    return format_value(schema)

async def update_schema_with_descriptions(schema: Dict) -> Dict:
    """
    Update the schema dictionary to include a single description for each table.
    """
    updated_schema = {}
    
    for table_name, table_info in schema.items():
        print(f"Processing table: {table_name}")
        
        # Generate description for this table
        table_description = await generate_table_description(
            table_name, 
            table_info["columns"], 
            table_info["sample"]
        )
        
        # Create updated table structure with original column format
        columns = {}
        for column in table_info["columns"]:
            column_name = list(column.keys())[0]
            column_type = column[column_name]
            columns[column_name] = column_type
        
        updated_schema[table_name] = {
            "columns": columns,
            "sample": table_info["sample"],
            "description": table_description
        }
    
    return updated_schema

async def process_database_files():
    """
    Process all database schema files in the current directory
    """
    # List of database files to process
    db_files = ["./schemas/chinook_db.py", "./schemas/netflix_db.py"]
    
    for db_file in db_files:
        if os.path.exists(db_file):
            print(f"\n{'='*50}")
            print(f"Processing {db_file}")
            print(f"{'='*50}")
            
            # Read schema from file
            schema = read_schema_from_file(db_file)
            
            if schema:
                # Update schema with descriptions
                updated_schema = await update_schema_with_descriptions(schema)
                
                # Write back to file
                write_schema_to_file(db_file, updated_schema)
                
                print(f"\nCompleted processing {db_file}")
                print(f"Tables processed: {list(updated_schema.keys())}")
            else:
                print(f"No schema found in {db_file}")
        else:
            print(f"File {db_file} not found in current directory")

if __name__ == "__main__":
    import asyncio
    
    try:
        # Try to get the running loop (if in async context)
        loop = asyncio.get_running_loop()
    except RuntimeError:  # No running loop
        loop = None
    
    if loop:
        # If in async context (like Jupyter), create a task
        task = loop.create_task(process_database_files())
        # In Jupyter you might want to await the task:
        # await task  # (uncomment if in async context)
    else:
        # If not in async context, use asyncio.run()
        asyncio.run(process_database_files())
    
    print("\n" + "="*50)
    print("All database schema files have been processed!")
    print("="*50)


All database schema files have been processed!



Processing ./schemas/chinook_db.py
Processing table: Album
Error calling LLM agent: status_code: 401, model_name: gpt-4.1, body: {'code': '401', 'message': 'Access denied due to invalid subscription key or wrong API endpoint. Make sure to provide a valid key for an active subscription and use a correct regional API endpoint for your resource.'}
Processing table: Artist
Error calling LLM agent: status_code: 401, model_name: gpt-4.1, body: {'code': '401', 'message': 'Access denied due to invalid subscription key or wrong API endpoint. Make sure to provide a valid key for an active subscription and use a correct regional API endpoint for your resource.'}
Processing table: Customer
Error calling LLM agent: status_code: 401, model_name: gpt-4.1, body: {'code': '401', 'message': 'Access denied due to invalid subscription key or wrong API endpoint. Make sure to provide a valid key for an active subscription and use a correct regional API endpoint for your resource.'}
Processing table: Employ