In [14]:
import importlib
import importlib.util
import json
import os
import re
from typing import Dict, List, Optional

from model.load_model import description_agent

In [15]:
async def call_llm_agent(prompt: str) -> str:
    """
    Use the pydantic_ai agent to process the prompt and return a description.

    Args:
        prompt (str): Prompt text. generate_table_description builds it as
            "Table: <name>, Columns: ..., Sample Data: ...".

    Returns:
        str: The agent's output, or a generic fallback sentence when the
            agent call raises. The fallback never raises, even if the prompt
            does not contain a "Table: " marker.
    """
    try:
        # Run the agent with the prompt
        result = await description_agent.run(prompt)
        return result.output
    except Exception as e:
        print(f"Error calling LLM agent: {str(e)}")
        # Fallback to mock implementation. The table name is recovered from
        # the prompt; a prompt without the marker must not crash the
        # fallback path, so it gets a name-free sentence instead.
        segments = prompt.split("Table: ")
        if len(segments) < 2:
            return "This table stores and manages data with associated metadata and relationships."
        table_name = segments[1].split(",")[0]
        return f"This table stores and manages {table_name} related data with associated metadata and relationships."


In [16]:

async def generate_table_description(table_name: str, columns: Dict, sample_data: List[Dict]) -> str:
    """
    Build a prompt from the table's columns and samples and ask the LLM agent
    for one description covering the whole table.

    Args:
        table_name (str): Name of the table being described.
        columns (Dict): Mapping of column name to column type.
        sample_data (List[Dict]): Sample rows; only the first three are used.

    Returns:
        str: The generated description, or a generic sentence if the agent
            call raises.
    """
    # Column context, e.g. "id (int), name (text)"
    columns_str = ", ".join(
        f"{col_name} ({col_type})" for col_name, col_type in columns.items()
    )

    # From each of the first 3 samples, take the first non-empty descriptive field
    descriptive_fields = ('title', 'name', 'description', 'label')
    labels = []
    for row in sample_data[:3]:
        label = next(
            (row[key] for key in descriptive_fields if key in row and row[key]),
            None,
        )
        if label is not None:
            labels.append(str(label))

    if labels:
        sample_str = ", ".join(labels)
    else:
        sample_str = "No sample data available"

    prompt = f"Table: {table_name}, Columns: {columns_str}, Sample Data: {sample_str}"

    try:
        return await call_llm_agent(prompt)
    except Exception as e:
        print(f"Error generating description for {table_name}: {str(e)}")
        return f"This table stores and manages {table_name} related data with associated metadata and relationships."


In [17]:

def read_schema_from_file(file_path: str) -> Optional[Dict]:
    """
    Load the module-level ``schema`` variable from a Python file.

    The file is executed as a module to obtain ``schema``, so it should only
    be pointed at trusted files.

    Args:
        file_path (str): Path to the schema Python file

    Returns:
        Optional[Dict]: The ``schema`` object defined by the file, or None if
            the file cannot be loaded or does not define ``schema``. Errors
            are printed rather than raised.
    """
    try:
        spec = importlib.util.spec_from_file_location("schema_module", file_path)
        # spec_from_file_location returns None when no loader matches the
        # path (e.g. an unrecognised extension); report that explicitly
        # instead of failing later with an opaque AttributeError.
        if spec is None or spec.loader is None:
            print(f"Error loading schema from {file_path}: not a loadable Python source file")
            return None
        schema_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(schema_module)
        return schema_module.schema
    except Exception as e:
        print(f"Error loading schema from {file_path}: {e}")
        return None


In [18]:

def format_schema_for_file(schema: Dict, indent_level: int = 0) -> str:
    """
    Format schema dictionary as a properly indented Python dictionary string.

    Nested dictionaries are formatted recursively. Lists are written one item
    per line, with dict items formatted recursively and other items written
    with repr(). Keys and string values are emitted as double-quoted literals
    with quotes, backslashes, newlines and other control characters escaped,
    so the result can be evaluated back into the same strings.

    Args:
        schema (Dict): Dictionary to format. Keys are converted with str().
        indent_level (int): Nesting depth of this dictionary; each level
            indents by four spaces.

    Returns:
        str: Python source for the dictionary, or "{}" when it is empty.
    """
    if not schema:
        return "{}"

    indent = "    " * indent_level
    next_indent = "    " * (indent_level + 1)

    def quote(text) -> str:
        # json.dumps yields a double-quoted literal whose escape sequences
        # (\", \\, \n, \t, \uXXXX ...) are also valid in Python source.
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        return json.dumps(str(text), ensure_ascii=False)

    lines = ["{"]

    for key, value in schema.items():
        key_literal = quote(key)
        if isinstance(value, dict):
            # Handle nested dictionaries
            formatted_value = format_schema_for_file(value, indent_level + 1)
            lines.append(f'{next_indent}{key_literal}: {formatted_value},')
        elif isinstance(value, list):
            # Handle lists
            if not value:
                lines.append(f'{next_indent}{key_literal}: [],')
            else:
                lines.append(f'{next_indent}{key_literal}: [')
                for item in value:
                    if isinstance(item, dict):
                        formatted_item = format_schema_for_file(item, indent_level + 2)
                        lines.append(f'{next_indent}    {formatted_item},')
                    else:
                        lines.append(f'{next_indent}    {repr(item)},')
                lines.append(f'{next_indent}],')
        elif isinstance(value, str):
            # Handle strings - fully escaped so multi-line or backslash-laden
            # text (e.g. LLM descriptions) still produces valid source
            lines.append(f'{next_indent}{key_literal}: {quote(value)},')
        else:
            # Handle other types (int, float, bool, None)
            lines.append(f'{next_indent}{key_literal}: {repr(value)},')

    lines.append(f"{indent}}}")
    return "\n".join(lines)



In [19]:

def write_schema_to_file(file_path: str, schema: Dict):
    """
    Write the updated schema with descriptions to the schema Python file.

    The content is first written to a sibling "<file_path>.tmp" file and then
    moved over the target with os.replace, so an existing schema file is only
    replaced once the new content has been written completely. Errors are
    printed rather than raised.

    Args:
        file_path (str): Path to the schema Python file
        schema (Dict): Updated schema dictionary containing descriptions
    """
    tmp_path = f"{file_path}.tmp"
    try:
        # Format the schema dictionary as a Python-compatible string
        schema_str = format_schema_for_file(schema)

        # Create the complete file content
        file_content = f"schema = {schema_str}\n"

        # Write to the temporary file first; the old file stays untouched
        # until the swap below, so a failed write cannot destroy it.
        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.write(file_content)

        # Swap the finished file into place (overwrites any existing file)
        os.replace(tmp_path, file_path)

        print(f"New schema file with descriptions created at {file_path}")

    except Exception as e:
        print(f"Error writing schema to {file_path}: {str(e)}")
        # Best-effort cleanup of a partially written temporary file
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError as cleanup_error:
                print(f"Could not remove temporary file {tmp_path}: {cleanup_error}")


In [20]:
async def update_schema_with_descriptions(schema: Dict) -> Dict:
    """
    Return a new schema in which every table carries a generated description.

    Each entry of the result holds the table's "columns", its "sample" rows
    and a "description" produced by generate_table_description. Entries whose
    value is not a dict are reported and left out of the result.
    """
    described = {}

    for name, info in schema.items():
        print(f"Processing table: {name}")

        if isinstance(info, dict):
            cols = info.get("columns", {})
            rows = info.get("sample", [])
            # Rebuild the entry so it contains exactly these three keys
            described[name] = {
                "columns": cols,
                "sample": rows,
                "description": await generate_table_description(name, cols, rows),
            }
        else:
            # Not the expected structure for a table entry
            print(f"Skipping {name}: invalid table structure")

    return described



In [21]:
async def process_database_files(db_files: Optional[List[str]] = None):
    """
    Add generated table descriptions to each database schema file.

    For every existing file the schema is loaded, reduced to the tables that
    are dicts with a "columns" key (keeping only "columns" and "sample"),
    given a description per table, and written back to the same path. Missing
    files and files without a usable schema are reported and skipped.

    Args:
        db_files (Optional[List[str]]): Paths of the schema files to process.
            Defaults to the chinook and netflix schema files under ./schemas.
    """
    if db_files is None:
        # Default list of database files to process
        db_files = ["./schemas/chinook_db.py", "./schemas/netflix_db.py"]

    for db_file in db_files:
        if not os.path.exists(db_file):
            print(f"File {db_file} not found in current directory")
            continue

        print(f"\n{'='*50}")
        print(f"Processing {db_file}")
        print(f"{'='*50}")

        # Read schema from file
        schema = read_schema_from_file(db_file)

        if not schema:
            print(f"No schema found in {db_file}")
            continue

        # Clean the schema first to remove any malformed data; this also
        # drops any previous "description" so it is regenerated below
        cleaned_schema = {}
        for table_name, table_info in schema.items():
            if isinstance(table_info, dict) and "columns" in table_info:
                cleaned_schema[table_name] = {
                    "columns": table_info.get("columns", {}),
                    "sample": table_info.get("sample", [])
                }

        # Update schema with descriptions
        updated_schema = await update_schema_with_descriptions(cleaned_schema)

        # Write back to file
        write_schema_to_file(db_file, updated_schema)

        print(f"\nCompleted processing {db_file}")
        print(f"Tables processed: {list(updated_schema.keys())}")



In [None]:
if __name__ == "__main__":
    import asyncio

    def _announce_completion(finished_task=None):
        """Print the closing banner, or report why the task did not finish cleanly."""
        if finished_task is not None:
            if finished_task.cancelled():
                print("Schema processing was cancelled before completion")
                return
            error = finished_task.exception()
            if error is not None:
                print(f"Schema processing failed: {error}")
                return
        print("\n" + "="*50)
        print("All database schema files have been processed!")
        print("="*50)

    try:
        # Try to get the running loop (if in async context)
        loop = asyncio.get_running_loop()
    except RuntimeError:  # No running loop
        loop = None

    if loop:
        # If in async context (like Jupyter), the work is only scheduled here
        # and runs after this cell returns. The banner is therefore printed
        # from a done-callback rather than immediately. `task` keeps a
        # reference so the pending task is not garbage collected.
        task = loop.create_task(process_database_files())
        task.add_done_callback(_announce_completion)
    else:
        # If not in async context, use asyncio.run(), which blocks until done
        asyncio.run(process_database_files())
        _announce_completion()


All database schema files have been processed!



Processing ./schemas/chinook_db.py
Processing table: Album
Processing table: Artist
Processing table: Customer
Processing table: Employee
Processing table: Genre
Processing table: Invoice
Processing table: InvoiceLine
Processing table: MediaType
Processing table: Playlist
Processing table: PlaylistTrack
Processing table: Track
New schema file with descriptions created at ./schemas/chinook_db.py

Completed processing ./schemas/chinook_db.py
Tables processed: ['Album', 'Artist', 'Customer', 'Employee', 'Genre', 'Invoice', 'InvoiceLine', 'MediaType', 'Playlist', 'PlaylistTrack', 'Track']

Processing ./schemas/netflix_db.py
Processing table: movie
Processing table: season
Processing table: tv_show
Processing table: view_summary
New schema file with descriptions created at ./schemas/netflix_db.py

Completed processing ./schemas/netflix_db.py
Tables processed: ['movie', 'season', 'tv_show', 'view_summary']
