In [1]:
# Interactive function to analyze any file and show extracted data + final description
from description_generator import DescriptionGenerator
from processors.factory import get_processor
from pathlib import Path
import json
from rich.console import Console
from rich.panel import Panel
from rich.syntax import Syntax

console = Console()

def analyze_file(file_path: str):
    """
    Analyze a file and display:
    1. The extracted data sample that was sent to the LLM
    2. The final description generated
    
    Args:
        file_path: Path to the file to analyze
    """
    console.print(f"\n[bold cyan]Analyzing file:[/bold cyan] {file_path}")
    console.print("=" * 80)
    
    try:
        # Initialize generator
        generator = DescriptionGenerator()
        
        # Get processor to extract data sample
        processor = get_processor(file_path)
        file_path_obj = Path(file_path)
        
        # Extract data sample (same as what description_generator uses internally)
        data_sample, data_types = generator._prepare_data_sample(file_path_obj, processor)
        
        # Display extracted data sample
        console.print("\n[bold green]1. Extracted Data Sample (sent to LLM):[/bold green]")
        console.print("-" * 80)
        
        # Format the data sample nicely
        if isinstance(data_sample, str):
            try:
                # Try to parse and pretty-print if it's JSON
                parsed = json.loads(data_sample)
                formatted_json = json.dumps(parsed, indent=2, ensure_ascii=False)
                syntax = Syntax(formatted_json, "json", theme="monokai", line_numbers=True)
                console.print(syntax)
            except:
                # If not JSON, just print as text
                console.print(Panel(data_sample[:2000], title="Data Sample", border_style="green"))
                if len(data_sample) > 2000:
                    console.print(f"[dim]... (truncated, total length: {len(data_sample)} characters)[/dim]")
        else:
            console.print(json.dumps(data_sample, indent=2, ensure_ascii=False))
        
        # Display data types
        if data_types:
            console.print("\n[bold yellow]Data Types Detected:[/bold yellow]")
            console.print("-" * 80)
            for key, dtype in data_types.items():
                console.print(f"  {key}: [cyan]{dtype}[/cyan]")
        
        # Generate final description
        console.print("\n[bold blue]2. Generating Description...[/bold blue]")
        console.print("-" * 80)
        
        result = generator.generate(file_path)
        
        # Display final description
        console.print("\n[bold green]3. Final Description Generated:[/bold green]")
        console.print("-" * 80)
        
        # Pretty print the result - use print to avoid truncation
        result_json = json.dumps(result, indent=2, ensure_ascii=False)
        
        # Print full JSON without truncation
        print("\nFull JSON Output:")
        print("=" * 80)
        print(result_json)
        
        # Also show with rich syntax (may be truncated in display)
        console.print("\n[dim]Formatted view (may be truncated):[/dim]")
        syntax = Syntax(result_json, "json", theme="monokai", line_numbers=True, word_wrap=True)
        console.print(syntax)
        
        # Summary
        console.print("\n[bold magenta]Summary:[/bold magenta]")
        console.print("-" * 80)
        console.print(f"  File: [cyan]{result['filename']}[/cyan]")
        if 'file' in result:
            console.print(f"  File Name: [cyan]{result['file'].get('name', 'N/A')}[/cyan]")
            console.print(f"  File Description: {result['file'].get('description', 'N/A')}")
        console.print(f"  Columns: [cyan]{len(result['columns'])}[/cyan]")
        console.print("\n  Column Details:")
        for col in result['columns']:
            console.print(f"    â€¢ [bold]{col['name']}[/bold] ({col.get('data_type', 'unknown')})")
            # Show full description without truncation
            desc = col.get('description', 'N/A')
            if len(desc) > 100:
                console.print(f"      Description: {desc[:100]}...")
                console.print(f"      Full Description: [dim]{desc}[/dim]")
            else:
                console.print(f"      Description: {desc}")
            console.print(f"      Example: [dim]{col.get('example', 'N/A')}[/dim]")
        
        return result
        
    except Exception as e:
        console.print(f"\n[bold red]Error:[/bold red] {str(e)}")
        import traceback
        console.print(Panel(traceback.format_exc(), title="Traceback", border_style="red"))
        raise

# Example usage:
# analyze_file("../data/synthetic_heterogeneous_pack_scaled/matters_A.csv")
# analyze_file("../data/synthetic_heterogeneous_pack_scaled/structured_clients_B.json")


In [3]:
# Example: Analyze a CSV file
analyze_file("../data/synthetic_heterogeneous_pack_scaled/billing_entries_A.csv")



Full JSON Output:
{
  "filename": "billing_entries_A.csv",
  "file_path": "../data/synthetic_heterogeneous_pack_scaled/billing_entries_A.csv",
  "file": {
    "name": "Billing Entries",
    "description": "A dataset containing information about billing entries for legal services provided by attorneys. Each record represents a single billing entry with details about the associated legal matter, attorney, hours worked, rate, and description of the service provided."
  },
  "columns": [
    {
      "name": "entry_id",
      "description": "Unique identifier for the billing entry",
      "example": "BL-692060",
      "similar_keywords": [
        "billing_code",
        "entry_code",
        "invoice_id"
      ],
      "data_type": "str"
    },
    {
      "name": "file_id",
      "description": "Identifier for the legal matter or case associated with the billing entry",
      "example": "MAT-1001",
      "similar_keywords": [
        "matter_code",
        "case_id",
        "file_number

{'filename': 'billing_entries_A.csv',
 'file_path': '../data/synthetic_heterogeneous_pack_scaled/billing_entries_A.csv',
 'file': {'name': 'Billing Entries',
  'description': 'A dataset containing information about billing entries for legal services provided by attorneys. Each record represents a single billing entry with details about the associated legal matter, attorney, hours worked, rate, and description of the service provided.'},
 'columns': [{'name': 'entry_id',
   'description': 'Unique identifier for the billing entry',
   'example': 'BL-692060',
   'similar_keywords': ['billing_code', 'entry_code', 'invoice_id'],
   'data_type': 'str'},
  {'name': 'file_id',
   'description': 'Identifier for the legal matter or case associated with the billing entry',
   'example': 'MAT-1001',
   'similar_keywords': ['matter_code', 'case_id', 'file_number'],
   'data_type': 'str'},
  {'name': 'att_id',
   'description': 'Identifier for the attorney who provided the service',
   'example': 'A

In [None]:
# Find all CSV, JSON, and XLSX files in the scaled dataset and analyze them
from pathlib import Path
import json

# Base directory
data_dir = Path("../data/synthetic_heterogeneous_pack_scaled")

# Find all CSV, JSON, and XLSX files
file_extensions = ['.csv', '.json', '.xlsx', '.xls']
all_files = []

for ext in file_extensions:
    all_files.extend(list(data_dir.rglob(f"*{ext}")))

# Filter out files in excluded directories
exclude_dirs = ['documents', 'filings', 'regulations', 'billing_files', 'descriptions']
filtered_files = [
    f for f in all_files 
    if not any(excluded_dir in f.parts for excluded_dir in exclude_dirs)
]

# Sort files for consistent ordering
filtered_files = sorted(filtered_files)

print(f"Found {len(filtered_files)} files to analyze:")
for f in filtered_files:
    print(f"  - {f.relative_to(data_dir)}")

# Store all results
all_results = []

# Analyze each file
for file_path in filtered_files:
    print(f"\n{'='*80}")
    print(f"Processing: {file_path.relative_to(data_dir)}")
    print(f"{'='*80}")
    
    try:
        result = analyze_file(str(file_path))
        all_results.append({
            'file_path': str(file_path),
            'relative_path': str(file_path.relative_to(data_dir)),
            'status': 'success',
            'result': result
        })
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        all_results.append({
            'file_path': str(file_path),
            'relative_path': str(file_path.relative_to(data_dir)),
            'status': 'error',
            'error': str(e),
            'error_type': type(e).__name__
        })

# Create summary structure
results_summary = {
    'total_files': len(filtered_files),
    'successful': len([r for r in all_results if r['status'] == 'success']),
    'errors': len([r for r in all_results if r['status'] == 'error']),
    'files': all_results
}

print(f"\n{'='*80}")
print("SUMMARY")
print(f"{'='*80}")
print(f"Total files processed: {results_summary['total_files']}")
print(f"Successful: {results_summary['successful']}")
print(f"Errors: {results_summary['errors']}")

# Store in a variable for later use
analyzed_files_results = results_summary


In [6]:
# Save results to JSON file and show how to access them
import json
from datetime import datetime

# Save to JSON file
output_file = data_dir / "all_file_analyses.json"
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(analyzed_files_results, f, indent=2, ensure_ascii=False, default=str)

print(f"Results saved to: {output_file}")

# Show how to access the results
print("\n" + "="*80)
print("How to access results:")
print("="*80)
print("\n1. Access all results:")
print("   analyzed_files_results['files']")
print(f"\n2. Access successful results only:")
print("   [f for f in analyzed_files_results['files'] if f['status'] == 'success']")
print(f"\n3. Access a specific file's description:")
print("   analyzed_files_results['files'][0]['result']['columns']")
print(f"\n4. Get summary stats:")
print(f"   Total: {analyzed_files_results['total_files']}")
print(f"   Successful: {analyzed_files_results['successful']}")
print(f"   Errors: {analyzed_files_results['errors']}")

# Show first successful result as example
successful_results = [f for f in analyzed_files_results['files'] if f['status'] == 'success']
if successful_results:
    print(f"\n5. Example - First successful file:")
    first = successful_results[0]
    print(f"   File: {first['relative_path']}")
    print(f"   Columns: {len(first['result']['columns'])}")
    print(f"   Column names: {[c['name'] for c in first['result']['columns']]}")


Results saved to: ../data/synthetic_heterogeneous_pack_scaled/all_file_analyses.json

How to access results:

1. Access all results:
   analyzed_files_results['files']

2. Access successful results only:
   [f for f in analyzed_files_results['files'] if f['status'] == 'success']

3. Access a specific file's description:
   analyzed_files_results['files'][0]['result']['columns']

4. Get summary stats:
   Total: 7
   Successful: 7
   Errors: 0

5. Example - First successful file:
   File: billing_entries_A.csv
   Columns: 8
   Column names: ['entry_id', 'file_id', 'att_id', 'hours', 'rate', 'amount', 'description', 'entry_date']
