In [7]:
import pandas as pd

# Load the "matters_A" CSV from the synthetic data pack into a DataFrame;
# later cells take their sample rows from `df`.
df = pd.read_csv(
    "../data/synthetic_heterogeneous_pack_scaled/matters_A.csv"
)


In [8]:
# Serialize the DataFrame's head (pandas default: first five rows) as a
# pretty-printed JSON array with one object per row.
json_head = df.head().to_json(indent=2, orient="records")


In [None]:
from llm.unified_client import get_llm_client

# Client object used by the later `llm_client.generate(...)` call.
# NOTE(review): provider/model selection happens inside get_llm_client(), which
# is not visible in this notebook — confirm its configuration source.
llm_client = get_llm_client()

# System prompt for the column-description task. It instructs the model to
# return ONLY a JSON array with one object per column, each carrying the keys
# `name`, `description`, `example` and `similar_keywords`. The text is sent
# verbatim, so any edit here changes model behavior.
system_prompt = """You are an experienced data engineer specializing in dataset analysis and documentation.

Your task is to analyze a given dataset and create comprehensive column descriptions that accurately represent what each column contains.

**Your Responsibilities:**
1. Analyze each column in the provided dataset
2. Understand the data type, format, and semantic meaning of each column
3. Create clear, concise, and accurate descriptions
4. Identify realistic examples from the data
5. Suggest similar keywords that might be used to refer to the same concept

**Output Requirements:**
- Your response MUST be valid JSON format only
- Do NOT include any explanatory text, markdown formatting, or code blocks
- Return a JSON array where each object represents one column
- Each column object must contain exactly these fields:
  - `name`: The exact column name as it appears in the dataset
  - `description`: A clear, concise description of what the column represents
  - `example`: A realistic example value from the data (use actual values when possible)
  - `similar_keywords`: An array of 3-5 alternative terms that could refer to the same concept

**Guidelines for Descriptions:**
- Be specific and accurate based on the actual data
- Use professional, clear language
- Indicate data types when relevant (e.g., "Date when...", "Numeric identifier for...")
- For date fields, note the format if apparent
- For identifiers, specify what they identify

**Example Output Format:**
[
  {
    "name": "matter_id",
    "description": "Unique identifier for the legal matter",
    "example": "MAT-1001",
    "similar_keywords": ["matter code", "case_id", "file_number"]
  },
  {
    "name": "client_ref",
    "description": "Identifier for the client associated with the matter",
    "example": "CL-1001",
    "similar_keywords": ["client code", "customer_ref", "account_id"]
  }
]

Remember: Output ONLY the JSON array, nothing else."""

# User-prompt template. `{data_sample}` is a str.format placeholder and must be
# substituted with the serialized sample rows before the prompt is sent;
# passing the template unformatted sends the literal text "{data_sample}".
user_prompt_template = """Analyze the following dataset sample and provide column descriptions in the specified JSON format.

<data>
{data_sample}
</data>

Provide the column descriptions as a JSON array."""



In [20]:
# Ask the LLM for column descriptions of the sampled rows.
# The template's {data_sample} placeholder has to be filled in with the
# JSON-serialized head of the DataFrame; sending the raw template would give
# the model the literal text "{data_sample}" and no data to analyze.
response = llm_client.generate(
    prompt=user_prompt_template.format(data_sample=json_head),
    system_prompt=system_prompt,
    temperature=0.0,  # deterministic output for a documentation task
    max_tokens=2000
)



In [21]:
import json
# NOTE: this rebinds the name `print` to rich's implementation at notebook
# top level, so cells executed afterwards in the same kernel also use rich's
# print instead of the builtin.
from rich import print
# Assumes `response` is a string containing only JSON (as the system prompt
# requests); json.loads raises if the text is not valid JSON.
print(json.loads(response))


In [None]:
# Test the Description Generator
#
# Smoke test: run DescriptionGenerator on one CSV, dump the raw result and then
# print a per-column summary. Every field of the LLM-produced result is read
# with a default, so one missing key does not abort the whole summary with a
# KeyError.
from description_generator import DescriptionGenerator, generate_description
import json

# Initialize the generator
generator = DescriptionGenerator()

# Test with the CSV file
test_file = "../data/synthetic_heterogeneous_pack_scaled/matters_A.csv"
print(f"Testing description generator with: {test_file}")
print("=" * 80)

# Generate descriptions
result = generator.generate(test_file)

# Display the result
print("\nGenerated Description:")
print("=" * 80)
print(json.dumps(result, indent=2))

# Display summary
print("\n" + "=" * 80)
print("Summary:")
print("=" * 80)
print(f"Filename: {result.get('filename', '')}")
print(f"Number of columns: {len(result.get('columns', []))}")
print("\nColumns:")
for col in result.get('columns', []):
    # `or []` also covers an explicit null for similar_keywords; str() keeps
    # join() from failing on non-string keywords.
    keywords = ', '.join(str(keyword) for keyword in (col.get('similar_keywords') or []))
    print(f"\n  {col.get('name', 'unknown')}:")
    print(f"    Type: {col.get('data_type', 'unknown')}")
    print(f"    Description: {col.get('description', 'unknown')}")
    print(f"    Example: {col.get('example', 'unknown')}")
    print(f"    Similar keywords: {keywords}")


In [None]:
# Function to generate descriptions for all files except .txt files in parallel
from pathlib import Path
from description_generator import DescriptionGenerator
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from typing import List, Dict, Any

def process_single_file(
    file_path: Path,
    data_path: Path,
    output_path: Path,
    generator: "DescriptionGenerator"
) -> Dict[str, Any]:
    """
    Generate the description for one file and save it as JSON.

    The output mirrors the input's location below ``data_path``:
    ``<data_path>/sub/name.ext`` is written to
    ``<output_path>/sub/name_descriptions.json``. Keeping the subdirectory
    prevents two inputs with the same file name in different folders from
    overwriting each other's result.

    This function is designed to be run in parallel; it never raises, every
    failure is reported through the returned dictionary.

    Args:
        file_path: Path to the file to process (must be located under data_path)
        data_path: Root data directory
        output_path: Output directory for results
        generator: Object exposing ``generate(file_path)`` that returns a
            JSON-serializable dict (a DescriptionGenerator instance)

    Returns:
        On success: ``status='success'``, ``input_file``, ``output_file``,
        ``filename``, ``columns_count`` and the full ``result``.
        On failure: ``status='error'``, ``input_file``, ``error`` (message)
        and ``error_type`` (exception class name).
    """
    try:
        # Generate description
        result = generator.generate(file_path)

        # Mirror the input's sub-path below the output directory.
        # NOTE: the input extension is dropped, so "x.csv" and "x.json" in the
        # same folder still map to the same output file.
        relative_path = file_path.relative_to(data_path)
        output_filename = f"{relative_path.stem}_descriptions.json"
        output_file = output_path / relative_path.parent / output_filename

        # Create subdirectories if needed
        output_file.parent.mkdir(parents=True, exist_ok=True)

        # Save to file
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        return {
            'status': 'success',
            'input_file': str(file_path),
            'output_file': str(output_file),
            'filename': result.get('filename', ''),
            'columns_count': len(result.get('columns', [])),
            'result': result  # Store the full result
        }
    except Exception as e:
        # Deliberate catch-all: one bad file must not take down the batch.
        return {
            'status': 'error',
            'input_file': str(file_path),
            'error': str(e),
            'error_type': type(e).__name__
        }

def _find_candidate_files(
    data_path: Path,
    output_path: Path,
    exclude_extensions: List[str],
    exclude_dirs: List[str]
) -> List[Path]:
    """Collect the files below data_path that should get a description."""
    # Normalize the exclusion lists once instead of once per visited file.
    excluded_suffixes = {ext.lower() for ext in exclude_extensions}
    excluded_dir_names = set(exclude_dirs)
    output_root = output_path.resolve()

    candidates: List[Path] = []
    for file_path in data_path.rglob('*'):
        # Skip directories
        if file_path.is_dir():
            continue

        # Skip files without an extension and files with an excluded one
        suffix = file_path.suffix
        if not suffix or suffix.lower() in excluded_suffixes:
            continue

        # Match excluded names only against directories *below* data_path, so
        # a component of data_dir itself (or the file name) can never cause a
        # file to be excluded.
        parent_dirs = file_path.relative_to(data_path).parts[:-1]
        if excluded_dir_names.intersection(parent_dirs):
            continue

        # The default output directory lives inside the data directory; skip
        # anything already written there so a re-run does not describe its
        # own previously generated description files.
        if output_root in file_path.resolve().parents:
            continue

        candidates.append(file_path)
    return candidates


def generate_descriptions_for_all_files(
    data_dir: str = "../data/synthetic_heterogeneous_pack_scaled",
    output_dir: str = "../data/synthetic_heterogeneous_pack_scaled/descriptions",
    exclude_extensions: List[str] = ['.txt', '.text'],
    exclude_dirs: List[str] = ['documents', 'filings', 'regulations', 'billing_files'],
    max_workers: int = 2
) -> Dict[str, Any]:
    """
    Generate descriptions for all files in the data directory except .txt files.
    Processes files in parallel for better performance.

    Files without an extension, files inside an excluded directory and files
    located inside the output directory are skipped.

    Args:
        data_dir: Root directory to search for files
        output_dir: Directory to save description JSON files
        exclude_extensions: List of file extensions to exclude (default: .txt, .text);
            compared case-insensitively. The default list is never mutated.
        exclude_dirs: List of directory names (below data_dir) to exclude from search
        max_workers: Maximum number of parallel workers (default: 2, reduced to avoid rate limits)

    Returns:
        Dictionary with results containing:
        - total: Total number of files found
        - successful_count: Number of successful results
        - error_count: Number of error results
        - successful: List of successful results
        - errors: List of error results
        - all_results: List of all results (successful + errors), in completion order
        - output_directory: The output directory as a string
    """
    data_path = Path(data_dir)
    output_path = Path(output_dir)

    # Create output directory if it doesn't exist
    output_path.mkdir(parents=True, exist_ok=True)

    # Find all files recursively, excluding .txt files and specific directories
    all_files = _find_candidate_files(data_path, output_path, exclude_extensions, exclude_dirs)

    print(f"Found {len(all_files)} files to process (excluding .txt files)")
    print(f"Using {max_workers} parallel workers")
    print("=" * 80)

    # Initialize generator (shared across threads)
    generator = DescriptionGenerator()

    # Store all results
    all_results: List[Dict[str, Any]] = []
    successful_results: List[Dict[str, Any]] = []
    error_results: List[Dict[str, Any]] = []

    # Process files in parallel
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_file = {
            executor.submit(process_single_file, file_path, data_path, output_path, generator): file_path
            for file_path in all_files
        }

        # Process completed tasks with progress bar
        with tqdm(total=len(all_files), desc="Generating descriptions") as pbar:
            for future in as_completed(future_to_file):
                file_path = future_to_file[future]
                try:
                    result = future.result()
                    all_results.append(result)

                    if result['status'] == 'success':
                        successful_results.append(result)
                    else:
                        error_results.append(result)
                        print(f"\n✗ Error processing {file_path}: {result.get('error', 'Unknown error')}")

                except Exception as e:
                    # Safety net: process_single_file reports its own failures,
                    # so this only triggers for errors raised outside of it.
                    error_result = {
                        'status': 'error',
                        'input_file': str(file_path),
                        'error': str(e),
                        'error_type': type(e).__name__
                    }
                    all_results.append(error_result)
                    error_results.append(error_result)
                    print(f"\n✗ Exception processing {file_path}: {e}")

                pbar.update(1)

    # Compile final results
    final_results = {
        'total': len(all_files),
        'successful_count': len(successful_results),
        'error_count': len(error_results),
        'successful': successful_results,
        'errors': error_results,
        'all_results': all_results,
        'output_directory': str(output_path)
    }

    # Print summary
    print("\n" + "=" * 80)
    print("Summary:")
    print("=" * 80)
    print(f"Total files processed: {final_results['total']}")
    print(f"Successful: {final_results['successful_count']}")
    print(f"Errors: {final_results['error_count']}")

    print(f"\nOutput directory: {output_path}")

    if successful_results:
        print(f"\n✓ Successfully processed {len(successful_results)} files:")
        for item in successful_results[:10]:  # Show first 10
            print(f"  • {item['filename']} ({item['columns_count']} columns)")
        if len(successful_results) > 10:
            print(f"  ... and {len(successful_results) - 10} more")

    if error_results:
        print(f"\n✗ {len(error_results)} files had errors:")
        for item in error_results[:5]:  # Show first 5 errors
            print(f"  • {item['input_file']}: {item.get('error', 'Unknown error')}")
        if len(error_results) > 5:
            print(f"  ... and {len(error_results) - 5} more errors")

    return final_results

# Run the function
# Using max_workers=2 to reduce rate limiting issues
# NOTE(review): no retry/backoff logic is visible in
# generate_descriptions_for_all_files or process_single_file; if rate-limit
# retries exist they presumably live inside DescriptionGenerator — confirm.
results = generate_descriptions_for_all_files(max_workers=2)


In [None]:
import xml.etree.ElementTree as ET

def get_top_5_generic(xml_path, n=5):
    """
    Return the first ``n`` elements of an XML file, serialized as strings.

    Elements are taken in document order from a walk over the whole tree,
    skipping the root itself. The walk descends into children, so nested
    elements count too: a returned parent string already contains its
    children, and those children may appear again as separate entries.

    Args:
        xml_path: Path (or file object) of the XML document to parse.
        n: Maximum number of elements to return (default: 5).

    Returns:
        List of at most ``n`` strings, each the unicode serialization of one
        element (including its subtree and tail text).

    Raises:
        ValueError: If ``n`` is negative.
        xml.etree.ElementTree.ParseError: If the document is not well-formed.
        OSError: If the file cannot be opened.
    """
    from itertools import islice

    if n < 0:
        raise ValueError("n must be non-negative")

    root = ET.parse(xml_path).getroot()

    # root.iter() yields the root first; start at index 1 to skip it and stop
    # after n elements instead of materializing every element of the tree.
    first_elements = islice(root.iter(), 1, 1 + n)
    return [ET.tostring(elem, encoding='unicode') for elem in first_elements]


# Example usage: preview the leading elements of the structured clients XML.
xml_file = "../data/synthetic_heterogeneous_pack_scaled/structured_clients_C.xml"
top5 = get_top_5_generic(xml_file)

# Number the entries starting from 1 for display.
for i, entry in zip(range(1, len(top5) + 1), top5):
    print(f"\nEntry {i}:\n{entry}")


FileNotFoundError: [Errno 2] No such file or directory: '../data/synthetic_heterogeneous_pack_scaled/filings/filings_A.xml'