In [None]:
import json

In [17]:
import json
import re

_SAMPLE_HEADER_RE = re.compile(r'Sample \d+:')
_BLANK_LINES_RE = re.compile(r'\n{2,}')
_PYTHON_PREFIX_RE = re.compile(r'^Python\s*\n')
_PARCEL_PREFIX_RE = re.compile(r'^Parcel IDs: ')


def _collapse_blank_lines(text):
    """Replace every run of two or more newlines with a single newline."""
    return _BLANK_LINES_RE.sub('\n', text)


def _clean_query(text):
    """Drop every parenthesis character from the query and lowercase it."""
    return text.replace("(", "").replace(")", "").lower()


def _clean_code(text):
    """Remove a leading "Python" marker line and collapse blank lines."""
    return _collapse_blank_lines(_PYTHON_PREFIX_RE.sub('', text))


def _clean_answer(text):
    """Remove a leading "Parcel IDs: " prefix and collapse blank lines."""
    return _collapse_blank_lines(_PARCEL_PREFIX_RE.sub('', text))


# One entry per output key:
#   (key, span pattern, first-quote pattern, unquoted pattern, cleaner)
# * span pattern     - everything from the label up to the next label / end
# * first-quote      - quoted value that ends at the first following quote
# * unquoted pattern - value without surrounding quotes
_FIELDS = (
    (
        "Query",
        re.compile(r'Query:(.*?)(?=\n|Code:|Answer:|\Z)'),
        re.compile(r'Query: "(.*?)"', re.DOTALL),
        re.compile(r'Query: (.*?)(?:\n|$)', re.DOTALL),
        _clean_query,
    ),
    (
        "Code",
        re.compile(r'Code:(.*?)(?=Answer:|\Z)', re.DOTALL),
        re.compile(r'Code:"(.*?)"', re.DOTALL),
        re.compile(r'Code:(.*?)(?:\nAnswer|\Z)', re.DOTALL),
        _clean_code,
    ),
    (
        "Answer",
        re.compile(r'Answer:(.*?)(?:\n\]|\Z)', re.DOTALL),
        re.compile(r'Answer:"(.*?)"', re.DOTALL),
        re.compile(r'Answer:(.*?)(?:\n\]|\Z)', re.DOTALL),
        _clean_answer,
    ),
)


def _extract_field(section, span_re, quoted_re, bare_re):
    """Pull one raw field value out of a sample section.

    Returns:
        A ``(text, is_bare)`` tuple. ``is_bare`` is True when the value came
        from the unquoted pattern (or was not found at all), i.e. when no
        surrounding quotes were consumed during extraction.
    """
    # Preferred: the whole span between this label and the next one is
    # wrapped in a single pair of quotes. Taking the span as a unit keeps
    # double quotes that appear inside the value (common in code) instead of
    # cutting the value off at the first one.
    span_match = span_re.search(section)
    if span_match:
        span = span_match.group(1).strip().rstrip(',').rstrip()
        if len(span) >= 2 and span.startswith('"') and span.endswith('"'):
            return span[1:-1], False

    # Otherwise take the text up to the first closing quote. This covers
    # values followed by trailing text on the same span.
    quoted_match = quoted_re.search(section)
    if quoted_match:
        return quoted_match.group(1), False

    bare_match = bare_re.search(section)
    if bare_match:
        return bare_match.group(1), True
    return "", True


def clean_and_format_samples(input_data):
    """
    Parse raw "Sample N:" text into a list of cleaned sample dictionaries.

    Each sample section is searched for ``Query:``, ``Code:`` and ``Answer:``
    fields, quoted or unquoted. The extracted values are cleaned as follows:
    1. A leading "Parcel IDs: " is removed from answers
    2. A leading "Python" marker line is removed from code
    3. All "(" and ")" characters are removed from queries, which are also
       converted to lowercase
    4. Runs of multiple newlines in code and answers become single newlines

    Double quotes inside a quoted value are preserved as long as the value is
    the only thing between its label and the next label (or the section end).

    Args:
        input_data: String containing the raw samples

    Returns:
        List of dictionaries with the keys "Query", "Code" and "Answer".
        A missing field is an empty string; sections in which all three
        fields are empty are skipped.
    """
    if not input_data:
        return []

    samples = []
    for section in _SAMPLE_HEADER_RE.split(input_data):
        section = section.strip()
        if not section:
            # Text before the first header or between adjacent headers.
            continue

        sample = {}
        for key, span_re, quoted_re, bare_re, clean in _FIELDS:
            raw, is_bare = _extract_field(section, span_re, quoted_re, bare_re)
            value = clean(raw.strip())
            # Only unquoted extractions can still carry wrapping quotes;
            # stripping again from quoted ones would eat quotes that belong
            # to the value itself.
            if is_bare and value.startswith('"') and value.endswith('"'):
                value = value[1:-1]
            sample[key] = value.strip()

        if any(sample.values()):
            samples.append(sample)

    return samples

def write_formatted_json(input_file, output_file):
    """
    Read raw samples from a text file, format them, and save them as JSON.

    A short preview of the first two formatted samples is printed after the
    file has been written. Any exception raised along the way is reported
    with a printed message instead of being propagated.

    Args:
        input_file: Path to the input file with raw samples
        output_file: Path to the output JSON file

    Returns:
        The list of formatted samples, or an empty list if an error occurred
    """
    def _preview(text, flatten):
        # Values of at most 50 characters are shown unchanged; longer ones
        # are cut to 50 characters (optionally with newlines turned into
        # spaces) and suffixed with an ellipsis.
        if len(text) > 50:
            head = text[:50]
            if flatten:
                head = head.replace('\n', ' ')
            return head + "..."
        return text

    try:
        # 'utf-8-sig' transparently drops a UTF-8 BOM if the file has one
        with open(input_file, 'r', encoding='utf-8-sig') as source:
            raw_text = source.read()

        formatted = clean_and_format_samples(raw_text)

        with open(output_file, 'w') as sink:
            json.dump(formatted, sink, indent=2)

        print(f"Successfully formatted {len(formatted)} samples and wrote to {output_file}")

        # Show how the first couple of entries ended up looking
        print("\nSample format examples:")
        for number, entry in enumerate(formatted[:2], start=1):
            print(f"Sample {number}:")
            query = _preview(entry.get("Query", ""), False)
            code_snippet = _preview(entry.get("Code", ""), True)
            answer_snippet = _preview(entry.get("Answer", ""), True)

            print(f"  Query: {query}")
            print(f"  Code snippet: {code_snippet}")
            print(f"  Answer snippet: {answer_snippet}")
            print()

        return formatted

    except Exception as e:
        print(f"Error formatting samples: {e}")
        return []

# Example usage
if __name__ == "__main__":
    # The input and output paths are hard-coded. To take them from the
    # command line instead, read sys.argv[1] / sys.argv[2] and fall back to
    # these two names when an argument is missing.
    write_formatted_json(
        input_file="paste.txt",
        output_file="spatial_samples.json",
    )

Successfully formatted 0 samples and wrote to spatial_samples.json

Sample format examples:
