In [2]:
import os
import json
import weaviate
from weaviate.classes.init import Auth
from typing import Optional
# Name of the Weaviate collection used as the default `collection_name`
# argument of export_weaviate_data() and inspect_weaviate_data() below.
collection_name = 'Vmi_docs'

class WeaviateJSONEncoder(json.JSONEncoder):
    """JSON encoder for values the stock encoder cannot serialise.

    ``json`` calls :meth:`default` only for objects it has no native
    encoding for (e.g. ``uuid.UUID``, ``datetime``, sets, client-specific
    value types).  This encoder is deliberately best-effort: it never raises
    ``TypeError``; anything it does not recognise is written as ``str(obj)``
    so that an export is not aborted by a single exotic value.
    """

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        Args:
            obj: A value the base encoder could not serialise.

        Returns:
            * an ISO 8601 string for ``datetime``/``date``/``time`` objects,
            * a list for ``set``/``frozenset`` (element order is arbitrary),
            * ``str(obj)`` for everything else (this covers ``uuid.UUID``).
        """
        # Imported locally so the class does not rely on a module-level import.
        from datetime import date, datetime, time

        # str(datetime) uses a space separator; ISO 8601 is the interchange form.
        if isinstance(obj, (datetime, date, time)):
            return obj.isoformat()

        # Emit a real JSON array instead of the "{'a', 'b'}" repr text.
        if isinstance(obj, (set, frozenset)):
            return list(obj)

        # Every Python object supports str(), so this is the universal
        # fallback (the previous ``hasattr(obj, '__str__')`` guard was always
        # true and the call to the base ``default`` could never be reached).
        return str(obj)


def export_weaviate_data(
        output_file: str,
        wcd_url: str = 'https://xh1j9trzu5cervreztxw.c0.europe-west3.gcp.weaviate.cloud',
        api_key: str = 'uTTyayyrfwyn98zBq6ukAcIAVnEJjkBWMLac',
        collection_name: str = collection_name,
        batch_size: int = 100
):
    """
    Export all data from a Weaviate collection to a JSON file.

    Objects are read page by page with Weaviate's cursor (``after=<uuid of
    the last object already read>``) rather than with ``offset``.  Offset
    paging is bounded by the server's ``QUERY_MAXIMUM_RESULTS`` setting, so
    it cannot walk an arbitrarily large collection; the cursor can.

    Each exported record is the object's properties plus a ``"uuid"`` key.
    Property values are handed to ``WeaviateJSONEncoder`` unchanged, so
    ``None`` is written as JSON ``null`` and non-JSON types are converted by
    the encoder at dump time.  The file is only written when at least one
    object was read.

    SECURITY: the default ``api_key`` is a credential committed in source.
    It is left in place so existing calls keep working, but it should be
    rotated and supplied by the caller (for example from ``os.environ``).

    Args:
        output_file (str): Path to the output JSON file
        wcd_url (str): Weaviate cluster URL
        api_key (str): API key for authentication
        collection_name (str): Name of the collection to export
        batch_size (int): Number of objects to fetch per batch (must be >= 1)

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        Exception: Any error raised while connecting, querying or writing is
            printed and then re-raised unchanged.
    """
    # A non-positive page size can never make progress through the collection.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    client = None
    try:
        # Initialize the client
        client = weaviate.connect_to_weaviate_cloud(
            cluster_url=wcd_url,
            auth_credentials=Auth.api_key(api_key),
        )

        # Get the collection
        collection = client.collections.get(collection_name)

        all_data = []
        cursor = None  # UUID of the last exported object; None requests the first page

        while True:
            # Fetch the next page, starting after the last object already seen
            query_result = collection.query.fetch_objects(
                limit=batch_size,
                after=cursor
            )

            objects = getattr(query_result, 'objects', None)
            if not objects:
                # An empty page means the cursor has reached the end.
                break

            # "uuid" comes first; a property that is itself named "uuid"
            # overrides it, exactly as dict unpacking did before.
            all_data.extend(
                {"uuid": str(obj.uuid), **obj.properties}
                for obj in objects
            )

            # Print progress
            print(f"Exported {len(all_data)} objects so far...")

            cursor = objects[-1].uuid

        if all_data:
            # Save to file using the custom encoder
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(all_data, f, indent=2, ensure_ascii=False, cls=WeaviateJSONEncoder)

            print(f"\nSuccessfully exported {len(all_data)} objects to {output_file}")
        else:
            print("No data found to export")

    except Exception as e:
        print(f"Error during export: {str(e)}")
        raise

    finally:
        # Always release the connection, including on failure.
        if client is not None:
            client.close()


def inspect_weaviate_data(
        wcd_url: str = 'https://xh1j9trzu5cervreztxw.c0.europe-west3.gcp.weaviate.cloud',
        api_key: str = 'uTTyayyrfwyn98zBq6ukAcIAVnEJjkBWMLac',
        collection_name: str = collection_name,
        limit: Optional[int] = None
):
    """
    Inspect the contents of a Weaviate collection.

    Prints the collection name, a few configuration attributes, up to
    ``limit`` objects with their properties (string values longer than 100
    characters are truncated), and how many of the retrieved objects carry
    each property.

    SECURITY: the default ``api_key`` is a credential committed in source.
    It is left in place so existing calls keep working, but it should be
    rotated and supplied by the caller (for example from ``os.environ``).

    Args:
        wcd_url (str): Weaviate cluster URL
        api_key (str): API key for authentication
        collection_name (str): Name of the collection to inspect
        limit (Optional[int]): Maximum number of objects to fetch; 100 when
            ``None``

    Raises:
        Exception: Errors from connecting or reading the configuration are
            printed and re-raised.  Errors while fetching or printing
            objects are printed only, so the configuration summary already
            shown is not lost.
    """
    client = None
    try:
        # Initialize the client
        client = weaviate.connect_to_weaviate_cloud(
            cluster_url=wcd_url,
            auth_credentials=Auth.api_key(api_key),
        )

        # Get the collection
        collection = client.collections.get(collection_name)

        # Get collection info
        print("\n=== Collection Information ===")
        print(f"Collection name: {collection_name}")

        # Get collection configuration
        config = collection.config.get()
        print("\nCollection Configuration:")
        print(f"Description: {getattr(config, 'description', 'N/A')}")
        print(f"Vectorizer: {getattr(config, 'vectorizer', 'N/A')}")
        print(f"Vector index type: {getattr(config, 'vector_index_type', 'N/A')}")

        try:
            # Fetch objects
            query_result = collection.query.fetch_objects(
                limit=limit if limit is not None else 100
            )

            # Treat a missing or empty ``objects`` attribute the same way.
            objects = getattr(query_result, 'objects', None) or []
            total_objects = len(objects)

            print(f"\nRetrieved {total_objects} objects")

            if total_objects == 0:
                # Previously unreachable for an empty collection because the
                # result always exposes ``objects``.
                print("No objects found in the collection")
            else:
                print("\n=== Sample Data ===")
                prop_stats = {}
                for i, obj in enumerate(objects, 1):
                    print(f"\nObject {i}:")
                    print(f"UUID: {str(obj.uuid)}")  # Convert UUID to string

                    # Print all properties
                    for prop_name, value in obj.properties.items():
                        # Count coverage in the same pass; property names are
                        # dict keys, so each object counts at most once.
                        prop_stats[prop_name] = prop_stats.get(prop_name, 0) + 1

                        # Convert any special types to strings
                        if not isinstance(value, (str, int, float, bool, list, dict)):
                            value = str(value)

                        # If value is too long, truncate it
                        if isinstance(value, str) and len(value) > 100:
                            print(f"{prop_name}: {value[:100]}...")
                        else:
                            print(f"{prop_name}: {value}")

                    if i % 5 == 0:  # Add a separator every 5 objects
                        print("\n" + "-" * 50)

                # Print summary statistics
                print("\n=== Summary Statistics ===")

                print("\nProperty coverage:")
                for prop, count in sorted(prop_stats.items()):
                    percentage = (count / total_objects) * 100
                    print(f"{prop}: {count}/{total_objects} objects ({percentage:.1f}%)")

        except Exception as e:
            # Best-effort: report the failure but keep what was printed above.
            print(f"Error fetching objects: {str(e)}")

    except Exception as e:
        print(f"Error during inspection: {str(e)}")
        raise

    finally:
        # Always release the connection, including on failure.
        if client is not None:
            client.close()



In [47]:
# Export the default collection to weaviate_export.json, fetching 100
# objects per request. Connection settings come from the function defaults.
export_weaviate_data('weaviate_export.json', batch_size=100)
# Alternative: print a preview of the first 10 objects instead of exporting.
# inspect_weaviate_data(limit=10)

Exported 100 objects so far...
Exported 200 objects so far...
Exported 300 objects so far...
Exported 400 objects so far...
Exported 500 objects so far...
Exported 600 objects so far...
Exported 700 objects so far...
Exported 800 objects so far...
Exported 900 objects so far...
Exported 1000 objects so far...
Exported 1100 objects so far...
Exported 1200 objects so far...
Exported 1254 objects so far...

Successfully exported 1254 objects to weaviate_export.json


In [15]:
# Sample section records: three consecutive paragraphs that share the same
# chapter and article headings and differ only in paragraph number and text.
section_paths = [
    {
        'chapter': 'I SKYRIUS',
        'chapter_title': 'I BENDROSIOS NUOSTATOS',
        'article': '1 straipsnis',
        'article_title': 'Įstatymo paskirtis ir taikymo sritis',
        'paragraph_number': paragraph_number,
        'content': content,
    }
    for paragraph_number, content in (
        (
            '1.',
            'Lietuvos Respublikos teritorija (toliau – Lietuva) – Lietuvos Respublikos teritorija ir greta Lietuvos Respublikos teritorinių vandenų esantis plotas, kuriame pagal Lietuvos Respublikos įstatymus ir tarptautinę teisę Lietuvos Respublika turi teisę tyrinėti ir eksploatuoti jūros dugno ir požeminius gamtos išteklius.',
        ),
        (
            '2.',
            'Gyventojas – nuolatinis ir nenuolatinis Lietuvos gyventojas.',
        ),
        (
            '3.',
            'Nuolatinis Lietuvos gyventojas – fizinis asmuo, kuris laikomas nuolatiniu Lietuvos gyventoju pagal šio Įstatymo 4 straipsnio nuostatas.',
        ),
    )
]