# Data Management
> Core functionality for data ingestion, hashing, and quality checking.

This module handles:
- Data ingestion and storage
- Content hashing and ID generation
- Basic quality checks
- Content provenance tracking


In [None]:
#| default_exp data
#| export
from fastcore.basics import *
from fastcore.test import *
import hashlib
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import json
import numpy as np
from datetime import datetime


In [None]:
#| export
class ContentStore:
    """Manages storage and retrieval of training data content.

    Each piece of content is written to its own file inside `store_path`,
    named by the SHA-256 hex digest of the content's UTF-8 bytes. A sibling
    `index.json` maps every content ID to its metadata (`contributor_id`,
    `timestamp`, `size`).
    """
    def __init__(self, store_path: Path):
        """Open the store at `store_path`, creating the directory and index if missing"""
        self.store_path = Path(store_path)
        # parents=True so a nested store path works on first use
        self.store_path.mkdir(parents=True, exist_ok=True)
        self.index_file = self.store_path/'index.json'
        self._load_index()

    def _load_index(self):
        """Load the content index from disk, or initialize (and persist) an empty one"""
        if self.index_file.exists():
            self.index = json.loads(self.index_file.read_text(encoding='utf-8'))
        else:
            self.index = {}
            self._save_index()

    def _save_index(self):
        """Save the current index to disk"""
        self.index_file.write_text(json.dumps(self.index, indent=2), encoding='utf-8')

    def add_content(self, content: str, contributor_id: str) -> str:
        """
        Add new content to the store.

        Content that is already indexed is not stored again and its metadata
        (including the original contributor) is left untouched.

        Returns: content_id (SHA-256 hex digest of the UTF-8 encoded content)
        """
        payload = content.encode('utf-8')
        content_id = hashlib.sha256(payload).hexdigest()
        if content_id in self.index:
            return content_id

        # Store the exact bytes that were hashed: text-mode I/O would apply the
        # platform's encoding and newline translation, so the file could stop
        # matching its own ID. Writing before touching the index also means a
        # failed write cannot leave an index entry without a content file.
        (self.store_path/content_id).write_bytes(payload)

        self.index[content_id] = {
            'contributor_id': contributor_id,
            'timestamp': datetime.now().isoformat(),
            'size': len(content)  # length in characters, not encoded bytes
        }
        self._save_index()
        return content_id

    def get_content(self, content_id: str) -> Optional[str]:
        """Retrieve content by ID; returns None when the ID is not in the index"""
        if content_id not in self.index:
            return None
        # Read raw bytes and decode explicitly so '\r\n' / '\r' survive the
        # round trip (text-mode reads would normalize them to '\n')
        return (self.store_path/content_id).read_bytes().decode('utf-8')


In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| test
@patch
def cleanup(self:ContentStore):
    "Delete every entry inside the store directory, then the directory itself"
    root = self.store_path
    # Nothing to do when the store directory is already gone
    if not root.exists():
        return
    for entry in root.glob('*'):
        entry.unlink()
    # The directory must be empty by now for rmdir to succeed
    root.rmdir()

def test_content_store():
    "Check add/get round trip, index metadata, deduplication and the unknown-ID path of `ContentStore`"
    # Setup
    store = ContentStore(Path('test_store'))
    try:
        # Test adding content
        content = "This is a test piece of content"
        contributor = "test_user_1"
        content_id = store.add_content(content, contributor)

        # Test content exists and can be retrieved
        test_eq(store.get_content(content_id), content)

        # Test index contains correct metadata
        assert content_id in store.index
        assert store.index[content_id]['contributor_id'] == contributor
        assert store.index[content_id]['size'] == len(content)

        # Test duplicate content returns same ID and keeps the original contributor
        duplicate_id = store.add_content(content, "test_user_2")
        test_eq(duplicate_id, content_id)
        test_eq(store.index[content_id]['contributor_id'], contributor)

        # Test nonexistent content returns None
        test_eq(store.get_content("nonexistent"), None)
    finally:
        # Cleanup runs even when a check above fails, so a failing run does
        # not leave a stale 'test_store' directory behind for the next one
        store.cleanup()

In [None]:
#| example
# Walkthrough of ContentStore: add two items, read one back, show that
# re-adding identical content yields the same ID, then remove the store.
store = ContentStore(Path('example_store'))

# Adding new content: each add_content call returns the content's ID
print("Adding content from different contributors:")
content1 = "The quick brown fox jumps over the lazy dog."
content2 = "Another piece of training data for our model."

id1 = store.add_content(content1, "alice")
print(f"Content 1 ID: {id1[:8]}... (from alice)")  # only the first 8 characters of the ID are shown
print(f"Stored metadata: {store.index[id1]}")

id2 = store.add_content(content2, "bob")
print(f"\nContent 2 ID: {id2[:8]}... (from bob)")

# Retrieving content by its ID
print("\nRetrieving content:")
retrieved = store.get_content(id1)
print(f"Retrieved content 1: '{retrieved}'")

# Demonstrating deduplication: the same text from another contributor
# ("charlie") comes back with the ID first issued for alice's copy
print("\nTesting deduplication:")
duplicate_id = store.add_content(content1, "charlie")
print(f"Original ID    : {id1[:8]}...")
print(f"Duplicate ID   : {duplicate_id[:8]}...")
print(f"IDs match      : {id1 == duplicate_id}")

# Cleanup: delete the example_store directory created above
store.cleanup()

Adding content from different contributors:
Content 1 ID: ef537f25... (from alice)
Stored metadata: {'contributor_id': 'alice', 'timestamp': '2025-02-16T21:17:13.047794', 'size': 44}

Content 2 ID: 7f30cb1f... (from bob)

Retrieving content:
Retrieved content 1: 'The quick brown fox jumps over the lazy dog.'

Testing deduplication:
Original ID    : ef537f25...
Duplicate ID   : ef537f25...
IDs match      : True


In [None]:
#| hide
# Run nbdev's export step for this notebook (presumably writes the cells marked
# `#| export` to the module named by `#| default_exp` — confirm against nbdev docs)
import nbdev; nbdev.nbdev_export()