In [10]:
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
from cryptography.hazmat.backends import default_backend
import os
import random
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import hashlib

def generate_encryption_dataset(cipher_type='AES', num_samples=10_000_000, batch_size=100_000):
    """
    Generates a large-scale encryption dataset with cryptographic integrity checks
    
    Args:
        cipher_type: 'AES' or 'ChaCha20'
        num_samples: Total number of samples to generate
        batch_size: Number of samples per write batch
    
    Output:
        encryption_dataset.parquet with schema:
        - plaintext: bytes (16-256 bytes)
        - ciphertext: bytes
        - key: bytes
        - nonce: bytes (ChaCha20 only)
    """
    # Dataset metadata
    metadata = {
        'algorithm': cipher_type,
        'creation_date': datetime.utcnow().isoformat(),
        'total_samples': str(num_samples),
        'version': '1.1',
        'author': 'Genetic Programming Cryptanalysis Suite',
        'security_note': 'FOR RESEARCH USE ONLY - UNSAFE FOR PRODUCTION'
    }

    # Initialize schema based on cipher type
    columns = ['plaintext', 'ciphertext', 'key']
    if cipher_type == 'ChaCha20':
        columns.append('nonce')

    # Create initial Parquet file with metadata
    pd.DataFrame(columns=columns).to_parquet(
        'encryption_dataset.parquet',
        engine='pyarrow',
        compression=None,
        custom_metadata=metadata
    )

    # Generate data in memory-efficient batches
    with tqdm(total=num_samples, desc='Generating Dataset') as pbar:
        for batch_idx in range(0, num_samples, batch_size):
            current_batch_size = min(batch_size, num_samples - batch_idx)
            batch_data = {col: [] for col in columns}
            
            for _ in range(current_batch_size):
                # Generate random plaintext (16-256 bytes)
                pt_length = random.randint(16, 256)
                pt = os.urandom(pt_length)
                key = os.urandom(16 if cipher_type == 'AES' else 8)
                
                # Encrypt with selected algorithm
                if cipher_type == 'AES':
                    cipher = Cipher(algorithms.AES(key), modes.ECB(), backend=default_backend())
                    nonce = None
                elif cipher_type == 'ChaCha20':
                    nonce = os.urandom(16)
                    cipher = Cipher(algorithms.ChaCha20(key, nonce), mode=None, backend=default_backend())
                
                encryptor = cipher.encryptor()
                ct = encryptor.update(pt) + encryptor.finalize()
                
                # Store results
                batch_data['plaintext'].append(pt)
                batch_data['ciphertext'].append(ct)
                batch_data['key'].append(key)
                if cipher_type == 'ChaCha20':
                    batch_data['nonce'].append(nce)
            
            # Create batch dataframe
            df = pd.DataFrame(batch_data)
            
            # Add batch integrity check
            batch_hash = hashlib.sha3_256(
                b''.join(df['ciphertext'] + df['key'])
            ).hexdigest()
            df.attrs['batch_hash'] = batch_hash
            
            # Append to Parquet file
            df.to_parquet(
                'encryption_dataset.parquet',
                engine='pyarrow',
                compression='zstd',
                index=False,
                append=True,
                existing_metadata='update'
            )
            
            pbar.update(current_batch_size)

if __name__ == '__main__':
    # Example usage with validation
    generate_encryption_dataset(
        cipher_type='AES',
        num_samples=10_000_000,
        batch_size=100_000
    )

TypeError: __cinit__() got an unexpected keyword argument 'custom_metadata'