# Configuration
Define imports and configuration parameters: the optional file filter, voice model and voice ID, AWS region, root directory, and output directory.

In [None]:
import boto3
import hashlib
import string
import os
import json
import re
# --- Input selection ---
# Base name (without extension) of the one JSON file to process;
# set to None to process every file.
filter_file_name = 'Mindbreak'

# --- Polly voice ---
voice_id = 'Salli'
voice_model = 'neural'  # 'standard' or 'neural'
# Any value other than 'neural' (case-insensitive) selects the standard engine.
if voice_model.lower() == 'neural':
    engine = 'neural'
else:
    engine = 'standard'

# AWS region used for the Polly client; it must support the neural engine
# when voice_model is 'neural'.
region_name = 'us-east-1'  # change as needed

# --- Directories ---
root_dir = '../hypnosis'  # base folder searched for JSON files
audio_dir = '../audio'  # folder where the generated MP3 files are stored

# Audio Generation Process
Initialize the AWS Polly client, create the output directory, load the hashes of lines that already have audio, filter the JSON files, and generate an MP3 file for each new line using the selected voice model.

In [None]:
# Generate one MP3 per unique line found in the JSON files under root_dir.
# Audio files are named "<sha256 of the normalised line>.mp3", so a line that
# already has a file in audio_dir is never synthesized again.
polly = boto3.client('polly', region_name=region_name)

os.makedirs(audio_dir, exist_ok=True)

# Hashes that already have audio on disk (file name without the extension).
processed_lines = {
    os.path.splitext(filename)[0]
    for filename in os.listdir(audio_dir)
    if filename.endswith('.mp3')
}

# Built once: removes ASCII punctuation when normalising a line for hashing.
punctuation_table = str.maketrans('', '', string.punctuation)

for subdir, _, files in os.walk(root_dir):
    for file in files:
        if not file.endswith('.json'):
            continue
        theme = os.path.splitext(file)[0]
        # Skip files not matching the filter (a falsy filter processes all).
        if filter_file_name and theme != filter_file_name:
            continue
        new_entries = 0
        file_path = os.path.join(subdir, file)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            for entry in data:
                # Use the polly_text override if provided, else fall back to
                # the display line.
                text_to_say = entry.get('polly_text', entry.get('line', ''))
                if not text_to_say:
                    # Nothing to synthesize; sending empty text to Polly would
                    # raise and abort the remaining entries of this file.
                    continue
                # Normalise (no punctuation, no spaces, lower case) so that
                # lines differing only in those respects share one audio file.
                line_key = text_to_say.translate(punctuation_table).replace(' ', '').lower()
                line_hash = hashlib.sha256(line_key.encode('utf-8')).hexdigest()
                if line_hash in processed_lines:
                    continue
                response = polly.synthesize_speech(
                    Text=text_to_say,
                    OutputFormat='mp3',
                    VoiceId=voice_id,
                    Engine=engine
                )
                audio_file_path = os.path.join(audio_dir, f"{line_hash}.mp3")
                # Write to a temporary name first: an interrupted write must
                # not leave a truncated "<hash>.mp3" behind, because later
                # runs treat any existing .mp3 as already processed.
                temp_file_path = audio_file_path + '.part'
                audio_stream = response['AudioStream']
                try:
                    with open(temp_file_path, 'wb') as audio_file:
                        audio_file.write(audio_stream.read())
                finally:
                    # Release the underlying HTTP connection.
                    audio_stream.close()
                os.replace(temp_file_path, audio_file_path)
                processed_lines.add(line_hash)
                # Count only files that were actually written.
                new_entries += 1
        except Exception as e:
            print(f"Error processing file {file}: {e}")
        # Report after the try so files generated before an error are
        # still accounted for.
        if new_entries:
            print(f"Processed: {theme} with {new_entries} new entries")

# File Consistency Check

This cell reviews all JSON entries and compares their line hashes against the audio files in the audio directory. It identifies any audio files that do not correspond to an entry in the JSON files and also flags any entries that are missing an associated audio file.

In [None]:
# Cross-check the JSON entries against the MP3 files in audio_dir, in both
# directions: entries without audio, and audio files without an entry.
orphaned_lines = []  # entries with no text or no matching audio file
expected_hashes = set()  # hash of every non-empty entry line seen in the JSON
scan_failed = False  # set when a JSON file could not be fully read

# Read the audio directory directly so the check reflects what is on disk
# now, rather than in-memory state left over from an earlier cell.
if os.path.isdir(audio_dir):
    audio_hashes = {
        os.path.splitext(filename)[0]
        for filename in os.listdir(audio_dir)
        if filename.endswith('.mp3')
    }
else:
    audio_hashes = set()

# Built once: removes ASCII punctuation when normalising a line for hashing.
punctuation_table = str.maketrans('', '', string.punctuation)

for subdir, _, files in os.walk(root_dir):
    for file in files:
        if not file.endswith('.json'):
            continue
        file_path = os.path.join(subdir, file)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            for entry in data:
                # Check the polly_text override first, then the display line.
                line = entry.get('polly_text', entry.get('line', ''))
                if not line:
                    orphaned_lines.append({'file': file, 'entry': entry, 'reason': 'Missing line'})
                    continue
                # Same normalisation and hash as used to name the audio files.
                line_key = line.translate(punctuation_table).replace(' ', '').lower()
                line_hash = hashlib.sha256(line_key.encode('utf-8')).hexdigest()
                expected_hashes.add(line_hash)
                if line_hash not in audio_hashes:
                    orphaned_lines.append({'file': file, 'line': line, 'reason': 'No audio associated'})
        except Exception as e:
            scan_failed = True
            print(f"Error processing file {file}: {e}")

# Audio files whose name matches no entry in any JSON file.
orphaned_audio = sorted(audio_hashes - expected_hashes)

if orphaned_lines:
    print("The following entries are orphaned:")
    for orphan in orphaned_lines:
        print(f"File: {orphan['file']}, Line: {orphan.get('line', '')}, Reason: {orphan['reason']}")
if orphaned_audio:
    print("The following audio files have no matching JSON entry:")
    for audio_hash in orphaned_audio:
        print(f"Audio: {audio_hash}.mp3, Reason: No entry associated")
    if scan_failed:
        # Entries from unreadable files are missing from expected_hashes, so
        # some of the audio files listed above may in fact be in use.
        print("Note: some JSON files could not be read; the audio list above may include false positives.")
if not orphaned_lines and not orphaned_audio:
    print("No orphaned or abandoned lines found.")