# In [None]:
import os
import uuid
import csv
from pydub import AudioSegment
from pydub.silence import split_on_silence

def dynamic_silence_thresh(audio_segment, target_dbfs=-40):
    """
    Derive a silence threshold from the segment's average loudness.

    The candidate threshold sits 10 dB below the segment's average dBFS;
    ``target_dbfs`` acts as a lower bound on the result.

    Args:
        audio_segment (AudioSegment): Segment whose ``dBFS`` attribute is read.
        target_dbfs (int): Lowest threshold that may be returned.

    Returns:
        The candidate threshold when it is strictly greater than
        ``target_dbfs``, otherwise ``target_dbfs`` itself.
    """
    relative_floor = audio_segment.dBFS - 10
    if relative_floor > target_dbfs:
        return relative_floor
    return target_dbfs

def chunk_audio(input_file,
                min_duration=6000,
                max_duration=18000,
                target_dbfs=-40,
                keep_silence=500,
                overlap=0):
    """
    Chunks the audio based on silence and constraints on duration.

    The file is split on silence, then the pieces are greedily merged into
    chunks. A piece is added to the chunk being built while the combined
    length minus ``overlap`` stays within ``max_duration``; otherwise the
    chunk being built is emitted (only if it reaches ``min_duration``, else
    it is discarded) and a new chunk starts with that piece.

    Args:
        input_file (str): Path to the input audio file.
        min_duration (int): Minimum duration of chunks in milliseconds.
        max_duration (int): Maximum duration of chunks in milliseconds.
        target_dbfs (int): Target dBFS for silence threshold (default is -40).
        keep_silence (int): Amount of silence to keep at the beginning and end
            of each split piece (default is 500ms).
        overlap (int): Slack in milliseconds allowed on top of ``max_duration``
            while merging; when greater than 0 the same amount is then trimmed
            from the end of every chunk except the last (default is 0ms).

    Returns:
        list: A list of audio chunks. Empty when the file cannot be loaded or
        when no chunk reaches ``min_duration``.
    """
    try:
        # Load the audio file
        audio = AudioSegment.from_file(input_file)
    except Exception as e:
        # Best-effort: an unreadable file is reported and yields no chunks.
        print(f"Error loading audio file: {e}")
        return []

    # Calculate a dynamic silence threshold
    silence_thresh = dynamic_silence_thresh(audio, target_dbfs)

    # Split the audio based on dynamic silence
    chunks = split_on_silence(audio,
                              min_silence_len=500,
                              silence_thresh=silence_thresh,
                              keep_silence=keep_silence)

    output_chunks = []
    current_chunk = AudioSegment.empty()

    for chunk in chunks:
        if len(current_chunk) + len(chunk) - overlap <= max_duration:
            current_chunk += chunk
        else:
            # The piece does not fit: emit what was built so far (if it is
            # long enough) and start a new chunk from this piece.
            if len(current_chunk) >= min_duration:
                output_chunks.append(current_chunk)
            current_chunk = chunk

    if len(current_chunk) >= min_duration:
        # The trailing chunk is long enough to stand on its own.
        output_chunks.append(current_chunk)
    elif output_chunks:
        # Too short on its own: attach it to the previous chunk.
        output_chunks[-1] += current_chunk

    # Trim the overlap from every chunk but the last. The emptiness check
    # matters: indexing [-1] on an empty list raised IndexError whenever no
    # chunk reached min_duration and overlap was positive.
    if overlap > 0 and output_chunks:
        output_chunks = [chunk[:-overlap] for chunk in output_chunks[:-1]] + [output_chunks[-1]]

    return output_chunks

def read_existing_uuids(csv_file_path):
    """
    Collect the UUIDs already recorded in the mapping CSV.

    The first row is treated as a header and skipped. The UUID is taken from
    the second column; rows with fewer than two columns are ignored.

    Args:
        csv_file_path (str): Path to the CSV file.

    Returns:
        set: The recorded UUIDs, or an empty set when the file does not exist.
    """
    # Nothing recorded yet if the CSV has not been created.
    if not os.path.isfile(csv_file_path):
        return set()

    with open(csv_file_path, mode='r', newline='') as handle:
        records = csv.reader(handle)
        next(records, None)  # Discard the header row, if any
        return {record[1] for record in records if len(record) >= 2}

def save_uuid_to_csv(file_path, filename, file_uuid):
    """
    Append one filename/UUID pair to the mapping CSV.

    When the CSV does not exist yet, a ``Filename,UUID`` header row is
    written before the pair.

    Args:
        file_path (str): Path to the CSV file.
        filename (str): The filename to save.
        file_uuid (str): The UUID to save.
    """
    rows = [[filename, file_uuid]]

    # The existence check must happen before opening: append mode creates
    # the file, after which it would always look like it already existed.
    if not os.path.isfile(file_path):
        rows.insert(0, ['Filename', 'UUID'])

    with open(file_path, mode='a', newline='') as handle:
        csv.writer(handle).writerows(rows)

def _read_processed_files(csv_file_path):
    """Return a dict mapping each filename recorded in the CSV to its UUID."""
    processed = {}
    if os.path.isfile(csv_file_path):
        with open(csv_file_path, mode='r', newline='') as csv_file:
            csv_reader = csv.reader(csv_file)
            next(csv_reader, None)  # Skip the header
            for row in csv_reader:
                if len(row) >= 2:
                    processed[row[0]] = row[1]
    return processed


def process_directory(input_directory, output_base_directory, csv_file_path):
    """
    Process all audio files in the input directory, chunking them and saving to UUID-named folders.

    Files whose name is already recorded in the CSV are skipped, so re-running
    the function only handles new files. A file is recorded in the CSV only
    after its chunks have been exported; a file that yields no chunks is left
    unrecorded and is attempted again on the next run.

    Args:
        input_directory (str): Path to the input directory containing audio files.
        output_base_directory (str): Path to the base directory for output.
        csv_file_path (str): Path to the CSV file for saving filename-UUID mappings.
    """
    # Filenames recorded by earlier runs. The check is keyed by filename
    # because a freshly generated UUID can never match a recorded one.
    processed = _read_processed_files(csv_file_path)

    # Create the "output_chunk" folder within the specified base directory
    base_output_directory = os.path.join(output_base_directory, 'output_chunk')
    os.makedirs(base_output_directory, exist_ok=True)

    # Iterate through all files in the input directory
    for filename in os.listdir(input_directory):
        input_file = os.path.join(input_directory, filename)

        # Skip if it's not a file
        if not os.path.isfile(input_file):
            continue

        if filename in processed:
            print(f"Skipping {filename} as it has already been processed with UUID {processed[filename]}.")
            continue

        # Chunk the audio file
        chunks = chunk_audio(input_file, min_duration=6000, max_duration=18000, target_dbfs=-40, keep_silence=500, overlap=1000)
        if not chunks:
            # Do not record the file, so it is not treated as processed.
            print(f"No chunks produced for {filename}; it will be retried on the next run.")
            continue

        # Generate a UUID for the input audio file and create its directory
        # inside the "output_chunk" folder
        file_uuid = str(uuid.uuid4())
        uuid_directory = os.path.join(base_output_directory, file_uuid)
        os.makedirs(uuid_directory, exist_ok=True)

        # Save chunks to the UUID-named directory within "output_chunk"
        for i, chunk in enumerate(chunks, start=1):
            chunk_filename = os.path.join(uuid_directory, f"chunk_{i}.mp3")
            chunk.export(chunk_filename, format="mp3")
            print(f"Saved {chunk_filename}, duration: {len(chunk) / 1000:.2f} seconds")

        # Record the mapping only once every chunk has been written
        save_uuid_to_csv(csv_file_path, filename, file_uuid)
        processed[filename] = file_uuid

        print(f"All chunks for {filename} have been saved in the directory: {uuid_directory}")
        print(f"File UUID has been saved in: {csv_file_path}")


# Script configuration: absolute paths specific to the machine this was written on.
# Directory whose files are chunked (entries that are not files are skipped).
input_directory = '/home/oem/wiseyak_backup/binit/output'
# Base directory; process_directory creates an "output_chunk" folder inside it.
output_base_directory = '/home/oem/wiseyak_backup/binit/'
# CSV file that stores the filename-to-UUID mapping.
csv_file_path = '/home/oem/wiseyak_backup/binit/file_uuids.csv'

# Process all files in the input directory
# NOTE: this call sits at module level, so it runs whenever the file is executed or imported.
process_directory(input_directory, output_base_directory, csv_file_path)
