In [None]:
from unsloth import FastModel
import torch

# Sequence length handed to the loader below.
max_seq_length = 2048

# Precision / quantisation switches, forwarded unchanged to Unsloth.
# With 4-bit loading off, the cell output reports "Switching to 16bit LoRA".
load_in_4bit = False
dtype = None  # presumably lets Unsloth pick the dtype itself — TODO confirm

# Load the OuteTTS checkpoint together with its tokenizer.
model, tokenizer = FastModel.from_pretrained(
    model_name="OuteAI/Llama-OuteTTS-1.0-1B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


In [None]:
# Wrap the loaded model with LoRA adapters on the attention query/value projections.
model = FastModel.get_peft_model(
    model,
    # Adapter shape
    r=128,
    lora_alpha=128,
    lora_dropout=0,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    # Adapter variants that are switched off here
    use_rslora=False,
    loftq_config=None,
    # Training-time behaviour
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

Unsloth: Making `model.base_model.model.model` require gradients


In [None]:
from datasets import load_dataset,Audio,Dataset
# Fetch the "train" split and declare the audio column with a 24 kHz sampling rate.
dataset = load_dataset("MrDragonFox/Elise", split="train").cast_column(
    "audio",
    Audio(sampling_rate=24000),
)


In [None]:
#@title Tokenisation Function

import torch
from tqdm import tqdm
import io
import tempfile
from datasets import Dataset
import sys
sys.path.append('OuteTTS')
import os
import dac
# V3 Imports
from outetts.version.v3.audio_processor import AudioProcessor
from outetts.version.v3.prompt_processor import PromptProcessor
from outetts.dac.interface import DacInterface
from outetts.models.config import ModelConfig # Need a dummy config for AudioProcessor
import whisper
from outetts.utils.preprocessing import text_normalizations
import soundfile as sf
import numpy as np

class DataCreationV3:
    """Turn (audio, transcript) rows into OuteTTS v3 training prompts.

    Per row: Whisper supplies word-level timings, ``AudioProcessor`` builds a
    speaker dict from the audio bytes, the transcript and those timings, and
    ``PromptProcessor`` renders that dict into the training prompt.
    """

    def __init__(
            self,
            model_tokenizer_path: str,
            whisper_model_name: str = "turbo",
            device: str = None
        ):
        """
        Args:
            model_tokenizer_path: Tokenizer path/ID handed to ``ModelConfig``
                and ``PromptProcessor``.
            whisper_model_name: Name passed to ``whisper.load_model``.
            device: Device string; when falsy, "cuda" is used if
                ``torch.cuda.is_available()`` and "cpu" otherwise.
        """
        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        # Create a dummy ModelConfig mainly for device and paths needed by AudioProcessor/DacInterface
        dummy_config = ModelConfig(
            tokenizer_path=model_tokenizer_path,
            device=self.device,
            audio_codec_path=None # Let AudioProcessor use default DAC path
        )
        self.audio_processor = AudioProcessor(config=dummy_config)
        self.prompt_processor = PromptProcessor(model_tokenizer_path)

        print(f"Loading Whisper model: {whisper_model_name} on {self.device}")
        self.whisper_model = whisper.load_model(whisper_model_name, device=self.device)
        print("Whisper model loaded.")

    @staticmethod
    def _extract_word_timings(whisper_result):
        """Flatten Whisper's per-segment words into ``[{'word', 'start', 'end'}, ...]``.

        Returns ``None`` when ``whisper_result`` is falsy or has no
        ``'segments'`` key; otherwise a (possibly empty) list. Words that are
        empty after stripping whitespace are dropped.
        """
        if not whisper_result or 'segments' not in whisper_result:
            return None

        words_with_timings = []
        for segment in whisper_result['segments']:
            if 'words' not in segment:
                continue
            for word_info in segment['words']:
                # Keep Whisper's casing/punctuation, only trim surrounding whitespace.
                cleaned_word = word_info['word'].strip()
                if cleaned_word:
                    words_with_timings.append({
                        'word': cleaned_word,
                        'start': float(word_info['start']),
                        'end': float(word_info['end'])
                    })
        return words_with_timings

    @staticmethod
    def _encode_wav_bytes(audio_array, sampling_rate) -> bytes:
        """Serialise samples to an in-memory float32 WAV file and return its bytes."""
        buffer = io.BytesIO()
        # np.asarray accepts ndarrays as well as plain sequences.
        samples = np.asarray(audio_array, dtype=np.float32)
        sf.write(buffer, samples, sampling_rate, format='WAV', subtype='FLOAT')
        return buffer.getvalue()

    def create_speaker_representation(self, audio_bytes: bytes, transcript: str):
        """
        Creates a v3-compatible speaker dictionary using Whisper and AudioProcessor.

        Args:
            audio_bytes: Encoded audio file content (written to a temp ``.wav``
                file for Whisper and passed as-is to ``AudioProcessor``).
            transcript: Reference text; it is run through
                ``text_normalizations`` before being stored in the speaker data.

        Returns:
            The object returned by ``AudioProcessor.create_speaker_from_dict``,
            or ``None`` when an input is missing, Whisper yields no usable word
            timings, or any exception is raised along the way (the error is
            printed, not re-raised).
        """
        if not audio_bytes or not transcript:
             print("Missing audio bytes or transcript in create_speaker_representation.")
             return None

        # Whisper needs a file path, so save bytes to a temporary file
        try:
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as tmp_audio_file:
                tmp_audio_file.write(audio_bytes)
                tmp_audio_file.flush() # Ensure data is on disk before Whisper opens the path

                # 1. Get word timings using Whisper
                whisper_result = self.whisper_model.transcribe(tmp_audio_file.name, word_timestamps=True)
                # The provided transcript is kept as the text; only the timings come from Whisper.
                normalized_transcript = text_normalizations(transcript)

                words_with_timings = self._extract_word_timings(whisper_result)
                if words_with_timings is None:
                    print(f"Whisper did not return segments/words for: {transcript[:50]}...")
                    return None # Indicate failure

                if not words_with_timings:
                    print(f"No word timings extracted by Whisper for: {transcript[:50]}...")
                    return None

                # Prepare data dict for AudioProcessor
                speaker_data_dict = {
                    "audio": {"bytes": audio_bytes},
                    "text": normalized_transcript,
                    "words": words_with_timings
                }

                # 2. Use AudioProcessor to create the speaker representation
                return self.audio_processor.create_speaker_from_dict(speaker_data_dict)

        except Exception as e:
            print(f"Error during speaker creation (Whisper/AudioProcessor): {e}")
            return None # Indicate failure

    def process_dataset(self, dataset: "Dataset"):
        """
        Processes a Hugging Face Dataset object in memory and yields training prompts.

        Rows that cannot be processed are reported and skipped; a row-level
        error never terminates the generator. A ``KeyboardInterrupt`` raised
        while a row is being processed stops the iteration early. A summary
        line with processed/skipped counts is printed when the loop ends.

        Args:
            dataset (Dataset): The Hugging Face dataset to process.
                               Expected columns: 'text' (str) and 'audio'
                               (indexable by 'array' and 'sampling_rate').

        Yields:
            The value returned by ``PromptProcessor.get_training_prompt`` for
            each row that was processed successfully.
        """
        # Function-scope import: only needed by the error handler below.
        import traceback

        processed_count = 0
        skipped_count = 0

        # Iterate directly over the dataset
        for i, item in enumerate(tqdm(dataset, desc="Processing Dataset")):
            try:
                # --- Adapt to your dataset's column names ---
                transcript = item.get('text')
                audio_info = item.get('audio')
                # --- End Adapt ---

                if not transcript or not isinstance(transcript, str):
                    print(f"Row {i}: Skipping due to missing or invalid 'text' column.")
                    skipped_count += 1
                    continue

                if audio_info is None:
                    print(f"Row {i}: Skipping due to missing 'audio' column.")
                    skipped_count += 1
                    continue

                audio_bytes = self._encode_wav_bytes(audio_info['array'], audio_info['sampling_rate'])

                # Create speaker representation
                speaker = self.create_speaker_representation(audio_bytes, transcript)

                if speaker is None:
                    print(f"Row {i}: Failed to create speaker representation for text: {transcript[:50]}... Skipping.")
                    skipped_count += 1
                    continue

                # Get the V3 training prompt
                prompt = self.prompt_processor.get_training_prompt(speaker)

            except KeyboardInterrupt:
                 print("Processing interrupted by user.")
                 break
            except Exception as e:
                # print() has no `exc_info` keyword (that belongs to logging);
                # passing it raised TypeError inside this handler and aborted
                # the whole run. Print the message, then the traceback.
                print(f"Row {i}: Unhandled error processing item: {e}")
                traceback.print_exc()
                skipped_count += 1
                continue

            # Yield outside the try so exceptions thrown into the generator by
            # the consumer are not mistaken for a row failure.
            processed_count += 1
            yield prompt

        print(f"Dataset processing finished. Processed: {processed_count}, Skipped: {skipped_count}")

if __name__ == "__main__":

    _MODEL_TOKENIZER_PATH = "OuteAI/Llama-OuteTTS-1.0-1B"
    _WHISPER_MODEL = "turbo" # Or "small.en", "medium.en", "large-v2", etc.

    # This cell replaces `dataset` with the prompt-only dataset at the end.
    # Running it a second time would feed rows without audio back in, so fail
    # early with a clear message instead of silently producing an empty dataset.
    _column_names = getattr(dataset, "column_names", None)
    if _column_names is not None and "audio" not in _column_names:
        raise RuntimeError(
            "`dataset` has no 'audio' column - it looks like it was already "
            "converted to prompts. Re-run the dataset loading cell first."
        )

    data_processor = DataCreationV3(
        model_tokenizer_path=_MODEL_TOKENIZER_PATH,
        whisper_model_name=_WHISPER_MODEL
    )

    # Process the dataset and collect prompts (or process iteratively)
    all_prompts = []
    print("Starting dataset processing...")
    try:
        procced_dataset = data_processor.process_dataset(dataset)
        for prompt in procced_dataset:
            if prompt:
                 all_prompts.append({'text': prompt})
    finally:
        # Move Whisper off the GPU even when processing is interrupted or fails.
        print("Moving Whisper model to CPU")
        data_processor.whisper_model.to('cpu')
        torch.cuda.empty_cache()

    if not all_prompts:
        raise RuntimeError("No training prompts were produced; check the skip messages above.")

    # From here on `dataset` holds one {'text': prompt} row per processed sample.
    dataset = Dataset.from_list(all_prompts)


Using device: cuda


weights_24khz_1.5kbps_v1.0.pth:   0%|          | 0.00/296M [00:00<?, ?B/s]

  WeightNorm.apply(module, name, dim)


Loading Whisper model: turbo on cuda


100%|█████████████████████████████████████| 1.51G/1.51G [00:22<00:00, 73.4MiB/s]


Whisper model loaded.
Starting dataset processing...



  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [00:01<00:03,  1.56s/it][A
100%|██████████| 3/3 [00:01<00:00,  1.63it/s]

100%|██████████| 3/3 [00:00<00:00, 99.34it/s]

100%|██████████| 2/2 [00:00<00:00, 50.44it/s]
Processing Dataset:   0%|          | 3/1195 [00:13<1:09:57,  3.52s/it]
100%|██████████| 2/2 [00:00<00:00, 24.42it/s]
Processing Dataset:   0%|          | 4/1195 [00:14<52:52,  2.66s/it]  
100%|██████████| 2/2 [00:00<00:00, 51.09it/s]

100%|██████████| 2/2 [00:00<00:00, 52.35it/s]

100%|██████████| 2/2 [00:00<00:00, 33.26it/s]
Processing Dataset:   1%|          | 7/1195 [00:20<41:09,  2.08s/it]
100%|██████████| 2/2 [00:00<00:00, 52.53it/s]
Processing Dataset:   1%|          | 8/1195 [00:21<38:19,  1.94s/it]
100%|██████████| 2/2 [00:00<00:00, 38.22it/s]

100%|██████████| 2/2 [00:00<00:00, 52.67it/s]
Processing Dataset:   1%|          | 10/1195 [00:23<29:13,  1.48s/it]
100%|██████████| 2/2 [00:00<00:00, 20.56it/s]
Processing Dataset:   1%|          | 11/1195 [00

Row 820: Skipping due to missing or invalid 'text' column.



100%|██████████| 2/2 [00:00<00:00, 37.86it/s]
Processing Dataset:  69%|██████▉   | 822/1195 [16:49<06:48,  1.10s/it]
100%|██████████| 3/3 [00:00<00:00, 83.24it/s]
Processing Dataset:  69%|██████▉   | 823/1195 [16:51<07:17,  1.18s/it]
100%|██████████| 1/1 [00:00<00:00, 79.80it/s]
Processing Dataset:  69%|██████▉   | 824/1195 [16:52<07:03,  1.14s/it]
100%|██████████| 2/2 [00:00<00:00, 82.46it/s]

100%|██████████| 3/3 [00:00<00:00, 86.50it/s]

100%|██████████| 2/2 [00:00<00:00, 79.03it/s]

100%|██████████| 3/3 [00:00<00:00, 84.50it/s]
Processing Dataset:  69%|██████▉   | 828/1195 [16:56<07:20,  1.20s/it]
100%|██████████| 3/3 [00:00<00:00, 86.28it/s]
Processing Dataset:  69%|██████▉   | 829/1195 [16:58<07:17,  1.19s/it]
100%|██████████| 2/2 [00:00<00:00, 82.93it/s]
Processing Dataset:  69%|██████▉   | 830/1195 [16:59<07:14,  1.19s/it]
100%|██████████| 2/2 [00:00<00:00, 34.84it/s]
Processing Dataset:  70%|██████▉   | 831/1195 [17:00<07:14,  1.19s/it]
100%|██████████| 2/2 [00:00<00:00, 55.8

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 918: Failed to create speaker representation for text: Not someone, not a- not a new version of you that'... Skipping.



100%|██████████| 2/2 [00:00<00:00, 77.12it/s]

100%|██████████| 2/2 [00:00<00:00, 82.52it/s]
Processing Dataset:  77%|███████▋  | 921/1195 [18:47<05:34,  1.22s/it]
100%|██████████| 2/2 [00:00<00:00, 83.28it/s]
Processing Dataset:  77%|███████▋  | 922/1195 [18:48<05:28,  1.20s/it]
100%|██████████| 1/1 [00:00<00:00, 78.82it/s]
Processing Dataset:  77%|███████▋  | 923/1195 [18:49<05:10,  1.14s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 922: Failed to create speaker representation for text: Why you stay close to me whenever we go to the mal... Skipping.



100%|██████████| 2/2 [00:00<00:00, 53.23it/s]

100%|██████████| 1/1 [00:00<00:00, 63.92it/s]
Processing Dataset:  77%|███████▋  | 925/1195 [18:51<05:09,  1.15s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 924: Failed to create speaker representation for text: Oh, I can? I can come sit in the bathroom while yo... Skipping.



100%|██████████| 3/3 [00:00<00:00, 87.71it/s]

100%|██████████| 2/2 [00:00<00:00, 47.05it/s]
Processing Dataset:  78%|███████▊  | 927/1195 [18:54<05:22,  1.20s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 926: Failed to create speaker representation for text: I, you just... I don't know. Y- you seemed so extr... Skipping.



100%|██████████| 2/2 [00:00<00:00, 84.95it/s]

100%|██████████| 2/2 [00:00<00:00, 83.14it/s]
Processing Dataset:  78%|███████▊  | 929/1195 [18:56<05:13,  1.18s/it]
100%|██████████| 2/2 [00:00<00:00, 89.80it/s]
Processing Dataset:  78%|███████▊  | 930/1195 [18:57<05:11,  1.17s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 929: Failed to create speaker representation for text: <sighs> Look, hiding my hoodie behind you doesn't ... Skipping.



100%|██████████| 2/2 [00:00<00:00, 84.52it/s]
Processing Dataset:  78%|███████▊  | 931/1195 [18:58<05:00,  1.14s/it]
100%|██████████| 3/3 [00:00<00:00, 73.46it/s]
Processing Dataset:  78%|███████▊  | 932/1195 [19:00<05:11,  1.18s/it]
100%|██████████| 2/2 [00:00<00:00, 83.12it/s]
Processing Dataset:  78%|███████▊  | 933/1195 [19:01<05:04,  1.16s/it]
100%|██████████| 2/2 [00:00<00:00, 33.37it/s]
Processing Dataset:  78%|███████▊  | 934/1195 [19:02<04:54,  1.13s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 933: Failed to create speaker representation for text: Are you feeling a little better now? <sighs> I'm r... Skipping.



100%|██████████| 2/2 [00:00<00:00, 63.39it/s]
Processing Dataset:  78%|███████▊  | 935/1195 [19:03<05:03,  1.17s/it]
100%|██████████| 2/2 [00:00<00:00, 82.22it/s]
Processing Dataset:  78%|███████▊  | 936/1195 [19:04<05:07,  1.19s/it]
100%|██████████| 2/2 [00:00<00:00, 83.62it/s]
Processing Dataset:  78%|███████▊  | 937/1195 [19:05<04:56,  1.15s/it]
100%|██████████| 2/2 [00:00<00:00, 82.71it/s]

100%|██████████| 2/2 [00:00<00:00, 67.13it/s]

100%|██████████| 3/3 [00:00<00:00, 88.65it/s]
Processing Dataset:  79%|███████▊  | 940/1195 [19:09<05:23,  1.27s/it]
100%|██████████| 2/2 [00:00<00:00, 87.92it/s]
Processing Dataset:  79%|███████▊  | 941/1195 [19:11<05:12,  1.23s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 940: Failed to create speaker representation for text: Well, a fair bit longer. In fact, I might go to sl... Skipping.



100%|██████████| 2/2 [00:00<00:00, 49.52it/s]
Processing Dataset:  79%|███████▉  | 942/1195 [19:12<04:57,  1.18s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 941: Failed to create speaker representation for text: I love you so, so much and I will never give you u... Skipping.



100%|██████████| 3/3 [00:00<00:00, 86.59it/s]
Processing Dataset:  79%|███████▉  | 943/1195 [19:13<04:55,  1.17s/it]
100%|██████████| 1/1 [00:00<00:00, 31.92it/s]
Processing Dataset:  79%|███████▉  | 944/1195 [19:14<04:42,  1.12s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 943: Failed to create speaker representation for text: That's— <sighs> fine. I will do this one time.... Skipping.



100%|██████████| 3/3 [00:00<00:00, 51.60it/s]

100%|██████████| 3/3 [00:00<00:00, 82.51it/s]

100%|██████████| 2/2 [00:00<00:00, 45.66it/s]
Processing Dataset:  79%|███████▉  | 947/1195 [19:18<05:02,  1.22s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 946: Failed to create speaker representation for text: Hey, can I tell you something?... Skipping.



100%|██████████| 3/3 [00:00<00:00, 79.52it/s]
Processing Dataset:  79%|███████▉  | 948/1195 [19:19<05:03,  1.23s/it]
100%|██████████| 2/2 [00:00<00:00, 85.73it/s]
Processing Dataset:  79%|███████▉  | 949/1195 [19:20<04:51,  1.19s/it]
100%|██████████| 2/2 [00:00<00:00, 73.10it/s]
Processing Dataset:  79%|███████▉  | 950/1195 [19:21<04:44,  1.16s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 949: Failed to create speaker representation for text: What's that, cutie? Oh, you want to lie on my ches... Skipping.



100%|██████████| 2/2 [00:00<00:00, 76.64it/s]
Processing Dataset:  80%|███████▉  | 951/1195 [19:22<04:45,  1.17s/it]
100%|██████████| 2/2 [00:00<00:00, 84.48it/s]

100%|██████████| 2/2 [00:00<00:00, 76.81it/s]

100%|██████████| 2/2 [00:00<00:00, 76.98it/s]

100%|██████████| 2/2 [00:00<00:00, 73.93it/s]

100%|██████████| 2/2 [00:00<00:00, 78.03it/s]
Processing Dataset:  80%|████████  | 956/1195 [19:28<04:52,  1.23s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 955: Failed to create speaker representation for text: So just tell me, what is it? I see. You're...you'r... Skipping.



100%|██████████| 2/2 [00:00<00:00, 82.01it/s]
Processing Dataset:  80%|████████  | 957/1195 [19:30<04:49,  1.22s/it]
100%|██████████| 3/3 [00:00<00:00, 85.16it/s]

100%|██████████| 3/3 [00:00<00:00, 87.47it/s]

100%|██████████| 2/2 [00:00<00:00, 69.02it/s]

100%|██████████| 2/2 [00:00<00:00, 73.47it/s]
Processing Dataset:  80%|████████  | 961/1195 [19:34<04:35,  1.18s/it]
100%|██████████| 2/2 [00:00<00:00, 85.30it/s]
Processing Dataset:  81%|████████  | 962/1195 [19:36<04:33,  1.17s/it]
100%|██████████| 3/3 [00:00<00:00, 80.66it/s]
Processing Dataset:  81%|████████  | 963/1195 [19:37<04:33,  1.18s/it]
100%|██████████| 2/2 [00:00<00:00, 83.33it/s]
Processing Dataset:  81%|████████  | 964/1195 [19:38<04:29,  1.17s/it]
100%|██████████| 2/2 [00:00<00:00, 77.41it/s]
Processing Dataset:  81%|████████  | 965/1195 [19:39<04:27,  1.16s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 964: Failed to create speaker representation for text: Did you now? A dream about me? Well, how about thi... Skipping.



100%|██████████| 3/3 [00:00<00:00, 59.09it/s]
Processing Dataset:  81%|████████  | 966/1195 [19:40<04:43,  1.24s/it]
100%|██████████| 2/2 [00:00<00:00, 81.50it/s]
Processing Dataset:  81%|████████  | 967/1195 [19:42<04:44,  1.25s/it]
100%|██████████| 3/3 [00:00<00:00, 96.90it/s]

100%|██████████| 1/1 [00:00<00:00, 67.72it/s]

100%|██████████| 2/2 [00:00<00:00, 80.67it/s]
Processing Dataset:  81%|████████  | 970/1195 [19:45<04:22,  1.17s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 969: Failed to create speaker representation for text: <laughs> And besides, later we can take these clot... Skipping.



100%|██████████| 2/2 [00:00<00:00, 83.76it/s]
Processing Dataset:  81%|████████▏ | 971/1195 [19:46<04:26,  1.19s/it]
100%|██████████| 1/1 [00:00<00:00, 66.33it/s]
Processing Dataset:  81%|████████▏ | 972/1195 [19:47<04:15,  1.15s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 971: Failed to create speaker representation for text: Hey there, baby. I was starting to get worried. Um... Skipping.



100%|██████████| 2/2 [00:00<00:00, 86.63it/s]
Processing Dataset:  81%|████████▏ | 973/1195 [19:49<04:18,  1.16s/it]
100%|██████████| 2/2 [00:00<00:00, 81.38it/s]
Processing Dataset:  82%|████████▏ | 974/1195 [19:50<04:24,  1.20s/it]
100%|██████████| 2/2 [00:00<00:00, 36.20it/s]
Processing Dataset:  82%|████████▏ | 975/1195 [19:51<04:30,  1.23s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 974: Failed to create speaker representation for text: No, well let's just say if you ended up here witho... Skipping.



100%|██████████| 2/2 [00:00<00:00, 71.75it/s]

100%|██████████| 2/2 [00:00<00:00, 77.94it/s]
Processing Dataset:  82%|████████▏ | 977/1195 [19:54<04:26,  1.22s/it]
100%|██████████| 2/2 [00:00<00:00, 47.77it/s]
Processing Dataset:  82%|████████▏ | 978/1195 [19:55<04:22,  1.21s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 977: Failed to create speaker representation for text: Yes, I know it tickles, but you look so cute when ... Skipping.



100%|██████████| 2/2 [00:00<00:00, 77.51it/s]
Processing Dataset:  82%|████████▏ | 979/1195 [19:56<04:20,  1.21s/it]
100%|██████████| 2/2 [00:00<00:00, 67.70it/s]

100%|██████████| 2/2 [00:00<00:00, 78.40it/s]
Processing Dataset:  82%|████████▏ | 981/1195 [19:58<04:13,  1.19s/it]
100%|██████████| 2/2 [00:00<00:00, 84.98it/s]
Processing Dataset:  82%|████████▏ | 982/1195 [20:00<04:13,  1.19s/it]
100%|██████████| 2/2 [00:00<00:00, 84.65it/s]
Processing Dataset:  82%|████████▏ | 983/1195 [20:01<04:05,  1.16s/it]
100%|██████████| 2/2 [00:00<00:00, 62.78it/s]
Processing Dataset:  82%|████████▏ | 984/1195 [20:02<04:07,  1.17s/it]
100%|██████████| 2/2 [00:00<00:00, 62.22it/s]
Processing Dataset:  82%|████████▏ | 985/1195 [20:03<04:12,  1.20s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 984: Failed to create speaker representation for text: And I know, I know that you know people have said ... Skipping.



100%|██████████| 2/2 [00:00<00:00, 49.31it/s]
Processing Dataset:  83%|████████▎ | 986/1195 [20:05<04:18,  1.24s/it]
100%|██████████| 2/2 [00:00<00:00, 82.92it/s]
Processing Dataset:  83%|████████▎ | 987/1195 [20:06<04:15,  1.23s/it]
100%|██████████| 2/2 [00:00<00:00, 84.70it/s]
Processing Dataset:  83%|████████▎ | 988/1195 [20:07<04:10,  1.21s/it]
100%|██████████| 2/2 [00:00<00:00, 86.00it/s]

100%|██████████| 3/3 [00:00<00:00, 64.82it/s]

100%|██████████| 2/2 [00:00<00:00, 75.20it/s]
Processing Dataset:  83%|████████▎ | 991/1195 [20:10<03:58,  1.17s/it]
100%|██████████| 1/1 [00:00<00:00, 63.78it/s]
Processing Dataset:  83%|████████▎ | 992/1195 [20:11<03:49,  1.13s/it]
100%|██████████| 2/2 [00:00<00:00, 77.64it/s]
Processing Dataset:  83%|████████▎ | 993/1195 [20:13<03:45,  1.12s/it]
100%|██████████| 2/2 [00:00<00:00, 81.34it/s]
Processing Dataset:  83%|████████▎ | 994/1195 [20:14<03:53,  1.16s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 993: Failed to create speaker representation for text: ...back of a taxi, and the film was nearly cancele... Skipping.



100%|██████████| 2/2 [00:00<00:00, 67.03it/s]
Processing Dataset:  83%|████████▎ | 995/1195 [20:15<03:55,  1.18s/it]
100%|██████████| 2/2 [00:00<00:00, 74.98it/s]

100%|██████████| 2/2 [00:00<00:00, 65.18it/s]

100%|██████████| 2/2 [00:00<00:00, 77.52it/s]
Processing Dataset:  84%|████████▎ | 998/1195 [20:19<03:57,  1.20s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 997: Failed to create speaker representation for text: <sighs> Just like a statue. <giggles> Ah, but you ... Skipping.



100%|██████████| 2/2 [00:00<00:00, 80.43it/s]
Processing Dataset:  84%|████████▎ | 999/1195 [20:20<03:48,  1.17s/it]
100%|██████████| 2/2 [00:00<00:00, 81.39it/s]
Processing Dataset:  84%|████████▎ | 1000/1195 [20:21<03:49,  1.18s/it]
100%|██████████| 2/2 [00:00<00:00, 81.88it/s]

100%|██████████| 3/3 [00:00<00:00, 83.84it/s]
Processing Dataset:  84%|████████▍ | 1002/1195 [20:23<03:40,  1.14s/it]
100%|██████████| 2/2 [00:00<00:00, 82.60it/s]
Processing Dataset:  84%|████████▍ | 1003/1195 [20:24<03:35,  1.12s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1002: Failed to create speaker representation for text: Thank you. Now, doctor said it should only just be... Skipping.



100%|██████████| 3/3 [00:00<00:00, 78.13it/s]

100%|██████████| 2/2 [00:00<00:00, 47.65it/s]
Processing Dataset:  84%|████████▍ | 1005/1195 [20:27<03:36,  1.14s/it]
100%|██████████| 2/2 [00:00<00:00, 69.33it/s]
Processing Dataset:  84%|████████▍ | 1006/1195 [20:28<03:41,  1.17s/it]
100%|██████████| 2/2 [00:00<00:00, 61.98it/s]

100%|██████████| 2/2 [00:00<00:00, 82.04it/s]
Processing Dataset:  84%|████████▍ | 1008/1195 [20:30<03:47,  1.21s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1007: Failed to create speaker representation for text: Yes, lamia, like snakes, do shed their skin. And w... Skipping.



100%|██████████| 2/2 [00:00<00:00, 47.36it/s]

100%|██████████| 2/2 [00:00<00:00, 44.22it/s]
Processing Dataset:  85%|████████▍ | 1010/1195 [20:33<03:35,  1.16s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1009: Failed to create speaker representation for text: Here. <moans> Ahh, mmm.... Skipping.



100%|██████████| 2/2 [00:00<00:00, 72.11it/s]
Processing Dataset:  85%|████████▍ | 1011/1195 [20:34<03:28,  1.14s/it]
100%|██████████| 2/2 [00:00<00:00, 83.88it/s]

100%|██████████| 2/2 [00:00<00:00, 78.69it/s]
Processing Dataset:  85%|████████▍ | 1013/1195 [20:36<03:23,  1.12s/it]
100%|██████████| 2/2 [00:00<00:00, 85.02it/s]
Processing Dataset:  85%|████████▍ | 1014/1195 [20:37<03:26,  1.14s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1013: Failed to create speaker representation for text: Okay, well...I'm sorry for bringing this up but, u... Skipping.



100%|██████████| 1/1 [00:00<00:00, 62.92it/s]

100%|██████████| 2/2 [00:00<00:00, 65.01it/s]

100%|██████████| 2/2 [00:00<00:00, 51.43it/s]
Processing Dataset:  85%|████████▌ | 1017/1195 [20:41<03:29,  1.18s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1016: Failed to create speaker representation for text: I mean, it is a romantic dinner and while the lady... Skipping.



100%|██████████| 2/2 [00:00<00:00, 85.17it/s]
Processing Dataset:  85%|████████▌ | 1018/1195 [20:42<03:36,  1.22s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1017: Failed to create speaker representation for text: Mm-hmm. Thank you. I love you. Can, can we maybe d... Skipping.



100%|██████████| 2/2 [00:00<00:00, 82.23it/s]
Processing Dataset:  85%|████████▌ | 1019/1195 [20:43<03:30,  1.20s/it]
100%|██████████| 2/2 [00:00<00:00, 75.42it/s]
Processing Dataset:  85%|████████▌ | 1020/1195 [20:44<03:22,  1.16s/it]
100%|██████████| 3/3 [00:00<00:00, 87.43it/s]
Processing Dataset:  85%|████████▌ | 1021/1195 [20:46<03:27,  1.19s/it]
100%|██████████| 2/2 [00:00<00:00, 53.79it/s]

100%|██████████| 2/2 [00:00<00:00, 71.01it/s]
Processing Dataset:  86%|████████▌ | 1023/1195 [20:48<03:19,  1.16s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1022: Failed to create speaker representation for text: <sighs> Along with all of my powers. Oh, and don't... Skipping.



100%|██████████| 2/2 [00:00<00:00, 69.13it/s]
Processing Dataset:  86%|████████▌ | 1024/1195 [20:49<03:17,  1.15s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1023: Failed to create speaker representation for text: All right? Yeah, I know the perfect flavor for you... Skipping.



100%|██████████| 3/3 [00:00<00:00, 84.36it/s]

100%|██████████| 2/2 [00:00<00:00, 70.92it/s]
Processing Dataset:  86%|████████▌ | 1026/1195 [20:51<03:15,  1.16s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1025: Failed to create speaker representation for text: So, what happened with Bethany? It was an accident... Skipping.



100%|██████████| 2/2 [00:00<00:00, 72.46it/s]
Processing Dataset:  86%|████████▌ | 1027/1195 [20:53<03:18,  1.18s/it]
100%|██████████| 2/2 [00:00<00:00, 54.92it/s]
Processing Dataset:  86%|████████▌ | 1028/1195 [20:54<03:16,  1.18s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1027: Failed to create speaker representation for text: <sighs> If- if you only knew how much I cared for ... Skipping.



100%|██████████| 1/1 [00:00<00:00, 67.33it/s]
Processing Dataset:  86%|████████▌ | 1029/1195 [20:55<03:05,  1.12s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1028: Failed to create speaker representation for text: Forged into the shape of the old Necrontyr gods.... Skipping.



100%|██████████| 2/2 [00:00<00:00, 73.12it/s]

100%|██████████| 2/2 [00:00<00:00, 75.02it/s]
Processing Dataset:  86%|████████▋ | 1031/1195 [20:57<03:15,  1.19s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1030: Failed to create speaker representation for text: <sighs> I know you're working, but y-you can take ... Skipping.



100%|██████████| 2/2 [00:00<00:00, 61.53it/s]

100%|██████████| 2/2 [00:00<00:00, 60.19it/s]
Processing Dataset:  86%|████████▋ | 1033/1195 [21:00<03:12,  1.19s/it]
100%|██████████| 2/2 [00:00<00:00, 72.39it/s]
Processing Dataset:  87%|████████▋ | 1034/1195 [21:01<03:10,  1.18s/it]
100%|██████████| 2/2 [00:00<00:00, 76.74it/s]

100%|██████████| 2/2 [00:00<00:00, 72.41it/s]
Processing Dataset:  87%|████████▋ | 1036/1195 [21:03<03:01,  1.14s/it]
100%|██████████| 2/2 [00:00<00:00, 49.23it/s]
Processing Dataset:  87%|████████▋ | 1037/1195 [21:04<03:05,  1.18s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1036: Failed to create speaker representation for text: I've been looking after you since you were born, m... Skipping.



100%|██████████| 2/2 [00:00<00:00, 67.87it/s]

100%|██████████| 2/2 [00:00<00:00, 71.10it/s]
Processing Dataset:  87%|████████▋ | 1039/1195 [21:07<03:08,  1.21s/it]
100%|██████████| 2/2 [00:00<00:00, 76.37it/s]
Processing Dataset:  87%|████████▋ | 1040/1195 [21:08<03:05,  1.19s/it]
100%|██████████| 2/2 [00:00<00:00, 72.54it/s]
Processing Dataset:  87%|████████▋ | 1041/1195 [21:09<03:05,  1.21s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1040: Failed to create speaker representation for text: So do not be surprised if she has a total mood shi... Skipping.



100%|██████████| 2/2 [00:00<00:00, 75.35it/s]
Processing Dataset:  87%|████████▋ | 1042/1195 [21:10<03:01,  1.19s/it]
100%|██████████| 2/2 [00:00<00:00, 76.91it/s]
Processing Dataset:  87%|████████▋ | 1043/1195 [21:11<02:59,  1.18s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1042: Failed to create speaker representation for text: I want to be someone that you can depend on. Okay?... Skipping.



100%|██████████| 2/2 [00:00<00:00, 76.83it/s]
Processing Dataset:  87%|████████▋ | 1044/1195 [21:12<02:54,  1.15s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1043: Failed to create speaker representation for text: Well, no. I mean, just so many things that you do ... Skipping.



100%|██████████| 2/2 [00:00<00:00, 68.51it/s]

100%|██████████| 3/3 [00:00<00:00, 73.78it/s]
Processing Dataset:  88%|████████▊ | 1046/1195 [21:15<03:03,  1.23s/it]
100%|██████████| 2/2 [00:00<00:00, 62.13it/s]
Processing Dataset:  88%|████████▊ | 1047/1195 [21:16<03:03,  1.24s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1046: Failed to create speaker representation for text: Okay, and the...Rosamari? Who the hell is Rosamari... Skipping.



100%|██████████| 2/2 [00:00<00:00, 43.56it/s]
Processing Dataset:  88%|████████▊ | 1048/1195 [21:18<03:06,  1.27s/it]
100%|██████████| 1/1 [00:00<00:00, 52.67it/s]
Processing Dataset:  88%|████████▊ | 1049/1195 [21:19<03:00,  1.24s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1048: Failed to create speaker representation for text: <moans>... Skipping.



100%|██████████| 2/2 [00:00<00:00, 52.77it/s]
Processing Dataset:  88%|████████▊ | 1050/1195 [21:20<02:50,  1.18s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1049: Failed to create speaker representation for text: ...anything. It doesn't matter how old you get, li... Skipping.



100%|██████████| 2/2 [00:00<00:00, 70.39it/s]
Processing Dataset:  88%|████████▊ | 1051/1195 [21:21<02:42,  1.13s/it]
100%|██████████| 2/2 [00:00<00:00, 86.93it/s]
Processing Dataset:  88%|████████▊ | 1052/1195 [21:22<02:47,  1.17s/it]
100%|██████████| 2/2 [00:00<00:00, 71.22it/s]
Processing Dataset:  88%|████████▊ | 1053/1195 [21:23<02:45,  1.17s/it]
100%|██████████| 2/2 [00:00<00:00, 75.98it/s]
Processing Dataset:  88%|████████▊ | 1054/1195 [21:24<02:43,  1.16s/it]
100%|██████████| 2/2 [00:00<00:00, 76.15it/s]

100%|██████████| 3/3 [00:00<00:00, 76.54it/s]

100%|██████████| 2/2 [00:00<00:00, 65.16it/s]
Processing Dataset:  88%|████████▊ | 1057/1195 [21:28<02:46,  1.20s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1056: Failed to create speaker representation for text: Alrighty, up, up and away we go. Huh? What? Aw, di... Skipping.



100%|██████████| 2/2 [00:00<00:00, 63.35it/s]
Processing Dataset:  89%|████████▊ | 1058/1195 [21:29<02:42,  1.18s/it]
100%|██████████| 2/2 [00:00<00:00, 64.87it/s]
Processing Dataset:  89%|████████▊ | 1059/1195 [21:31<02:47,  1.23s/it]
100%|██████████| 3/3 [00:00<00:00, 80.12it/s]

100%|██████████| 2/2 [00:00<00:00, 68.56it/s]
Processing Dataset:  89%|████████▉ | 1061/1195 [21:33<02:45,  1.23s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1060: Failed to create speaker representation for text: Not quite yet <laughs>. Ah, this is so great. Than... Skipping.



100%|██████████| 2/2 [00:00<00:00, 73.87it/s]
Processing Dataset:  89%|████████▉ | 1062/1195 [21:34<02:44,  1.24s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1061: Failed to create speaker representation for text: And when I'm done brushing your hair, I can even p... Skipping.



100%|██████████| 2/2 [00:00<00:00, 64.09it/s]
Processing Dataset:  89%|████████▉ | 1063/1195 [21:35<02:40,  1.21s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1062: Failed to create speaker representation for text: I love all your sides, every single one. And I wou... Skipping.



100%|██████████| 2/2 [00:00<00:00, 41.24it/s]
Processing Dataset:  89%|████████▉ | 1064/1195 [21:37<02:36,  1.19s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1063: Failed to create speaker representation for text: So, cutie, now that you're mine, I think it's time... Skipping.



100%|██████████| 2/2 [00:00<00:00, 72.48it/s]
Processing Dataset:  89%|████████▉ | 1065/1195 [21:38<02:33,  1.18s/it]
100%|██████████| 1/1 [00:00<00:00, 27.96it/s]
Processing Dataset:  89%|████████▉ | 1066/1195 [21:39<02:24,  1.12s/it]
100%|██████████| 3/3 [00:00<00:00, 85.83it/s]
Processing Dataset:  89%|████████▉ | 1067/1195 [21:40<02:31,  1.18s/it]
100%|██████████| 2/2 [00:00<00:00, 57.83it/s]

100%|██████████| 2/2 [00:00<00:00, 70.52it/s]
Processing Dataset:  89%|████████▉ | 1069/1195 [21:43<02:44,  1.30s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1068: Failed to create speaker representation for text: Yeah? Mm-hmm. We really love to swim. I really wis... Skipping.



100%|██████████| 2/2 [00:00<00:00, 75.39it/s]
Processing Dataset:  90%|████████▉ | 1070/1195 [21:44<02:35,  1.24s/it]
100%|██████████| 3/3 [00:00<00:00, 80.43it/s]

100%|██████████| 2/2 [00:00<00:00, 74.46it/s]
Processing Dataset:  90%|████████▉ | 1072/1195 [21:46<02:33,  1.25s/it]
100%|██████████| 2/2 [00:00<00:00, 76.56it/s]
Processing Dataset:  90%|████████▉ | 1073/1195 [21:48<02:27,  1.21s/it]
100%|██████████| 3/3 [00:00<00:00, 87.22it/s]
Processing Dataset:  90%|████████▉ | 1074/1195 [21:49<02:27,  1.22s/it]
100%|██████████| 2/2 [00:00<00:00, 32.76it/s]

100%|██████████| 2/2 [00:00<00:00, 54.11it/s]

100%|██████████| 2/2 [00:00<00:00, 83.06it/s]
Processing Dataset:  90%|█████████ | 1077/1195 [21:52<02:16,  1.16s/it]
100%|██████████| 2/2 [00:00<00:00, 65.76it/s]

100%|██████████| 2/2 [00:00<00:00, 52.30it/s]
Processing Dataset:  90%|█████████ | 1079/1195 [21:55<02:19,  1.20s/it]
100%|██████████| 2/2 [00:00<00:00, 72.42it/s]
Processing Dataset:  90%|█████████ | 1080/1195 [21:56<02:

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1079: Failed to create speaker representation for text: All right, ready? All right, so we're just gonna s... Skipping.



100%|██████████| 2/2 [00:00<00:00, 77.76it/s]
Processing Dataset:  90%|█████████ | 1081/1195 [21:57<02:09,  1.13s/it]
100%|██████████| 3/3 [00:00<00:00, 86.56it/s]
Processing Dataset:  91%|█████████ | 1082/1195 [21:58<02:14,  1.19s/it]
100%|██████████| 2/2 [00:00<00:00, 69.23it/s]

100%|██████████| 2/2 [00:00<00:00, 70.96it/s]

100%|██████████| 2/2 [00:00<00:00, 56.82it/s]
Processing Dataset:  91%|█████████ | 1085/1195 [22:01<02:06,  1.15s/it]
100%|██████████| 2/2 [00:00<00:00, 77.55it/s]
Processing Dataset:  91%|█████████ | 1086/1195 [22:03<02:07,  1.17s/it]
100%|██████████| 2/2 [00:00<00:00, 76.27it/s]

100%|██████████| 2/2 [00:00<00:00, 63.77it/s]

100%|██████████| 2/2 [00:00<00:00, 68.26it/s]
Processing Dataset:  91%|█████████ | 1089/1195 [22:11<04:39,  2.64s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1088: Failed to create speaker representation for text: Oh, shh... Hey, there, there, okay? It's all right... Skipping.



100%|██████████| 2/2 [00:00<00:00, 73.25it/s]
Processing Dataset:  91%|█████████ | 1090/1195 [22:12<03:53,  2.22s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1089: Failed to create speaker representation for text: Alright, alright. I'll calm down. But...I am curio... Skipping.



100%|██████████| 2/2 [00:00<00:00, 68.03it/s]
Processing Dataset:  91%|█████████▏| 1091/1195 [22:14<03:21,  1.94s/it]
100%|██████████| 1/1 [00:00<00:00, 58.88it/s]
Processing Dataset:  91%|█████████▏| 1092/1195 [22:15<02:49,  1.65s/it]
100%|██████████| 1/1 [00:00<00:00, 60.65it/s]
Processing Dataset:  91%|█████████▏| 1093/1195 [22:16<02:29,  1.46s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1092: Failed to create speaker representation for text: <sighs> Well, now I have another way to play with ... Skipping.



100%|██████████| 3/3 [00:00<00:00, 87.38it/s]
Processing Dataset:  92%|█████████▏| 1094/1195 [22:17<02:23,  1.42s/it]
100%|██████████| 2/2 [00:00<00:00, 53.14it/s]

100%|██████████| 2/2 [00:00<00:00, 74.59it/s]
Processing Dataset:  92%|█████████▏| 1096/1195 [22:19<02:11,  1.33s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1095: Failed to create speaker representation for text: <laughs> 'Cause you don't need to be coy for that,... Skipping.



100%|██████████| 2/2 [00:00<00:00, 39.98it/s]
Processing Dataset:  92%|█████████▏| 1097/1195 [22:20<02:01,  1.24s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1096: Failed to create speaker representation for text: And garlic, salt and thyme?... Skipping.



100%|██████████| 3/3 [00:00<00:00, 88.50it/s]
Processing Dataset:  92%|█████████▏| 1098/1195 [22:22<01:57,  1.21s/it]
100%|██████████| 2/2 [00:00<00:00, 76.05it/s]
Processing Dataset:  92%|█████████▏| 1099/1195 [22:23<01:53,  1.18s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1098: Failed to create speaker representation for text: You don't know. Well, I mean, do you have like a- ... Skipping.



100%|██████████| 2/2 [00:00<00:00, 79.80it/s]
Processing Dataset:  92%|█████████▏| 1100/1195 [22:24<01:53,  1.20s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1099: Failed to create speaker representation for text: Sorry, I wasn't that great a cook, cutie. We'll fi... Skipping.



100%|██████████| 2/2 [00:00<00:00, 74.21it/s]
Processing Dataset:  92%|█████████▏| 1101/1195 [22:25<01:53,  1.21s/it]
100%|██████████| 2/2 [00:00<00:00, 69.97it/s]
Processing Dataset:  92%|█████████▏| 1102/1195 [22:26<01:48,  1.16s/it]
100%|██████████| 2/2 [00:00<00:00, 77.96it/s]
Processing Dataset:  92%|█████████▏| 1103/1195 [22:27<01:46,  1.15s/it]
100%|██████████| 2/2 [00:00<00:00, 76.75it/s]
Processing Dataset:  92%|█████████▏| 1104/1195 [22:28<01:41,  1.11s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1103: Failed to create speaker representation for text: Or your job or anything, no. I just genuinely like... Skipping.



100%|██████████| 1/1 [00:00<00:00, 58.65it/s]
Processing Dataset:  92%|█████████▏| 1105/1195 [22:29<01:38,  1.10s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1104: Failed to create speaker representation for text: Oh, baby, you look like you just rolled out of bed... Skipping.



100%|██████████| 3/3 [00:00<00:00, 77.66it/s]
Processing Dataset:  93%|█████████▎| 1106/1195 [22:31<01:50,  1.24s/it]
100%|██████████| 2/2 [00:00<00:00, 66.50it/s]
Processing Dataset:  93%|█████████▎| 1107/1195 [22:32<01:48,  1.24s/it]
100%|██████████| 2/2 [00:00<00:00, 64.75it/s]
Processing Dataset:  93%|█████████▎| 1108/1195 [22:33<01:44,  1.20s/it]
100%|██████████| 3/3 [00:00<00:00, 82.15it/s]

100%|██████████| 2/2 [00:00<00:00, 76.33it/s]
Processing Dataset:  93%|█████████▎| 1110/1195 [22:36<01:47,  1.26s/it]
100%|██████████| 2/2 [00:00<00:00, 74.21it/s]

100%|██████████| 2/2 [00:00<00:00, 69.90it/s]

100%|██████████| 2/2 [00:00<00:00, 71.91it/s]

100%|██████████| 1/1 [00:00<00:00, 64.84it/s]
Processing Dataset:  93%|█████████▎| 1114/1195 [22:40<01:32,  1.14s/it]
100%|██████████| 2/2 [00:00<00:00, 74.65it/s]

100%|██████████| 2/2 [00:00<00:00, 71.50it/s]
Processing Dataset:  93%|█████████▎| 1116/1195 [22:43<01:30,  1.14s/it]
100%|██████████| 3/3 [00:00<00:00, 65.03it/s]
Processing

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1117: Failed to create speaker representation for text: What's an Earth? I've never heard of that. Is it f... Skipping.



100%|██████████| 2/2 [00:00<00:00, 76.01it/s]
Processing Dataset:  94%|█████████▎| 1119/1195 [22:46<01:27,  1.16s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1118: Failed to create speaker representation for text: Now, would you be a good dear and give your favori... Skipping.



100%|██████████| 2/2 [00:00<00:00, 75.42it/s]
Processing Dataset:  94%|█████████▎| 1120/1195 [22:48<01:30,  1.20s/it]
100%|██████████| 2/2 [00:00<00:00, 84.01it/s]
Processing Dataset:  94%|█████████▍| 1121/1195 [22:49<01:28,  1.20s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1120: Failed to create speaker representation for text: But my master, he found me once on the street when... Skipping.



100%|██████████| 2/2 [00:00<00:00, 77.32it/s]
Processing Dataset:  94%|█████████▍| 1122/1195 [22:50<01:29,  1.23s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1121: Failed to create speaker representation for text: Well, it always helps me when I'm feeling stressed... Skipping.



100%|██████████| 2/2 [00:00<00:00, 68.22it/s]
Processing Dataset:  94%|█████████▍| 1123/1195 [22:51<01:26,  1.20s/it]
100%|██████████| 2/2 [00:00<00:00, 75.77it/s]
Processing Dataset:  94%|█████████▍| 1124/1195 [22:52<01:22,  1.16s/it]
100%|██████████| 2/2 [00:00<00:00, 76.84it/s]
Processing Dataset:  94%|█████████▍| 1125/1195 [22:53<01:21,  1.16s/it]
100%|██████████| 2/2 [00:00<00:00, 63.09it/s]
Processing Dataset:  94%|█████████▍| 1126/1195 [22:55<01:19,  1.15s/it]
100%|██████████| 2/2 [00:00<00:00, 53.18it/s]
Processing Dataset:  94%|█████████▍| 1127/1195 [22:56<01:18,  1.16s/it]
100%|██████████| 2/2 [00:00<00:00, 68.25it/s]
Processing Dataset:  94%|█████████▍| 1128/1195 [22:57<01:19,  1.19s/it]
100%|██████████| 2/2 [00:00<00:00, 81.13it/s]
Processing Dataset:  94%|█████████▍| 1129/1195 [22:58<01:16,  1.16s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1128: Failed to create speaker representation for text: To hide your bad days or, you know, put on a front... Skipping.



100%|██████████| 2/2 [00:00<00:00, 73.83it/s]
Processing Dataset:  95%|█████████▍| 1130/1195 [22:59<01:14,  1.15s/it]
100%|██████████| 2/2 [00:00<00:00, 70.34it/s]
Processing Dataset:  95%|█████████▍| 1131/1195 [23:00<01:14,  1.16s/it]
100%|██████████| 2/2 [00:00<00:00, 78.82it/s]
Processing Dataset:  95%|█████████▍| 1132/1195 [23:02<01:12,  1.15s/it]
100%|██████████| 2/2 [00:00<00:00, 81.08it/s]

100%|██████████| 2/2 [00:00<00:00, 72.05it/s]

100%|██████████| 2/2 [00:00<00:00, 71.44it/s]
Processing Dataset:  95%|█████████▍| 1135/1195 [23:05<01:11,  1.19s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1134: Failed to create speaker representation for text: Shh, shh, shh, shh. No, no, no, no. Don't you dare... Skipping.



100%|██████████| 2/2 [00:00<00:00, 65.94it/s]
Processing Dataset:  95%|█████████▌| 1136/1195 [23:06<01:09,  1.18s/it]
100%|██████████| 2/2 [00:00<00:00, 62.07it/s]
Processing Dataset:  95%|█████████▌| 1137/1195 [23:08<01:13,  1.26s/it]
100%|██████████| 2/2 [00:00<00:00, 90.92it/s]

100%|██████████| 2/2 [00:00<00:00, 41.48it/s]
Processing Dataset:  95%|█████████▌| 1139/1195 [23:10<01:07,  1.21s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1138: Failed to create speaker representation for text: Ah, they're probably halfway to Andromeda by now. ... Skipping.



100%|██████████| 2/2 [00:00<00:00, 67.93it/s]

100%|██████████| 2/2 [00:00<00:00, 67.65it/s]

100%|██████████| 2/2 [00:00<00:00, 73.08it/s]
Processing Dataset:  96%|█████████▌| 1142/1195 [23:14<01:03,  1.19s/it]
100%|██████████| 2/2 [00:00<00:00, 70.76it/s]
Processing Dataset:  96%|█████████▌| 1143/1195 [23:15<00:58,  1.12s/it]
100%|██████████| 1/1 [00:00<00:00, 57.94it/s]
Processing Dataset:  96%|█████████▌| 1144/1195 [23:16<00:55,  1.10s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1143: Failed to create speaker representation for text: So much. And I'll have something whipped up for yo... Skipping.



100%|██████████| 2/2 [00:00<00:00, 84.20it/s]
Processing Dataset:  96%|█████████▌| 1145/1195 [23:17<00:55,  1.11s/it]
100%|██████████| 2/2 [00:00<00:00, 74.57it/s]
Processing Dataset:  96%|█████████▌| 1146/1195 [23:18<00:57,  1.16s/it]
100%|██████████| 2/2 [00:00<00:00, 67.36it/s]

100%|██████████| 2/2 [00:00<00:00, 45.50it/s]
Processing Dataset:  96%|█████████▌| 1148/1195 [23:21<00:56,  1.21s/it]
100%|██████████| 2/2 [00:00<00:00, 58.91it/s]
Processing Dataset:  96%|█████████▌| 1149/1195 [23:22<00:53,  1.16s/it]
100%|██████████| 1/1 [00:00<00:00, 58.32it/s]
Processing Dataset:  96%|█████████▌| 1150/1195 [23:23<00:50,  1.12s/it]
100%|██████████| 2/2 [00:00<00:00, 71.96it/s]
Processing Dataset:  96%|█████████▋| 1151/1195 [23:24<00:49,  1.13s/it]
100%|██████████| 2/2 [00:00<00:00, 76.43it/s]
Processing Dataset:  96%|█████████▋| 1152/1195 [23:25<00:47,  1.11s/it]
100%|██████████| 2/2 [00:00<00:00, 46.63it/s]
Processing Dataset:  96%|█████████▋| 1153/1195 [23:26<00:44,  1.07s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1152: Failed to create speaker representation for text: Oh, no. You look so crushed. No, I was just playin... Skipping.



100%|██████████| 2/2 [00:00<00:00, 73.08it/s]
Processing Dataset:  97%|█████████▋| 1154/1195 [23:27<00:42,  1.05s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1153: Failed to create speaker representation for text: Okay? Yes.... Skipping.



100%|██████████| 2/2 [00:00<00:00, 71.96it/s]
Processing Dataset:  97%|█████████▋| 1155/1195 [23:28<00:44,  1.10s/it]
100%|██████████| 2/2 [00:00<00:00, 85.72it/s]
Processing Dataset:  97%|█████████▋| 1156/1195 [23:29<00:42,  1.10s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1155: Failed to create speaker representation for text: And he was a bold man. He defended us when the wol... Skipping.



100%|██████████| 2/2 [00:00<00:00, 43.47it/s]

100%|██████████| 2/2 [00:00<00:00, 77.41it/s]
Processing Dataset:  97%|█████████▋| 1158/1195 [23:32<00:43,  1.17s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1157: Failed to create speaker representation for text: The war in Heaven was a war that the Necrontyr cou... Skipping.



100%|██████████| 2/2 [00:00<00:00, 75.06it/s]
Processing Dataset:  97%|█████████▋| 1159/1195 [23:33<00:44,  1.23s/it]
100%|██████████| 2/2 [00:00<00:00, 72.52it/s]
Processing Dataset:  97%|█████████▋| 1160/1195 [23:34<00:41,  1.19s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1159: Failed to create speaker representation for text: ...was being influenced by an outside force, I don... Skipping.



100%|██████████| 2/2 [00:00<00:00, 69.67it/s]
Processing Dataset:  97%|█████████▋| 1161/1195 [23:35<00:39,  1.16s/it]
100%|██████████| 2/2 [00:00<00:00, 75.75it/s]
Processing Dataset:  97%|█████████▋| 1162/1195 [23:36<00:37,  1.15s/it]
100%|██████████| 2/2 [00:00<00:00, 70.01it/s]

100%|██████████| 2/2 [00:00<00:00, 69.62it/s]
Processing Dataset:  97%|█████████▋| 1164/1195 [23:39<00:36,  1.16s/it]
100%|██████████| 2/2 [00:00<00:00, 75.37it/s]
Processing Dataset:  97%|█████████▋| 1165/1195 [23:40<00:34,  1.17s/it]
100%|██████████| 3/3 [00:00<00:00, 80.25it/s]
Processing Dataset:  98%|█████████▊| 1166/1195 [23:41<00:34,  1.18s/it]
100%|██████████| 1/1 [00:00<00:00, 66.34it/s]
Processing Dataset:  98%|█████████▊| 1167/1195 [23:42<00:31,  1.13s/it]
100%|██████████| 2/2 [00:00<00:00, 59.10it/s]
Processing Dataset:  98%|█████████▊| 1168/1195 [23:43<00:29,  1.09s/it]
100%|██████████| 2/2 [00:00<00:00, 48.86it/s]
Processing Dataset:  98%|█████████▊| 1169/1195 [23:44<00:29,  1.12s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1168: Failed to create speaker representation for text: Twice a month. He was downing a couple people a we... Skipping.



100%|██████████| 2/2 [00:00<00:00, 71.59it/s]
Processing Dataset:  98%|█████████▊| 1170/1195 [23:45<00:28,  1.15s/it]
100%|██████████| 2/2 [00:00<00:00, 73.29it/s]
Processing Dataset:  98%|█████████▊| 1171/1195 [23:46<00:26,  1.10s/it]
100%|██████████| 2/2 [00:00<00:00, 75.06it/s]
Processing Dataset:  98%|█████████▊| 1172/1195 [23:47<00:25,  1.09s/it]
100%|██████████| 2/2 [00:00<00:00, 65.48it/s]
Processing Dataset:  98%|█████████▊| 1173/1195 [23:49<00:24,  1.13s/it]
100%|██████████| 2/2 [00:00<00:00, 73.04it/s]
Processing Dataset:  98%|█████████▊| 1174/1195 [23:50<00:23,  1.11s/it]
100%|██████████| 2/2 [00:00<00:00, 74.60it/s]
Processing Dataset:  98%|█████████▊| 1175/1195 [23:51<00:22,  1.10s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1174: Failed to create speaker representation for text: There's a limit, you know? I'm basically a bloodsu... Skipping.



100%|██████████| 2/2 [00:00<00:00, 84.77it/s]
Processing Dataset:  98%|█████████▊| 1176/1195 [23:52<00:20,  1.09s/it]
100%|██████████| 2/2 [00:00<00:00, 75.59it/s]

100%|██████████| 2/2 [00:00<00:00, 71.08it/s]
Processing Dataset:  99%|█████████▊| 1178/1195 [23:54<00:20,  1.19s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1177: Failed to create speaker representation for text: H-huh? N-no, sorry, I-I just kind of spaced out. <... Skipping.



100%|██████████| 2/2 [00:00<00:00, 63.53it/s]
Processing Dataset:  99%|█████████▊| 1179/1195 [23:56<00:18,  1.17s/it]
100%|██████████| 2/2 [00:00<00:00, 62.97it/s]
Processing Dataset:  99%|█████████▊| 1180/1195 [23:57<00:18,  1.22s/it]
100%|██████████| 2/2 [00:00<00:00, 71.73it/s]

100%|██████████| 2/2 [00:00<00:00, 63.59it/s]
Processing Dataset:  99%|█████████▉| 1182/1195 [23:59<00:15,  1.19s/it]
100%|██████████| 2/2 [00:00<00:00, 34.62it/s]
Processing Dataset:  99%|█████████▉| 1183/1195 [24:00<00:14,  1.18s/it]

Error during speaker creation (Whisper/AudioProcessor): cuFFT error: CUFFT_INTERNAL_ERROR
Row 1182: Failed to create speaker representation for text: It means a lot, and I'm glad you can trust me with... Skipping.



100%|██████████| 2/2 [00:00<00:00, 64.38it/s]

100%|██████████| 3/3 [00:00<00:00, 82.17it/s]
Processing Dataset:  99%|█████████▉| 1185/1195 [24:03<00:11,  1.17s/it]
100%|██████████| 2/2 [00:00<00:00, 68.38it/s]
Processing Dataset:  99%|█████████▉| 1186/1195 [24:04<00:10,  1.14s/it]
100%|██████████| 2/2 [00:00<00:00, 77.05it/s]
Processing Dataset:  99%|█████████▉| 1187/1195 [24:05<00:09,  1.14s/it]
100%|██████████| 2/2 [00:00<00:00, 67.33it/s]
Processing Dataset:  99%|█████████▉| 1188/1195 [24:06<00:08,  1.17s/it]
100%|██████████| 2/2 [00:00<00:00, 59.51it/s]
Processing Dataset:  99%|█████████▉| 1189/1195 [24:07<00:07,  1.22s/it]
100%|██████████| 2/2 [00:00<00:00, 37.12it/s]

100%|██████████| 2/2 [00:00<00:00, 70.29it/s]
Processing Dataset: 100%|█████████▉| 1191/1195 [24:10<00:04,  1.25s/it]
100%|██████████| 2/2 [00:00<00:00, 79.52it/s]

100%|██████████| 2/2 [00:00<00:00, 71.33it/s]
Processing Dataset: 100%|█████████▉| 1193/1195 [24:12<00:02,  1.25s/it]
100%|██████████| 2/2 [00:00<00:0

Dataset processing finished. Processed: 1122, Skipped: 73
Moving Whisper model to CPU


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Probe bf16 support once; the unsupported case falls back to fp16 below.
use_bf16 = is_bfloat16_supported()

# Optimisation / logging hyper-parameters for the short demo run.
training_args = TrainingArguments(
    output_dir = "outputs",
    seed = 3407,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    # num_train_epochs = 1, # Set this for 1 full training run.
    max_steps = 60,
    warmup_steps = 5,
    learning_rate = 2e-4,
    lr_scheduler_type = "linear",
    optim = "adamw_8bit",
    weight_decay = 0.01,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,
    report_to = "none", # Use this for WandB etc
)

# Supervised fine-tuning on the "text" column of the prepared dataset.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = training_args,
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/1122 [00:00<?, ? examples/s]

In [None]:
# Run the fine-tuning loop configured on `trainer`; whatever train() returns is kept in `trainer_stats`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,122 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 13,631,488/1,262,028,800 (1.08% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,3.1446
2,3.2555
3,3.2292
4,2.9754
5,3.1771
6,3.1414
7,3.2527
8,3.0723
9,3.2049
10,3.1952


In [24]:
# Text to synthesize; the inference cell wraps it in <|text_start|>...<|text_end|> when building the prompt.
input_text = "Hello World! Im a speech generation model using Oute TTS."

In [25]:
#@title Run Inference

import torch
import re
import numpy as np
from typing import Dict, Any
import torchaudio.transforms as T
from transformers import LogitsProcessor
import transformers.generation.utils as generation_utils
from transformers import AutoModelForCausalLM
import re

def get_audio(tokens):
    """Turn generated token ids into audio via the codec held by ``data_processor``.

    The first sequence in ``tokens`` is decoded to text with the notebook-level
    ``tokenizer`` (special tokens kept), the ``<|c1_N|>`` / ``<|c2_N|>`` markers
    are extracted as two integer streams, both streams are cut to the shorter
    length, and the result is handed to the codec as an int64 tensor of shape
    ``(1, 2, T)``.

    Args:
        tokens: Batch of token-id sequences accepted by ``tokenizer.batch_decode``;
            only the first sequence is used.

    Returns:
        Whatever ``audio_codec.decode`` returns (the caller treats it as a
        tensor), or ``None`` when no aligned c1/c2 pair is present.
    """
    decoded_output = tokenizer.batch_decode(tokens, skip_special_tokens=False)[0]
    c1 = [int(code) for code in re.findall(r"<\|c1_(\d+)\|>", decoded_output)]
    c2 = [int(code) for code in re.findall(r"<\|c2_(\d+)\|>", decoded_output)]

    # Keep only frames that have both codebook entries.
    frame_count = min(len(c1), len(c2))
    if frame_count == 0:
        # The previous `if not [c1, c2]` check could never be true (a two-element
        # list is always truthy), so empty generations reached the codec.
        print("No audio tokens found in the output")
        return None

    codes = [c1[:frame_count], c2[:frame_count]]
    codec = data_processor.audio_processor.audio_codec
    return codec.decode(torch.tensor([codes], dtype=torch.int64).to(codec.device))

class RepetitionPenaltyLogitsProcessorPatch(LogitsProcessor):
    """Repetition penalty that only looks at the most recent tokens.

    For every token id that occurs in the last ``penalty_last_n`` positions of
    ``input_ids``, the matching score is divided by ``penalty`` when positive
    and multiplied by ``penalty`` when zero or negative.

    Args:
        penalty: Positive float; ``1.0`` disables the penalty.
        penalty_last_n: Size of the trailing window. ``0`` disables the
            penalty; ``None`` applies it to the whole sequence. Defaults to 64,
            the value that used to be hard-coded.

    Raises:
        ValueError: If ``penalty`` is not a positive float or ``penalty_last_n``
            is neither ``None`` nor a non-negative integer.
    """

    def __init__(self, penalty: float, penalty_last_n: "int | None" = 64):
        # f-string: the message previously printed the literal "{penalty_last_n}".
        print(f"🔄 Using patched RepetitionPenaltyLogitsProcessor -> RepetitionPenaltyLogitsProcessorPatch | penalty_last_n: {penalty_last_n}")
        if penalty_last_n is not None:
            if not isinstance(penalty_last_n, int) or penalty_last_n < 0:
                raise ValueError(f"`penalty_last_n` has to be a non-negative integer, but is {penalty_last_n}")
        if not isinstance(penalty, float) or penalty <= 0:
            raise ValueError(f"`penalty` has to be a positive float, but is {penalty}")

        self.penalty_last_n = penalty_last_n
        self.penalty = penalty

    @torch.no_grad()
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """
        Args:
            input_ids (`torch.LongTensor`):
                Indices of input sequence tokens in the vocabulary (shape `(batch_size, sequence_length)`).
            scores (`torch.FloatTensor`):
                Prediction scores of a language modeling head (shape `(batch_size, vocab_size)`).

        Returns:
            `torch.FloatTensor`: The modified prediction scores (``scores`` is updated in place).
        """
        # Nothing to do when the window is disabled or the penalty is neutral.
        if self.penalty_last_n == 0 or self.penalty == 1.0:
            return scores

        batch_size, seq_len = input_ids.shape
        vocab_size = scores.shape[-1]

        # None means "no window": penalize everything generated so far.
        if self.penalty_last_n is None:
            start_index = 0
        else:
            start_index = max(0, seq_len - self.penalty_last_n)

        for b in range(batch_size):
            window_ids = input_ids[b, start_index:]
            if window_ids.numel() == 0:
                continue

            # Each distinct token is penalized once; ids beyond the score
            # dimension are ignored.
            token_ids = torch.unique(window_ids)
            token_ids = token_ids[token_ids < vocab_size].to(scores.device)
            if token_ids.numel() == 0:
                continue

            # One indexed update per row instead of a Python branch per token.
            seen = scores[b, token_ids]
            scores[b, token_ids] = torch.where(
                seen <= 0, seen * self.penalty, seen / self.penalty
            )

        return scores

# Swap the class that transformers' generation code instantiates for
# `repetition_penalty`, so generate() builds the windowed variant defined above.
generation_utils.RepetitionPenaltyLogitsProcessor = RepetitionPenaltyLogitsProcessorPatch
# NOTE(review): rebinding generate to the stock GenerationMixin implementation looks
# like a safeguard so the swapped processor is picked up — confirm it is still needed.
AutoModelForCausalLM.generate = generation_utils.GenerationMixin.generate

if __name__ == "__main__":
    # Prompt layout: text section, then the opening of the audio section that
    # the model is expected to continue.
    formated_text = "<|text_start|>"+input_text+"<|text_end|>"
    prompt = "\n".join([
        "<|im_start|>",
        formated_text,
        "<|audio_start|><|global_features_start|>",
    ])

    model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    print("Generating token sequence...")
    generated_ids = model.generate(
        **model_inputs,
        # temperature / top_k / top_p / min_p only take effect when sampling.
        do_sample=True,
        temperature=0.4,
        top_k=40,
        top_p=0.9,
        repetition_penalty=1.1,
        min_p=0.05,
        max_new_tokens=2048, # Limit generation length
    )
    print("Token sequence generated.")

    # Drop the prompt so only newly generated tokens are scanned for codec ids
    # (this slice used to be computed and then ignored).
    generated_ids_trimmed = generated_ids[:, model_inputs.input_ids.shape[1]:]
    audio = get_audio(generated_ids_trimmed)
    if audio is not None:
        audio = audio.cpu()
        from IPython.display import Audio, display
        display(Audio(audio.squeeze(0), rate=24000))


Generating token sequence...
🔄 Using patched RepetitionPenaltyLogitsProcessor -> RepetitionPenaltyLogitsProcessorPatch | penalty_last_n: {penalty_last_n}
Token sequence generated.
