## Convert MP3 and Remove Ads/Silence

In [2]:
import os
from pydub import AudioSegment
from pydub.silence import split_on_silence, detect_silence
import numpy as np

def convert_mp3_to_wav(mp3_path, output_path="/Users/keshikaa/new-voice-clone-project/data/raw/practical_ai_episodes/Behind-the-Scenes-VC-Funding-for-AI-Startups.mp3"):
    """
    Convert MP3 to WAV format required for processing
    """
    print("Converting MP3 to WAV...")
    audio = AudioSegment.from_mp3(mp3_path)
    
    # Convert to mono and set to 22050 Hz (optimal for TTS)
    audio = audio.set_channels(1)
    audio = audio.set_frame_rate(22050)
    
    audio.export(output_path, format="wav")
    print(f"Converted audio saved to: {output_path}")
    return output_path

def remove_long_silences(audio_path, output_path="cleaned_audio.wav", 
                         silence_thresh=-40, min_silence_len=2000):
    """
    Remove long silences (likely ad breaks) from podcast
    Keeps natural pauses in speech (< 2 seconds)
    """
    print("Removing long silences and potential ad breaks...")
    
    # Load audio
    audio = AudioSegment.from_file(audio_path)
    
    # Detect silence periods
    silence_ranges = detect_silence(
        audio,
        min_silence_len=min_silence_len,  # 2 seconds minimum
        silence_thresh=silence_thresh
    )
    
    print(f"Found {len(silence_ranges)} silent segments to remove")
    
    # Keep only non-silent parts
    audio_chunks = []
    prev_end = 0
    
    for start, end in silence_ranges:
        # Add the audio before this silence
        if start > prev_end:
            audio_chunks.append(audio[prev_end:start])
        prev_end = end
    
    # Add the last chunk
    if prev_end < len(audio):
        audio_chunks.append(audio[prev_end:])
    
    # Combine all chunks
    cleaned_audio = sum(audio_chunks)
    
    original_duration = len(audio) / 1000.0  # in seconds
    cleaned_duration = len(cleaned_audio) / 1000.0
    
    print(f"Original duration: {original_duration:.2f} seconds")
    print(f"Cleaned duration: {cleaned_duration:.2f} seconds")
    print(f"Removed: {original_duration - cleaned_duration:.2f} seconds")
    
    cleaned_audio.export(output_path, format="wav")
    return output_path

# Use your MP3 file
mp3_file = "/Users/keshikaa/new-voice-clone-project/data/raw/practical_ai_episodes/Behind-the-Scenes-VC-Funding-for-AI-Startups.mp3"  # Replace with your file name

# Convert and clean
wav_file = convert_mp3_to_wav(mp3_file)
cleaned_file = remove_long_silences(wav_file)



Converting MP3 to WAV...
Converted audio saved to: /Users/keshikaa/new-voice-clone-project/data/raw/practical_ai_episodes/Behind-the-Scenes-VC-Funding-for-AI-Startups.mp3
Removing long silences and potential ad breaks...
Found 0 silent segments to remove
Original duration: 2507.66 seconds
Cleaned duration: 2507.66 seconds
Removed: 0.00 seconds


## Segment Audio into Training Clips

In [6]:
def segment_audio_for_training(audio_path, output_dir="training_segments",
                                min_segment_len=3000, max_segment_len=15000):
    """
    Segment audio into 3-15 second clips optimal for voice cloning
    """
    os.makedirs(output_dir, exist_ok=True)
    
    print("Segmenting audio for training...")
    audio = AudioSegment.from_file(audio_path)
    
    # Split on natural pauses (shorter silences)
    segments = split_on_silence(
        audio,
        min_silence_len=500,  # 0.5 seconds
        silence_thresh=-40,
        keep_silence=100  # Keep some silence at edges
    )
    
    valid_segments = []
    segment_info = []
    
    for i, segment in enumerate(segments):
        seg_len = len(segment)
        
        # Filter by length
        if min_segment_len <= seg_len <= max_segment_len:
            output_path = os.path.join(output_dir, f"segment_{i:04d}.wav")
            segment.export(output_path, format="wav")
            valid_segments.append(output_path)
            
            segment_info.append({
                "segment_id": i,
                "path": output_path,
                "duration": seg_len / 1000.0
            })
    
    print(f"Created {len(valid_segments)} training segments")
    print(f"Total training audio: {sum([s['duration'] for s in segment_info]):.2f} seconds")
    
    import pandas as pd
    df = pd.DataFrame(segment_info)
    df.to_csv(os.path.join(output_dir, "segments.csv"), index=False)
    
    return valid_segments, segment_info

# Segment the cleaned audio
segments, seg_info = segment_audio_for_training(cleaned_file)


Segmenting audio for training...
Created 217 training segments
Total training audio: 1391.39 seconds


## Transcribe with Whisper

In [9]:
!pip install -U openai-whisper
import whisper
import json

def transcribe_segments(segment_paths, model_size="base"):
    """
    Transcribe audio segments using Whisper
    Note: Whisper automatically uses Mac's GPU (MPS) if available
    """
    print(f"Loading Whisper model: {model_size}")
    model = whisper.load_model(model_size)
    
    transcriptions = []
    
    for i, segment_path in enumerate(segment_paths):
        print(f"Transcribing segment {i+1}/{len(segment_paths)}...")
        
        result = model.transcribe(
            segment_path,
            language="en",  # Change if needed
            word_timestamps=True
        )
        
        transcriptions.append({
            "audio_path": segment_path,
            "text": result["text"].strip(),
            "duration": seg_info[i]['duration']
        })
    
    # Save transcriptions
    with open("transcriptions.json", "w") as f:
        json.dump(transcriptions, f, indent=2)
    
    print(f"\nCompleted transcription of {len(transcriptions)} segments")
    return transcriptions

# Transcribe all segments
transcriptions = transcribe_segments(segments)

# Preview transcriptions
import pandas as pd
df = pd.DataFrame(transcriptions)
print("\nTranscription Preview:")
print(df.head(10))


Collecting openai-whisper
  Using cached openai_whisper-20250625.tar.gz (803 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Collecting tiktoken
  Downloading tiktoken-0.12.0-cp39-cp39-macosx_11_0_arm64.whl (997 kB)
[K     |████████████████████████████████| 997 kB 6.1 MB/s eta 0:00:01
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (PEP 517) ... [?25ldone
[?25h  Created wheel for openai-whisper: filename=openai_whisper-20250625-py3-none-any.whl size=803979 sha256=cb879531f71cf85e618556674461958bc809928279bed84eabba205918630c1f
  Stored in directory: /Users/keshikaa/Library/Caches/pip/wheels/8a/52/46/c497a169da69d4edcfe4e66e2f597ce258c334d74d371bf8c9
Successfully built openai-whisper
Installing collected packages: tiktoken, openai-whisper
Successfully installed openai-whisper-20250625 tiktoken-0.12.0
You should consider upgrading v

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


Transcribing segment 2/217...
Transcribing segment 3/217...




Transcribing segment 4/217...




Transcribing segment 5/217...




Transcribing segment 6/217...




Transcribing segment 7/217...




Transcribing segment 8/217...




Transcribing segment 9/217...




Transcribing segment 10/217...




Transcribing segment 11/217...




Transcribing segment 12/217...




Transcribing segment 13/217...




Transcribing segment 14/217...




Transcribing segment 15/217...




Transcribing segment 16/217...




Transcribing segment 17/217...




Transcribing segment 18/217...




Transcribing segment 19/217...




Transcribing segment 20/217...




Transcribing segment 21/217...




Transcribing segment 22/217...




Transcribing segment 23/217...




Transcribing segment 24/217...




Transcribing segment 25/217...




Transcribing segment 26/217...




Transcribing segment 27/217...




Transcribing segment 28/217...




Transcribing segment 29/217...




Transcribing segment 30/217...




Transcribing segment 31/217...




Transcribing segment 32/217...




Transcribing segment 33/217...




Transcribing segment 34/217...




Transcribing segment 35/217...




Transcribing segment 36/217...




Transcribing segment 37/217...




Transcribing segment 38/217...




Transcribing segment 39/217...




Transcribing segment 40/217...




Transcribing segment 41/217...




Transcribing segment 42/217...




Transcribing segment 43/217...




Transcribing segment 44/217...




Transcribing segment 45/217...




Transcribing segment 46/217...




Transcribing segment 47/217...




Transcribing segment 48/217...




Transcribing segment 49/217...




Transcribing segment 50/217...




Transcribing segment 51/217...




Transcribing segment 52/217...




Transcribing segment 53/217...




Transcribing segment 54/217...




Transcribing segment 55/217...




Transcribing segment 56/217...




Transcribing segment 57/217...




Transcribing segment 58/217...




Transcribing segment 59/217...




Transcribing segment 60/217...




Transcribing segment 61/217...




Transcribing segment 62/217...




Transcribing segment 63/217...




Transcribing segment 64/217...




Transcribing segment 65/217...




Transcribing segment 66/217...




Transcribing segment 67/217...




Transcribing segment 68/217...




Transcribing segment 69/217...




Transcribing segment 70/217...




Transcribing segment 71/217...




Transcribing segment 72/217...




Transcribing segment 73/217...




Transcribing segment 74/217...




Transcribing segment 75/217...




Transcribing segment 76/217...




Transcribing segment 77/217...




Transcribing segment 78/217...




Transcribing segment 79/217...




Transcribing segment 80/217...




Transcribing segment 81/217...




Transcribing segment 82/217...




Transcribing segment 83/217...




Transcribing segment 84/217...




Transcribing segment 85/217...




Transcribing segment 86/217...




Transcribing segment 87/217...




Transcribing segment 88/217...




Transcribing segment 89/217...




Transcribing segment 90/217...




Transcribing segment 91/217...




Transcribing segment 92/217...




Transcribing segment 93/217...




Transcribing segment 94/217...




Transcribing segment 95/217...




Transcribing segment 96/217...




Transcribing segment 97/217...




Transcribing segment 98/217...




Transcribing segment 99/217...




Transcribing segment 100/217...




Transcribing segment 101/217...




Transcribing segment 102/217...




Transcribing segment 103/217...




Transcribing segment 104/217...




Transcribing segment 105/217...




Transcribing segment 106/217...




Transcribing segment 107/217...




Transcribing segment 108/217...




Transcribing segment 109/217...




Transcribing segment 110/217...




Transcribing segment 111/217...




Transcribing segment 112/217...




Transcribing segment 113/217...




Transcribing segment 114/217...




Transcribing segment 115/217...




Transcribing segment 116/217...




Transcribing segment 117/217...




Transcribing segment 118/217...




Transcribing segment 119/217...




Transcribing segment 120/217...




Transcribing segment 121/217...




Transcribing segment 122/217...




Transcribing segment 123/217...




Transcribing segment 124/217...




Transcribing segment 125/217...




Transcribing segment 126/217...




Transcribing segment 127/217...




Transcribing segment 128/217...




Transcribing segment 129/217...




Transcribing segment 130/217...




Transcribing segment 131/217...




Transcribing segment 132/217...




Transcribing segment 133/217...




Transcribing segment 134/217...




Transcribing segment 135/217...




Transcribing segment 136/217...




Transcribing segment 137/217...




Transcribing segment 138/217...




Transcribing segment 139/217...




Transcribing segment 140/217...




Transcribing segment 141/217...




Transcribing segment 142/217...




Transcribing segment 143/217...




Transcribing segment 144/217...




Transcribing segment 145/217...




Transcribing segment 146/217...




Transcribing segment 147/217...




Transcribing segment 148/217...




Transcribing segment 149/217...




Transcribing segment 150/217...




Transcribing segment 151/217...




Transcribing segment 152/217...




Transcribing segment 153/217...




Transcribing segment 154/217...




Transcribing segment 155/217...




Transcribing segment 156/217...




Transcribing segment 157/217...




Transcribing segment 158/217...




Transcribing segment 159/217...




Transcribing segment 160/217...




Transcribing segment 161/217...




Transcribing segment 162/217...




Transcribing segment 163/217...




Transcribing segment 164/217...




Transcribing segment 165/217...




Transcribing segment 166/217...




Transcribing segment 167/217...




Transcribing segment 168/217...




Transcribing segment 169/217...




Transcribing segment 170/217...




Transcribing segment 171/217...




Transcribing segment 172/217...




Transcribing segment 173/217...




Transcribing segment 174/217...




Transcribing segment 175/217...




Transcribing segment 176/217...




Transcribing segment 177/217...




Transcribing segment 178/217...




Transcribing segment 179/217...




Transcribing segment 180/217...




Transcribing segment 181/217...




Transcribing segment 182/217...




Transcribing segment 183/217...




Transcribing segment 184/217...




Transcribing segment 185/217...




Transcribing segment 186/217...




Transcribing segment 187/217...




Transcribing segment 188/217...




Transcribing segment 189/217...




Transcribing segment 190/217...




Transcribing segment 191/217...




Transcribing segment 192/217...




Transcribing segment 193/217...




Transcribing segment 194/217...




Transcribing segment 195/217...




Transcribing segment 196/217...




Transcribing segment 197/217...




Transcribing segment 198/217...




Transcribing segment 199/217...




Transcribing segment 200/217...




Transcribing segment 201/217...




Transcribing segment 202/217...




Transcribing segment 203/217...




Transcribing segment 204/217...




Transcribing segment 205/217...




Transcribing segment 206/217...




Transcribing segment 207/217...




Transcribing segment 208/217...




Transcribing segment 209/217...




Transcribing segment 210/217...




Transcribing segment 211/217...




Transcribing segment 212/217...




Transcribing segment 213/217...




Transcribing segment 214/217...




Transcribing segment 215/217...




Transcribing segment 216/217...




Transcribing segment 217/217...





Completed transcription of 217 segments

Transcription Preview:
                           audio_path  \
0  training_segments/segment_0001.wav   
1  training_segments/segment_0003.wav   
2  training_segments/segment_0007.wav   
3  training_segments/segment_0009.wav   
4  training_segments/segment_0011.wav   
5  training_segments/segment_0012.wav   
6  training_segments/segment_0013.wav   
7  training_segments/segment_0014.wav   
8  training_segments/segment_0018.wav   
9  training_segments/segment_0019.wav   

                                                text  duration  
0  Principal AI research engineer at Lockheed Mar...    13.046  
1  just I see all the time in the news, we have a...     9.323  
2  or bad of AI startups actually this last week,...     5.300  
3            you know, hyped AI startup that raised.     3.303  
4  was valued at 1.5 billion builder.ai or builde...     6.204  
5  And I think basically it collapsed. There were...     4.454  
6  these, I don't know every

## Prepare Dataset for Training

In [10]:
def prepare_training_dataset(transcriptions, output_dir="final_training_data"):
    """
    Create final training dataset with metadata file
    """
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(os.path.join(output_dir, "wavs"), exist_ok=True)
    
    metadata_lines = []
    valid_count = 0
    
    for idx, trans in enumerate(transcriptions):
        text = trans["text"].strip()
        
        # Skip empty or very short transcriptions
        if len(text) < 10:
            continue
        
        # Load and resample audio to 22050 Hz
        import librosa
        import soundfile as sf
        
        audio, sr = librosa.load(trans["audio_path"], sr=22050)
        
        # Save in wavs directory
        audio_filename = f"audio_{valid_count:05d}.wav"
        audio_path = os.path.join(output_dir, "wavs", audio_filename)
        sf.write(audio_path, audio, 22050)
        
        # Create metadata line: filename|text|speaker
        metadata_lines.append(f"{audio_filename}|{text}|podcast_host")
        valid_count += 1
    
    # Save metadata.txt (format required by XTTS)
    metadata_path = os.path.join(output_dir, "metadata.txt")
    with open(metadata_path, "w", encoding="utf-8") as f:
        f.write("\n".join(metadata_lines))
    
    print(f"\nCreated training dataset:")
    print(f"- {valid_count} audio files")
    print(f"- Location: {output_dir}")
    print(f"- Meta {metadata_path}")
    
    return output_dir

# Prepare final dataset
training_dir = prepare_training_dataset(transcriptions)



Created training dataset:
- 217 audio files
- Location: final_training_data
- Meta final_training_data/metadata.txt


## Voice Cloning with XTTS-v2

In [9]:
# from TTS.api import TTS
# import torch

# # Check if Mac GPU (MPS) is available
# device = "mps" if torch.backends.mps.is_available() else "cpu"
# print(f"Using device: {device}")

# def setup_voice_cloning(reference_segments):
#     """
#     Setup XTTS-v2 for voice cloning
#     Note: May need to use CPU on Mac due to MPS compatibility issues
#     """
#     print("Loading XTTS-v2 model...")
    
#     # Initialize model (use cpu for Mac stability)
#     tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", 
#               gpu=False)  # Set to False for Mac
    
#     # Select best reference clips (clear speech, 6-10 seconds total)
#     reference_clips = reference_segments[:3]  # Use first 3 segments
    
#     print(f"Using {len(reference_clips)} reference clips for voice profile")
#     return tts, reference_clips

# # Setup cloning
# tts, reference_clips = setup_voice_cloning(segments)


Using device: mps
Loading XTTS-v2 model...
 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
 > Using model: xtts


UnpicklingError: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL TTS.tts.configs.xtts_config.XttsConfig was not an allowed global by default. Please use `torch.serialization.add_safe_globals([TTS.tts.configs.xtts_config.XttsConfig])` or the `torch.serialization.safe_globals([TTS.tts.configs.xtts_config.XttsConfig])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.

In [10]:
# # Run this to verify your PyTorch version
# import torch
# print(f"PyTorch version: {torch.__version__}")
# print(f"Expected: 2.5.1")

# # Test if TTS loads
# from TTS.api import TTS
# print("TTS library loaded successfully!")

# # Now try loading the model
# tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
# print("✓ XTTS-v2 model loaded successfully!")

PyTorch version: 2.9.0
Expected: 2.5.1
TTS library loaded successfully!
 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
 > Using model: xtts


UnpicklingError: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL TTS.tts.configs.xtts_config.XttsConfig was not an allowed global by default. Please use `torch.serialization.add_safe_globals([TTS.tts.configs.xtts_config.XttsConfig])` or the `torch.serialization.safe_globals([TTS.tts.configs.xtts_config.XttsConfig])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.

In [12]:
# # First, find where TTS is installed
# import TTS
# import os
# print(TTS.__file__)  # This will show you the path

/opt/anaconda3/envs/tts_env/lib/python3.11/site-packages/TTS/__init__.py
