### Parse Transcripts

In [1]:
import re
import os

def parse_transcript_md(file_path):
    """Parse a Markdown transcript into a list of speaker turns.

    A new turn starts on every line whose stripped content begins with a
    ``**Speaker Name:**`` marker. The rest of that line and all following
    non-blank lines (up to the next marker) form the turn's text, joined
    with single spaces. Lines that appear before the first marker are
    discarded.

    Args:
        file_path (str): Path to the UTF-8 encoded Markdown transcript.

    Returns:
        list[dict]: One ``{"speaker": str, "text": str}`` dict per turn, in
        file order.
    """
    speaker_pattern = re.compile(r'\*\*(.+?):\*\*')
    segments = []

    current_speaker = None
    current_text = []

    def flush():
        # Emit the turn collected so far, if a speaker has been seen.
        if current_speaker is not None and current_text:
            segments.append({
                "speaker": current_speaker.strip(),
                "text": " ".join(current_text).strip()
            })

    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise stop the first line from matching the speaker marker.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        for line in f:
            stripped = line.strip()
            speaker_match = speaker_pattern.match(stripped)
            if speaker_match:
                flush()
                current_speaker = speaker_match.group(1)
                # The match offsets refer to the stripped line, so the
                # remainder must be sliced from the stripped line as well;
                # slicing the raw line breaks on indented speaker lines.
                current_text = [stripped[speaker_match.end():].strip()]
            elif stripped:
                current_text.append(stripped)

    flush()
    return segments

def parse_all_transcripts(transcripts_folder):
    """Parse every ``.md`` file found directly inside a folder.

    Args:
        transcripts_folder (str): Directory to scan (not recursive).

    Returns:
        dict: Filename -> result of ``parse_transcript_md`` for that file,
        in the order returned by ``os.listdir``.
    """
    markdown_names = [
        entry for entry in os.listdir(transcripts_folder) if entry.endswith(".md")
    ]
    return {
        entry: parse_transcript_md(os.path.join(transcripts_folder, entry))
        for entry in markdown_names
    }


# Parse every ".md" transcript in the folder; results are keyed by filename.
# NOTE(review): absolute, machine-specific path, and the directory name
# "Sample Train Set " appears to end with a space - confirm it matches disk.
transcripts_folder_path = "/Users/kiran/Desktop/TDM/Sample Train Set /Transcripts" 
all_transcripts_parsed = parse_all_transcripts(transcripts_folder_path)

# Preview: for each file, print the segment count and its first segments.
for fname, segments in all_transcripts_parsed.items():
    print(f"\nFile: {fname} - Total segments extracted: {len(segments)}")
    for seg in segments[:3]:  # only the first 3 segments per file
        print(f"Speaker: {seg['speaker']}\nText: {seg['text']}\n")



File: episode_295.md - Total segments extracted: 46
Speaker: Daniel Whitenack
Text: In this Fully Connected episode of the show Chris and I will keep you fully connected with everything that's happening in the world of AI, and discuss some of the latest trends and share some learning resources for you to level up your machine learning and AI game. I'm Daniel Whitenack. I am CEO at PredictionGuard, where we're creating a private, secure AI platform. And I'm joined as always by my co-host, Chris Benson, who is a principal AI research engineer at Lockheed Martin. How are you doing, Chris?

Speaker: Chris Benson
Text: Doing very well, Daniel. I know you're out traveling, and ironically, I think I'll be where you are next week, but I think you'll be gone by then.

Speaker: Daniel Whitenack
Text: Swapping places.


File: episode_282.md - Total segments extracted: 43
Speaker: Daniel Whitenack
Text: Welcome to another Fully Connected episode of the Practical AI Podcast. This is Daniel Whitena

### Generate transcripts with segment and word level timestamps

In [5]:
import os
import whisper

def transcribe_all_audio_with_timestamps(audio_folder):
    """
    Run Whisper over every ``.mp3`` file in a folder, requesting word-level timestamps.

    Args:
        audio_folder (str): Directory that holds the audio files (not scanned recursively).

    Returns:
        dict: Filename -> the result returned by ``model.transcribe`` for that file.
    """
    # The "small" checkpoint on CPU: accuracy deliberately traded for run time.
    asr_model = whisper.load_model("small", device="cpu")

    results_by_file = {}
    for entry in os.listdir(audio_folder):
        if not entry.endswith(".mp3"):
            continue
        print(f"Transcribing {entry} ...")
        transcript = asr_model.transcribe(
            os.path.join(audio_folder, entry), word_timestamps=True
        )
        results_by_file[entry] = transcript
        print(f"Done with {entry}. Segments: {len(transcript['segments'])}")

    return results_by_file


# Transcribe every ".mp3" in the audio folder (one Whisper run per file).
audio_folder_path = "/Users/kiran/Desktop/TDM/Sample Train Set /Audio"
all_transcriptions = transcribe_all_audio_with_timestamps(audio_folder_path)


# Preview one file: the first key of the results dict, i.e. whichever file
# was transcribed first.
first_file = next(iter(all_transcriptions))
for segment in all_transcriptions[first_file]["segments"]:
    print(f"Segment [{segment['start']:.2f} - {segment['end']:.2f}]: {segment['text']}")
    # .get() tolerates segments that carry no "words" entry.
    for word in segment.get("words", []):
        print(f"  Word: {word['word']} [{word['start']:.2f} - {word['end']:.2f}]")


Transcribing episode_287.mp3 ...




Done with episode_287.mp3. Segments: 458
Transcribing episode_295.mp3 ...




Done with episode_295.mp3. Segments: 407
Transcribing episode_297.mp3 ...




Done with episode_297.mp3. Segments: 565
Transcribing episode_282.mp3 ...




Done with episode_282.mp3. Segments: 446
Segment [15.92 - 18.46]:  Welcome to Practical AI.
  Word:  Welcome [15.92 - 16.76]
  Word:  to [16.76 - 17.36]
  Word:  Practical [17.36 - 18.18]
  Word:  AI. [18.18 - 18.46]
Segment [19.18 - 26.62]:  If you work in artificial intelligence, aspire to, or are curious how AI-related tech is
  Word:  If [19.18 - 19.54]
  Word:  you [19.54 - 19.72]
  Word:  work [19.72 - 19.94]
  Word:  in [19.94 - 20.22]
  Word:  artificial [20.22 - 20.54]
  Word:  intelligence, [20.54 - 21.30]
  Word:  aspire [21.76 - 22.18]
  Word:  to, [22.18 - 22.54]
  Word:  or [23.12 - 23.36]
  Word:  are [23.36 - 23.60]
  Word:  curious [23.60 - 24.14]
  Word:  how [24.14 - 24.96]
  Word:  AI [24.96 - 25.40]
  Word: -related [25.40 - 25.86]
  Word:  tech [25.86 - 26.24]
  Word:  is [26.24 - 26.62]
Segment [26.62 - 32.54]:  changing the world, this is the show for you. Thank you to our partners at fly.io,
  Word:  changing [26.62 - 26.96]
  Word:  the [26.96 - 27.12]
  Word:

### Align Audio and Transcripts

In [6]:


import difflib

def find_best_match(parsed_text, whisper_segments):
    """Return the Whisper segment whose text is most similar to ``parsed_text``.

    Similarity is ``difflib.SequenceMatcher(...).ratio()`` on the lower-cased
    strings. On a tie the earliest segment wins. Returns ``None`` when there
    are no segments or when no segment scores above zero.
    """
    scored = [
        (
            difflib.SequenceMatcher(
                None, parsed_text.lower(), candidate['text'].lower()
            ).ratio(),
            candidate,
        )
        for candidate in whisper_segments
    ]
    if not scored:
        return None

    # max() keeps the first of several equal maxima, matching a strict ">" scan.
    top_score, winner = max(scored, key=lambda pair: pair[0])
    return winner if top_score > 0 else None

def align_parsed_with_whisper(parsed_segments, whisper_segments):
    """Attach timestamps to parsed transcript turns.

    Each parsed segment is paired with the segment chosen by
    ``find_best_match`` and receives that segment's ``start``/``end``.
    When no match is found both timestamps are ``None``.

    Returns:
        list[dict]: One dict per parsed segment with the keys ``speaker``,
        ``text``, ``start`` and ``end``.
    """
    aligned = []
    for turn in parsed_segments:
        hit = find_best_match(turn['text'], whisper_segments)
        start, end = (hit['start'], hit['end']) if hit else (None, None)
        aligned.append({
            'speaker': turn['speaker'],
            'text': turn['text'],
            'start': start,
            'end': end
        })
    return aligned

# Pair the previewed audio file with its transcript by swapping ".mp3" for
# ".md" in the filename; a KeyError here means no such transcript was parsed.
parsed_segments = all_transcripts_parsed[first_file.replace(".mp3", ".md")]  
whisper_segments = all_transcriptions[first_file]['segments']

aligned_segments = align_parsed_with_whisper(parsed_segments, whisper_segments)

# Preview the first five aligned turns with their assigned timestamps.
for seg in aligned_segments[:5]:
    print(f"Speaker: {seg['speaker']}, Start: {seg['start']}, End: {seg['end']}")
    print(f"Text: {seg['text']}\n")


Speaker: Daniel Whitenack, Start: 284.64, End: 291.12
Text: Welcome to another Fully Connected episode of the Practical AI podcast. This is Daniel Whitenack. I am the CEO and founder at Prediction Guard, and I'm joined by my co-host, Chris Benson, who is a principal AI research engineer at Lockheed Martin. In these Fully Connected episodes we try to keep you fully connected with everything that's happening in the machine learning and data science and AI worlds, and hopefully share some things with you that will help you level up your machine learning and AI game. How are you doing, Chris? It'll be fun to catch up on a few things that have been happening over the past couple weeks today.

Speaker: Chris Benson, Start: 302.18, End: 308.0
Text: There's so much going on, oh my gosh.

Speaker: Daniel Whitenack, Start: 491.08, End: 492.8
Text: Always, yes.

Speaker: Chris Benson, Start: 302.18, End: 308.0
Text: We'll have to pick and choose what we have time to hit here.

Speaker: Daniel Whi

### Data Segments to feed the model

# Input audio folder and the folder handed to chunk_multiple_files as its
# output location.
# NOTE(review): absolute, machine-specific paths; "Sample Train Set " appears
# to end with a space - confirm it matches the directory on disk.
audio_folder_path = "/Users/kiran/Desktop/TDM/Sample Train Set /Audio"
base_output_dir = "/Users/kiran/Desktop/TDM/Sample Train Set /Chunks"

def build_aligned_segments_dict(all_parsed_transcripts, all_whisper_transcriptions):
    """Align every parsed transcript with the Whisper output of its audio file.

    Args:
        all_parsed_transcripts (dict): Transcript filename (``*.md``) -> list
            of segment dicts with ``"speaker"`` and ``"text"`` keys.
        all_whisper_transcriptions (dict): Audio filename (``*.mp3``) -> result
            dict that has a ``"segments"`` list.

    Returns:
        dict: Audio filename -> list of aligned segments as produced by
        ``align_parsed_with_whisper``. Transcripts whose audio file is missing
        from ``all_whisper_transcriptions`` are reported and skipped.
    """
    aligned_segments_dict = {}

    for transcript_filename, parsed_segments in all_parsed_transcripts.items():
        # Swap only the trailing extension. str.replace would also rewrite a
        # ".md" that happens to occur earlier in the name.
        if transcript_filename.endswith(".md"):
            audio_filename = transcript_filename[:-len(".md")] + ".mp3"
        else:
            audio_filename = transcript_filename

        if audio_filename not in all_whisper_transcriptions:
            print(f"Audio file {audio_filename} not found in transcriptions.")
            continue

        whisper_segments = all_whisper_transcriptions[audio_filename]['segments']

        # Reuse the notebook's alignment helper instead of a second inline
        # copy of the same best-match loop.
        aligned_segments_dict[audio_filename] = align_parsed_with_whisper(
            parsed_segments, whisper_segments
        )

    return aligned_segments_dict


# Align every parsed transcript with its Whisper transcription.
aligned_segments_dict = build_aligned_segments_dict(all_transcripts_parsed, all_transcriptions)


# NOTE(review): chunk_multiple_files is not defined in the visible part of
# this notebook - confirm it is defined in a cell that runs before this one.
chunk_multiple_files(audio_folder_path, aligned_segments_dict, base_output_dir)


import os
import wave
import contextlib
import csv

def get_audio_duration(filepath):
    """Return the length of a WAV file in seconds (frame count / frame rate).

    Args:
        filepath (str): Path to a file readable by the ``wave`` module.

    Returns:
        float: Duration in seconds.
    """
    wav_reader = wave.open(filepath, 'r')
    try:
        frame_count = wav_reader.getnframes()
        frame_rate = wav_reader.getframerate()
        return frame_count / float(frame_rate)
    finally:
        # Always release the file handle, even if reading the header fails.
        wav_reader.close()

def prepare_yourtts_metadata(chunks_base_dir, output_csv_path):
    """Merge the per-episode ``metadata.csv`` files into one pipe-delimited file.

    Every sub-directory of ``chunks_base_dir`` is expected to contain a
    ``metadata.csv`` with ``file``, ``text`` and ``speaker`` columns; folders
    without one are reported and skipped. Each row becomes one output line of
    the form ``<file>|<duration, 2 decimals>|<text>|<speaker>``, with the
    duration taken from ``get_audio_duration(<file>)``.

    Args:
        chunks_base_dir (str): Directory holding one folder per episode.
        output_csv_path (str): Destination file; overwritten if it exists.
    """
    entries = []
    for folder_name in os.listdir(chunks_base_dir):
        episode_dir = os.path.join(chunks_base_dir, folder_name)
        if os.path.isdir(episode_dir):
            manifest = os.path.join(episode_dir, "metadata.csv")
            if os.path.exists(manifest):
                with open(manifest, newline='', encoding='utf-8') as handle:
                    for record in csv.DictReader(handle):
                        clip = record["file"]
                        spoken = record["text"]
                        who = record["speaker"]
                        seconds = get_audio_duration(clip)
                        # Line layout used for the YourTTS metadata file.
                        entries.append(f"{clip}|{seconds:.2f}|{spoken}|{who}")
            else:
                print(f"Warning: metadata.csv not found in {episode_dir}")

    # Write all collected lines in one pass.
    with open(output_csv_path, "w", encoding='utf-8') as out:
        out.writelines(entry + "\n" for entry in entries)
    print(f"YourTTS metadata saved to {output_csv_path}")

# Example usage: combine every per-episode metadata.csv under the Chunks
# folder into a single metadata file.
# NOTE(review): absolute, machine-specific paths - confirm before re-running.
chunks_folder = "/Users/kiran/Desktop/TDM/Sample Train Set /Chunks"
yourtts_metadata_path = "/Users/kiran/Desktop/TDM/yourtts_metadata.csv"
prepare_yourtts_metadata(chunks_folder, yourtts_metadata_path)


# Fetch the YourTTS code and move into the checkout.
# NOTE(review): the captured output below reports that the YourTTS directory
# has no requirements.txt, setup.py or pyproject.toml - confirm this is the
# intended repository to install from.
git clone https://github.com/freds0/YourTTS.git
cd YourTTS


import os
# Relative path: this succeeds only when the kernel's current working
# directory contains a "YourTTS" folder. It changes the working directory
# for every cell that runs afterwards.
os.chdir("YourTTS")

[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'[0m[31m
Obtaining file:///Users/kiran/Desktop/TDM/YourTTS
[31mERROR: file:///Users/kiran/Desktop/TDM/YourTTS does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0m