<a href="https://colab.research.google.com/github/Jovino-dev/MicrosoftLearnToAudio/blob/main/TextProcessor_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Audio extraction from a Microsoft Learn module
This notebook converts a full Microsoft Learn module to audio.

## Download and process all units from the module
This cell downloads the units, processes the text, and generates the audio files.

In [None]:
# @title Execution {"run":"auto","vertical-output":true,"display-mode":"form"}
# Form inputs for this cell. The three values below are forwarded to
# procesar_modulo() at the end of the cell.
# Address of the module page whose units are downloaded.
url = "https://learn.microsoft.com/es-es/training/modules/introduction-power-platform/" # @param {"type":"string"}
# Language code handed to the speech synthesizer.
language = "es" # @param ["es","en"]
# Requested speech-speed factor selected with the slider.
speed = 1.2 # @param {"type":"slider","min":1,"max":2,"step":0.1}
import re
from typing import List
import requests
from bs4 import BeautifulSoup
from gtts import gTTS
from pydub import AudioSegment
import os
import shutil
import zipfile
import sys

# Define the TextProcessor class for cleaning and structuring text.
class TextProcessor:
    """Clean raw page text and shape it for text-to-speech.

    clean_and_structure() runs four passes in order: regex cleanup, line
    filtering, heading/sentence punctuation, and spacing normalization.
    split_into_chunks() can then cut the result into size-limited pieces.
    """

    def __init__(self):
        # (pattern, replacement) pairs applied in order by _basic_cleanup().
        self.cleanup_patterns = [
            # Collapse horizontal whitespace only. Newlines are kept on purpose:
            # the filtering and structuring passes split the text on '\n', so
            # folding them into spaces would turn the input into a single line
            # and one filter phrase anywhere would discard the whole text.
            (r'[^\S\n]+', ' '),
            (r'\n\s*\n\s*\n+', '\n\n'), # Reduce multiple empty lines to a single empty line.
            (r'[^\w\s\-.,;:!?¡¿áéíóúñüÁÉÍÓÚÑÜ()[\]"\'/]', ''), # Remove unwanted characters.
            (r'\.{2,}', '.'), # Reduce multiple periods to a single period.
            (r'\?{2,}', '?'), # Reduce multiple question marks to a single question mark.
            (r'!{2,}', '!'), # Reduce multiple exclamation marks to a single exclamation mark.
        ]
        # Lowercase phrases; a line containing any of them is dropped.
        self.filter_phrases = [
            'skip to main content', 'breadcrumb navigation', 'table of contents',
            'in this article', 'next steps', 'feedback', 'was this page helpful',
            'submit and view feedback', 'microsoft learn', 'sign in', 'search',
            'browse', 'theme', 'light', 'dark', 'high contrast', 'previous unit',
            'next unit', 'completed', 'check your knowledge', 'knowledge check'
        ]

    def clean_and_structure(self, raw_text):
        """Return raw_text cleaned, filtered and punctuated for reading aloud.

        Returns an empty string when raw_text is empty or None.
        """
        if not raw_text:
            return ""
        text = self._basic_cleanup(raw_text)
        text = self._filter_unwanted_content(text)
        text = self._structure_for_audio(text)
        text = self._normalize_spacing(text)
        return text.strip()

    def _basic_cleanup(self, text):
        """Apply every cleanup pattern, in list order, to text."""
        for pattern, replacement in self.cleanup_patterns:
            text = re.sub(pattern, replacement, text)
        return text

    def _is_noise_line(self, line_lower):
        """Return True when a lowercased line should be dropped.

        A line is dropped when it contains a filter phrase, looks like a
        URL, ends with or starts as a time duration, or is only a
        step/unit number.
        """
        if any(phrase in line_lower for phrase in self.filter_phrases):
            return True
        return bool(
            line_lower.startswith(('http', 'www.', 'mailto:'))
            or line_lower.endswith(('min', 'sec', 'hr'))
            or re.match(r'^\d+\s*(min|sec|hr|minute|second|hour)', line_lower)
            or re.match(r'^(step \d+|unit \d+|\d+\.)$', line_lower)
        )

    def _filter_unwanted_content(self, text):
        """Drop lines shorter than 3 characters and lines flagged as noise."""
        kept_lines = []
        for line in text.split('\n'):
            line = line.strip()
            if len(line) < 3: # Skip very short lines.
                continue
            if not self._is_noise_line(line.lower()):
                kept_lines.append(line)
        return '\n'.join(kept_lines)

    def _structure_for_audio(self, text):
        """Terminate every line with punctuation; set headings apart.

        Headings get a trailing period and a following empty line; other
        lines get a period only when they do not already end in '.', '!'
        or '?'.
        """
        structured_lines = []
        for line in text.split('\n'):
            line = line.strip()
            if not line:
                continue
            if self._is_heading(line):
                structured_lines.append(f"{line}.") # Add period to headings.
                structured_lines.append("") # Add empty line after headings.
            else:
                if line[-1] not in '.!?':
                    line += '.' # Add period if line doesn't end with punctuation.
                structured_lines.append(line)
        return '\n'.join(structured_lines)

    def _is_heading(self, line):
        """Heuristically decide whether line is a heading.

        True for lines under 100 characters that do not end in punctuation
        and are all caps, title case, or contain an all-caps word.
        """
        return (len(line) < 100 and
                not line.endswith(('.', '!', '?', ',', ';', ':')) and
                (line.isupper() or line.istitle() or
                 any(word.isupper() for word in line.split())))

    def _normalize_spacing(self, text):
        """Collapse repeated spaces and blank lines; strip every line."""
        text = re.sub(r' +', ' ', text) # Replace multiple spaces with single space.
        text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text) # Replace multiple empty lines with single empty line.
        lines = [line.strip() for line in text.split('\n')]
        return '\n'.join(lines)

    def split_into_chunks(self, text, max_chunk_size=4000):
        """Split text into chunks of at most max_chunk_size characters.

        Text that already fits is returned as a single-element list.
        Otherwise the text is cut after sentence-ending punctuation, which
        stays attached to its sentence, and sentences are packed greedily,
        joined by one space. A single sentence longer than max_chunk_size
        is emitted as its own oversized chunk rather than cut mid-sentence.
        """
        if len(text) <= max_chunk_size:
            return [text]
        # The lookbehind splits on the whitespace *after* the terminator so
        # '!' and '?' are preserved instead of being replaced by '.'.
        sentences = re.split(r'(?<=[.!?])\s+', text)
        chunks = []
        current_chunk = ""
        for sentence in sentences:
            if not sentence:
                continue
            # Measure the chunk as it would really be, separator included.
            candidate = f"{current_chunk} {sentence}" if current_chunk else sentence
            if current_chunk and len(candidate) > max_chunk_size:
                chunks.append(current_chunk.strip())
                current_chunk = sentence
            else:
                current_chunk = candidate
        # Add the last chunk if it's not empty.
        if current_chunk:
            chunks.append(current_chunk.strip())
        return chunks

# Function to install necessary dependencies in Colab.
def instalar_dependencias():
    """Install the Python packages and system tools used by this notebook.

    The lines starting with "!" are IPython/Colab shell escapes, so this
    function is only valid when the cell is executed by a notebook kernel.
    """
    import sys
    # Invoke pip through the running interpreter so the packages are installed
    # for the same Python that executes the notebook.
    !{sys.executable} -m pip install gtts pydub requests beautifulsoup4 pyttsx3
    # System packages; presumably the audio backends needed by pydub (ffmpeg)
    # and pyttsx3 (espeak) — confirm.
    !apt-get install -y ffmpeg espeak

# Run the installer defined above before probing for the optional engine.
# NOTE(review): the third-party imports near the top of this cell execute
# before this call; on a runtime where gtts/pydub are not yet installed they
# would fail first — confirm the intended order.
instalar_dependencias()

# pyttsx3 (offline TTS) is optional: record whether it could be imported.
try:
    import pyttsx3
except ImportError:
    offline_tts_available = False
else:
    offline_tts_available = True

# Function to get unit links from the module URL.
def get_units_links(url, timeout=30):
    """Return the absolute URLs of the units linked from a module page.

    Args:
        url: Address of the module page.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, requests may block indefinitely on a stalled
            connection.

    Returns:
        A list of unit URLs in page order, without duplicates. The list is
        empty when the page has no links with class 'unit-title'.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    units = []
    seen = set()  # Constant-time duplicate check; `units` keeps page order.
    # Find all links with class 'unit-title'.
    for a in soup.find_all('a', class_='unit-title', href=True):
        # hrefs may be relative, so resolve them against the module URL.
        full_url = requests.compat.urljoin(url, a['href'])
        if full_url not in seen:
            seen.add(full_url)
            units.append(full_url)
    return units

# Function to extract the title of the page.
def extract_title(soup):
    """Return the page title found in a parsed document.

    The heading selectors are tried first, then the HTML <title> tag. A
    candidate whose text is empty after stripping is skipped, so the
    result is never an empty string.

    Args:
        soup: Parsed document exposing select_one() and find().

    Returns:
        The stripped title text, or "modulo" when no usable title exists.
    """
    # Try different selectors to find the page title.
    title_elem = soup.select_one('h1[data-bi-name="page-title"], h1.title, h1, .page-title h1, [data-bi-name="page-title"]')
    if title_elem:
        heading_text = title_elem.get_text().strip()
        if heading_text:
            return heading_text
    # Fallback to the HTML <title> tag.
    title_tag = soup.find('title')
    if title_tag:
        page_title = title_tag.get_text().strip()
        if page_title:
            return page_title
    return "modulo" # Default title if none found.

# Function to convert text to audio using pyttsx3 (offline).
def text_to_audio_offline(text, output_path, language='es', speed=1.0):
    """Synthesize text to an MP3 file with pyttsx3 (offline).

    The speech is first written to a temporary WAV file next to
    output_path, converted to MP3 with pydub, and the WAV is then removed.

    Args:
        text: Text to speak.
        output_path: Destination path of the MP3 file.
        language: Substring looked for in the voice id or name to pick a
            voice; the engine default is kept when nothing matches.
        speed: Multiplier applied to a base rate of 200.
    """
    engine = pyttsx3.init()
    # Configure language/voice if possible.
    voices = engine.getProperty('voices')
    selected_voice = None
    for v in voices:
        # Tolerate voices whose id or name is unset instead of crashing.
        if language in (v.id or '') or language in (v.name or '').lower():
            selected_voice = v.id
            break
    if selected_voice:
        engine.setProperty('voice', selected_voice)
    engine.setProperty('rate', int(200 * speed)) # Set speech speed.
    # Derive the temporary name from the extension only. A plain
    # str.replace('.mp3', '.wav') would rewrite every '.mp3' in the path and,
    # for a path without '.mp3', would make the temp file the output itself.
    base_path, _ = os.path.splitext(output_path)
    wav_path = f"{base_path}.tmp.wav"
    engine.save_to_file(text, wav_path) # Save audio as WAV.
    engine.runAndWait()
    try:
        # Convert WAV to MP3.
        audio = AudioSegment.from_wav(wav_path)
        audio.export(output_path, format='mp3')
    finally:
        # Remove the temporary WAV file even when the conversion fails.
        if os.path.exists(wav_path):
            os.remove(wav_path)

# Main processing function.
def procesar_modulo(url, language, speed):
    """Download every unit of a module and save one MP3 per unit.

    Args:
        url: Address of the module page.
        language: Language code passed to gTTS.
        speed: Playback-speed factor applied to each generated file;
            1 leaves the audio untouched.

    Returns:
        The path of the output directory, or None when the module page
        lists no units.
    """
    units = get_units_links(url)
    if not units:
        print("No se encontraron unidades en el módulo.")
        return None
    print(f"{len(units)} unidades encontradas.")
    # Get module title for output directory name.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    module_title = extract_title(soup)
    safe_module_title = module_title.replace(' ', '_').replace('/', '_')
    output_dir = f"output_{safe_module_title}"
    os.makedirs(output_dir, exist_ok=True) # Create output directory.

    # One processor is enough; it keeps no per-unit state.
    processor = TextProcessor()

    # Process each unit.
    for idx, unit_url in enumerate(units, 1):
        print(f"Procesando unidad {idx}: {unit_url}")
        unit_resp = requests.get(unit_url, timeout=30)
        unit_soup = BeautifulSoup(unit_resp.content, 'html.parser')
        unit_title = extract_title(unit_soup)
        safe_title = unit_title.replace(' ', '_').replace('/', '_')

        # Extract main content text.
        main_content = unit_soup.select_one('main, .content, [role="main"], .main-content, article, .module-content')
        if not main_content:
            main_content = unit_soup
        text = main_content.get_text(separator='\n')

        # Process and clean the extracted text.
        processed_text = processor.clean_and_structure(text)
        if not processed_text:
            # gTTS refuses empty input; skip the unit instead of aborting
            # the whole module.
            print(f"Unidad {idx} sin texto para convertir; se omite.")
            continue

        # Convert processed text to audio using gTTS (online).
        audio_path = os.path.join(output_dir, f"unidad_{idx}-{safe_title}.mp3")
        tts = gTTS(text=processed_text, lang=language, slow=False)
        tts.save(audio_path)
        # gTTS has no rate control, so the requested speed is applied afterwards.
        _apply_speed(audio_path, speed)
        print(f"Audio guardado: {audio_path}")

    return output_dir # Return the path to the output directory.


def _apply_speed(audio_path, speed):
    """Re-encode the MP3 at audio_path at the given tempo, in place.

    Uses ffmpeg's atempo filter, which changes tempo without changing
    pitch. Does nothing when speed is missing or equal to 1. On failure
    the original file is kept and a message is printed.
    """
    import subprocess

    if not speed or abs(speed - 1.0) < 1e-9:
        return
    # Stay inside the range a single atempo filter accepts on all versions.
    tempo = min(max(float(speed), 0.5), 2.0)
    tmp_path = f"{audio_path}.tmp.mp3"
    command = [
        'ffmpeg', '-y', '-loglevel', 'error',
        '-i', audio_path,
        '-filter:a', f'atempo={tempo:.3f}',
        tmp_path,
    ]
    try:
        subprocess.run(command, check=True)
    except (OSError, subprocess.CalledProcessError) as exc:
        print(f"No se pudo ajustar la velocidad ({exc}); se conserva el audio original.")
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        return
    # Swap in the re-encoded file only after ffmpeg finished successfully.
    os.replace(tmp_path, audio_path)

# Run the conversion with the form values defined at the top of this cell.
# dir_modulo receives the output directory path, or None when no units were
# found; the download cell reads it to decide what to package.
dir_modulo = procesar_modulo(url, language, speed)

## Download Zip
When the conversion is finished, download the zip file with the audio files.

In [None]:
# @title Download {"display-mode":"form"}
# Bundle the generated audio files into one archive and offer it for download.
if not dir_modulo:
    # The conversion step produced no output directory.
    print("No se generó ningún módulo para comprimir.")
else:
    zip_path = "output.zip"
    # Entries are stored under the output directory's own name.
    archive_folder = os.path.basename(dir_modulo)
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for folder, _, filenames in os.walk(dir_modulo):
            for filename in filenames:
                source_file = os.path.join(folder, filename)
                zipf.write(source_file, arcname=os.path.join(archive_folder, filename))
    print(f"ZIP creado: {zip_path}")
    # The Colab helper is imported here, after the archive has been written.
    from google.colab import files
    files.download(zip_path)