<a href="https://colab.research.google.com/github/Jovino-dev/MicrosoftLearnToAudio/blob/main/TextProcessor_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Audio extraction from a Microsoft Learn module
This notebook converts a full Microsoft Learn module to audio.

In [14]:
#@ Main input arguments
# Collect the four run parameters (module URL, language, speed, voice type).
# Inside Google Colab they are ipywidgets controls; anywhere the Colab/widget
# imports fail, they are read from console prompts instead.
definir_inputs = True
try:
    # `google.colab.widgets` used to be imported here as `widgets` too, but it
    # was immediately shadowed by ipywidgets below, so only ipywidgets is kept.
    from google.colab import output as colab_output
    from google.colab import files
    from IPython.display import display
    import ipywidgets as widgets
except ImportError:
    definir_inputs = False

if definir_inputs:
    url = widgets.Text(
        value='',
        placeholder='Pega la URL del módulo de Microsoft Learn',
        description='URL:',
        disabled=False
    )
    language = widgets.Text(value='es', description='Idioma:')
    speed = widgets.FloatSlider(value=1.0, min=0.5, max=2.0, step=0.1, description='Velocidad:')
    voice = widgets.Dropdown(options=['online', 'offline'], value='online', description='Voz:')
    display(url, language, speed, voice)
else:
    # Console fallback. Empty or invalid answers fall back to the same
    # defaults and limits the widgets use, instead of crashing on float('').
    url = input('URL del módulo: ').strip()
    language = input('Idioma (ej: es): ').strip() or 'es'
    speed_text = input('Velocidad (0.5-2.0): ').strip()
    try:
        speed = float(speed_text) if speed_text else 1.0
    except ValueError:
        print(f"Velocidad no válida ({speed_text!r}); se usará 1.0.")
        speed = 1.0
    # Keep the value inside the range the slider allows.
    speed = min(max(speed, 0.5), 2.0)
    voice = input('Voz (online/offline): ').strip().lower()
    if voice not in ('online', 'offline'):
        voice = 'online'


Text(value='', description='URL:', placeholder='Pega la URL del módulo de Microsoft Learn')

Text(value='es', description='Idioma:')

FloatSlider(value=1.0, description='Velocidad:', max=2.0, min=0.5)

Dropdown(description='Voz:', options=('online', 'offline'), value='online')

In [13]:
# Importar librerías necesarias
import re
from typing import List


# Define the TextProcessor class
class TextProcessor:
    """Clean scraped page text and shape it for text-to-speech.

    ``clean_and_structure`` runs four passes in order: character clean-up,
    line-based boilerplate filtering, heading/sentence punctuation and spacing
    normalisation. ``split_into_chunks`` cuts long text into bounded pieces.
    """

    def __init__(self):
        # (regex, replacement) pairs applied in order by ``_basic_cleanup``.
        self.cleanup_patterns = [
            # Collapse horizontal whitespace only. Newlines must survive this
            # step: every later pass splits on '\n', and flattening the text
            # to one line made a single filter phrase discard the whole page.
            (r'[^\S\n]+', ' '),
            # Three or more line breaks (with optional blanks) become one
            # blank line.
            (r'\n\s*\n\s*\n+', '\n\n'),
            # Drop every character outside the allowed set.
            (r'[^\w\s\-.,;:!?¡¿áéíóúñüÁÉÍÓÚÑÜ()[\]"\'/]', ''),
            # Collapse repeated terminal punctuation.
            (r'\.{2,}', '.'),
            (r'\?{2,}', '?'),
            (r'!{2,}', '!'),
        ]
        # A line containing any of these (case-insensitive substring match)
        # is removed by ``_filter_unwanted_content``.
        self.filter_phrases = [
            'skip to main content', 'breadcrumb navigation', 'table of contents',
            'in this article', 'next steps', 'feedback', 'was this page helpful',
            'submit and view feedback', 'microsoft learn', 'sign in', 'search',
            'browse', 'theme', 'light', 'dark', 'high contrast', 'previous unit',
            'next unit', 'completed', 'check your knowledge', 'knowledge check'
        ]

    def clean_and_structure(self, raw_text: str) -> str:
        """Return ``raw_text`` cleaned, filtered and punctuated.

        Returns an empty string when ``raw_text`` is empty or ``None``.
        """
        if not raw_text:
            return ""
        text = self._basic_cleanup(raw_text)
        text = self._filter_unwanted_content(text)
        text = self._structure_for_audio(text)
        text = self._normalize_spacing(text)
        return text.strip()

    def _basic_cleanup(self, text: str) -> str:
        """Apply every ``cleanup_patterns`` substitution in order."""
        for pattern, replacement in self.cleanup_patterns:
            text = re.sub(pattern, replacement, text)
        return text

    def _filter_unwanted_content(self, text: str) -> str:
        """Drop short lines, filter-phrase lines, links and duration/step labels."""
        kept_lines = []
        for line in text.split('\n'):
            line = line.strip()
            if len(line) < 3:
                continue
            line_lower = line.lower()
            if any(phrase in line_lower for phrase in self.filter_phrases):
                continue
            if (line_lower.startswith(('http', 'www.', 'mailto:')) or
                    line_lower.endswith(('min', 'sec', 'hr')) or
                    re.match(r'^\d+\s*(min|sec|hr|minute|second|hour)', line_lower) or
                    re.match(r'^(step \d+|unit \d+|\d+\.)$', line_lower)):
                continue
            kept_lines.append(line)
        return '\n'.join(kept_lines)

    def _structure_for_audio(self, text: str) -> str:
        """End every line with terminal punctuation; follow headings with a blank line."""
        structured_lines = []
        for line in text.split('\n'):
            line = line.strip()
            if not line:
                continue
            if self._is_heading(line):
                structured_lines.append(f"{line}.")
                structured_lines.append("")
            else:
                if line[-1] not in '.!?':
                    line += '.'
                structured_lines.append(line)
        return '\n'.join(structured_lines)

    def _is_heading(self, line: str) -> bool:
        """Heuristic: a short, unpunctuated line that is upper case, title case
        or contains at least one all-caps word."""
        return (len(line) < 100 and
                not line.endswith(('.', '!', '?', ',', ';', ':')) and
                (line.isupper() or line.istitle() or
                 any(word.isupper() for word in line.split())))

    def _normalize_spacing(self, text: str) -> str:
        """Collapse repeated spaces and blank lines, then strip each line."""
        text = re.sub(r' +', ' ', text)
        text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
        lines = [line.strip() for line in text.split('\n')]
        return '\n'.join(lines)

    def split_into_chunks(self, text: str, max_chunk_size: int = 4000) -> List[str]:
        """Split ``text`` into chunks of at most ``max_chunk_size`` characters.

        Text that already fits is returned unchanged as a one-element list.
        Otherwise the text is cut at sentence boundaries, keeping each
        sentence's own terminal punctuation, and sentences are packed into
        chunks joined by a single space. A sentence longer than the limit is
        cut at the last space that fits, or mid-word when it has no space.

        Raises:
            ValueError: if ``max_chunk_size`` is smaller than 1.
        """
        if max_chunk_size < 1:
            raise ValueError("max_chunk_size must be a positive integer")
        if len(text) <= max_chunk_size:
            return [text]
        chunks = []
        current_chunk = ""
        # The look-behind keeps '.', '!' and '?' attached to their sentence.
        for sentence in re.split(r'(?<=[.!?])\s+', text):
            sentence = sentence.strip()
            if not sentence:
                continue
            for piece in self._split_long_sentence(sentence, max_chunk_size):
                candidate = f"{current_chunk} {piece}" if current_chunk else piece
                if len(candidate) <= max_chunk_size:
                    current_chunk = candidate
                else:
                    chunks.append(current_chunk)
                    current_chunk = piece
        if current_chunk:
            chunks.append(current_chunk)
        return chunks

    @staticmethod
    def _split_long_sentence(sentence: str, max_chunk_size: int) -> List[str]:
        """Cut one sentence into pieces no longer than ``max_chunk_size``."""
        pieces = []
        remaining = sentence
        while len(remaining) > max_chunk_size:
            # Last space at or before the limit; hard cut when there is none.
            cut = remaining.rfind(' ', 0, max_chunk_size + 1)
            if cut <= 0:
                cut = max_chunk_size
            pieces.append(remaining[:cut].rstrip())
            remaining = remaining[cut:].lstrip()
        if remaining:
            pieces.append(remaining)
        return pieces

  # Inputs para argumentos principales
# This input setup also runs in the notebook's first cell. Running it again
# here would replace widgets the user has already filled in (or ask the same
# console questions twice outside Colab), so it only executes when the inputs
# do not exist yet.
if not all(name in globals() for name in ('definir_inputs', 'url', 'language', 'speed', 'voice')):
    definir_inputs = True
    try:
        from google.colab import widgets
        from google.colab import output as colab_output
        from google.colab import files
        from IPython.display import display
        import ipywidgets as widgets
    except ImportError:
        definir_inputs = False

    if definir_inputs:
        url = widgets.Text(
            value='',
            placeholder='Pega la URL del módulo de Microsoft Learn',
            description='URL:',
            disabled=False
        )
        language = widgets.Text(value='es', description='Idioma:')
        speed = widgets.FloatSlider(value=1.0, min=0.5, max=2.0, step=0.1, description='Velocidad:')
        voice = widgets.Dropdown(options=['online', 'offline'], value='online', description='Voz:')
        display(url, language, speed, voice)
    else:
        url = input('URL del módulo: ')
        language = input('Idioma (ej: es): ')
        speed = float(input('Velocidad (0.5-2.0): '))
        voice = input('Voz (online/offline): ')


Text(value='', description='URL:', placeholder='Pega la URL del módulo de Microsoft Learn')

Text(value='es', description='Idioma:')

FloatSlider(value=1.0, description='Velocidad:', max=2.0, min=0.5)

Dropdown(description='Voz:', options=('online', 'offline'), value='online')

## Parámetros de entrada
Introduce la URL del módulo, el idioma, la velocidad y el tipo de voz para el procesamiento.

## Descargar y procesar las unidades del módulo
Esta celda descarga las unidades, procesa el texto y genera los audios.

In [10]:
# Install the dependencies this notebook needs on Colab
def instalar_dependencias():
    """Install the Python packages and system tools used by this notebook.

    Both commands are IPython shell escapes (`!`), so this function only works
    inside an IPython/Jupyter kernel. The first runs pip through the kernel's
    own interpreter (`sys.executable`) to install gtts, pydub, requests,
    beautifulsoup4 and pyttsx3; the second uses apt-get to install ffmpeg and
    espeak.
    """
    import sys
    !{sys.executable} -m pip install gtts pydub requests beautifulsoup4 pyttsx3
    !apt-get install -y ffmpeg espeak

# Run the installation before the third-party imports below.
instalar_dependencias()

import requests
from bs4 import BeautifulSoup
from gtts import gTTS
from pydub import AudioSegment
import os
import shutil
import zipfile
import sys

# Optional offline engine: record whether pyttsx3 can be imported so the
# pipeline can decide between offline and online synthesis.
try:
    import pyttsx3
except ImportError:
    offline_tts_available = False
else:
    offline_tts_available = True

# Collect the unit links of a module (anchors with class='unit-title')
def get_units_links(url, timeout=30):
    """Return the absolute URLs of the units linked from a module page.

    Args:
        url: Address of the module page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        A list of unit URLs in page order, without duplicates. Relative
        ``href`` values are resolved against ``url``.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    links = (
        requests.compat.urljoin(url, anchor['href'])
        for anchor in soup.find_all('a', class_='unit-title', href=True)
    )
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(links))

# Extract the title of a page
def extract_title(soup):
    """Return a non-empty title for a parsed page.

    Tries the first element matching the page-title/heading selectors, then
    the ``<title>`` tag, and finally falls back to ``"modulo"``. A candidate
    whose text is empty or whitespace-only is skipped, because callers use the
    result to build directory and file names.

    Args:
        soup: Parsed document exposing ``select_one`` and ``find``.

    Returns:
        The stripped title text, or ``"modulo"`` when none is found.
    """
    heading = soup.select_one('h1[data-bi-name="page-title"], h1.title, h1, .page-title h1, [data-bi-name="page-title"]')
    if heading:
        heading_text = heading.get_text().strip()
        if heading_text:
            return heading_text
    title_tag = soup.find('title')
    if title_tag:
        title_text = title_tag.get_text().strip()
        if title_text:
            return title_text
    return "modulo"

# Offline text-to-speech with pyttsx3
def text_to_audio_offline(text, output_path, language='es', speed=1.0):
    """Synthesise ``text`` with pyttsx3 and store it as an MP3 file.

    pyttsx3 writes a temporary WAV file next to ``output_path``; pydub then
    converts it to MP3 and the WAV file is removed, even when the conversion
    fails.

    Args:
        text: Text to read aloud.
        output_path: Destination path of the MP3 file.
        language: Language code looked up (case-insensitively, as a
            substring) in the id and name of each installed voice. The first
            match wins; with no match the engine's default voice is kept.
        speed: Multiplier applied to a base rate of 200.
    """
    engine = pyttsx3.init()
    # Pick a voice for the requested language when one is installed.
    wanted = language.lower()
    selected_voice = None
    for v in engine.getProperty('voices'):
        # id or name may be missing depending on the driver; treat as empty.
        voice_id = (v.id or '').lower()
        voice_name = (v.name or '').lower()
        if wanted in voice_id or wanted in voice_name:
            selected_voice = v.id
            break
    if selected_voice:
        engine.setProperty('voice', selected_voice)
    engine.setProperty('rate', int(200 * speed))
    # Swap only the final extension. str.replace would also rewrite '.mp3'
    # inside directory names, and would leave the path unchanged when it has
    # no '.mp3', so the finished file would then be deleted as the "temp" WAV.
    base_path, _ = os.path.splitext(output_path)
    wav_path = base_path + '.wav'
    if wav_path == output_path:
        wav_path = base_path + '.tmp.wav'
    try:
        engine.save_to_file(text, wav_path)
        engine.runAndWait()
        # Convert the intermediate WAV to MP3.
        audio = AudioSegment.from_wav(wav_path)
        audio.export(output_path, format='mp3')
    finally:
        if os.path.exists(wav_path):
            os.remove(wav_path)

# Main processing pipeline
def procesar_modulo(url, language, speed, voice):
    """Download every unit of a module and save one MP3 per unit.

    Args:
        url: Address of the module page.
        language: Language code passed to the speech engine.
        speed: Speed multiplier. Only the offline engine uses it; the gTTS
            call always runs with ``slow=False``.
        voice: ``'offline'`` selects pyttsx3 when it is available; any other
            value (or a missing pyttsx3) selects gTTS.

    Returns:
        The output directory name, or ``None`` when the module page has no
        unit links.
    """
    request_timeout = 30  # seconds; avoids hanging forever on a stalled server

    units = get_units_links(url)
    if not units:
        print("No se encontraron unidades en el módulo.")
        return None
    print(f"{len(units)} unidades encontradas.")

    # The module title names the output directory.
    response = requests.get(url, timeout=request_timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    module_title = extract_title(soup)
    safe_module_title = module_title.replace(' ', '_').replace('/', '_')
    output_dir = f"output_{safe_module_title}"
    os.makedirs(output_dir, exist_ok=True)

    use_offline = voice == 'offline' and offline_tts_available
    if voice == 'offline' and not offline_tts_available:
        # Make the fallback visible instead of switching engines silently.
        print("pyttsx3 no está disponible; se usará la voz online.")

    # The processor keeps no per-document state, so one instance is enough.
    processor = TextProcessor()
    for idx, unit_url in enumerate(units, 1):
        print(f"Procesando unidad {idx}: {unit_url}")
        unit_resp = requests.get(unit_url, timeout=request_timeout)
        unit_soup = BeautifulSoup(unit_resp.content, 'html.parser')
        unit_title = extract_title(unit_soup)
        safe_title = unit_title.replace(' ', '_').replace('/', '_')
        # Prefer the main content container; fall back to the whole page.
        main_content = unit_soup.select_one('main, .content, [role="main"], .main-content, article, .module-content')
        if not main_content:
            main_content = unit_soup
        text = main_content.get_text(separator='\n')
        processed_text = processor.clean_and_structure(text)
        if not processed_text:
            # Nothing to read aloud: skip this unit rather than let the speech
            # engine fail on empty text and abort the remaining units.
            print(f"Unidad {idx} sin texto utilizable; se omite.")
            continue
        # Convert to audio.
        audio_path = os.path.join(output_dir, f"unidad_{idx}-{safe_title}.mp3")
        if use_offline:
            text_to_audio_offline(processed_text, audio_path, language=language, speed=speed)
        else:
            tts = gTTS(text=processed_text, lang=language, slow=False)
            tts.save(audio_path)
        print(f"Audio guardado: {audio_path}")
    return output_dir

# Run the pipeline. Widget objects expose the chosen value through `.value`;
# the console fallback already stored plain values.
if definir_inputs:
    dir_modulo = procesar_modulo(url.value, language.value, speed.value, voice.value)
else:
    dir_modulo = procesar_modulo(url, language, speed, voice)


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
espeak is already the newest version (1.48.15+dfsg-3).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
13 unidades encontradas.
Procesando unidad 1: https://learn.microsoft.com/es-es/training/modules/introduction-power-platform/1-introduction
Audio guardado: output_Describir_el_valor_empresarial_de_Microsoft_Power_Platform/unidad_1-Introducción.mp3
Procesando unidad 2: https://learn.microsoft.com/es-es/training/modules/introduction-power-platform/2-explore-microsoft-power-platform
Audio guardado: output_Describir_el_valor_empresarial_de_Microsoft_Power_Platform/unidad_2-Explorar_Microsoft_Power_Platform.mp3
Procesando unidad 3: https://learn.microsoft.com/es-es/training/modules/introduction-power-platform/3-describe-business-value-power-platform
Audio guardado: output_Describir_el_valor_empresarial_de_Microsoft

## Descargar el módulo como ZIP
Al finalizar, descarga todos los audios en un archivo comprimido.

In [9]:
# Build a ZIP with the module's audio files and offer it for download
if dir_modulo:
    # Name the archive after the module directory. The previous f-string had
    # no placeholder, so every module overwrote the same "output.zip".
    zip_path = f"{dir_modulo}.zip"
    archive_root = os.path.basename(dir_modulo)
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        # `filenames` avoids shadowing the `files` module imported from Colab.
        for root, _, filenames in os.walk(dir_modulo):
            for filename in filenames:
                zipf.write(os.path.join(root, filename), arcname=os.path.join(archive_root, filename))
    print(f"ZIP creado: {zip_path}")
    try:
        from google.colab import files
    except ImportError:
        # Outside Colab there is no browser download helper; the archive
        # stays on disk at the printed path.
        print(f"Descarga automática no disponible fuera de Colab. Archivo: {zip_path}")
    else:
        files.download(zip_path)
else:
    print("No se generó ningún módulo para comprimir.")


ZIP creado: output_Describir_el_valor_empresarial_de_Microsoft_Power_Platform.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>