<a href="https://colab.research.google.com/github/JoelYanotka/text-summary-to-speech/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from IPython.display import clear_output

In [None]:
!pip install git+https://github.com/huggingface/transformers sentencepiece datasets
clear_output()

In [None]:
!pip install wikipedia
clear_output()

In [None]:
!pip install num2words
clear_output()

In [None]:
from transformers import pipeline

# Build the summarization pipeline once at notebook level; summarize_page()
# reads this `summarizer` global for every paragraph it condenses.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# Clear the cell output produced while the pipeline was being created.
clear_output()

In [None]:
import re

import wikipedia
from urllib.request import urlopen
from bs4 import BeautifulSoup

def summarize_page(wiki_title):
    '''
    Retrieves a Wikipedia page with the title given in the input, extracts
    its sections and paragraphs, and summarizes every non-empty paragraph
    with the module-level `summarizer` pipeline.
    Parameters:
        wiki_title: a string with the title of a Wikipedia page.
    Output:
        A list of strings with one summary per non-empty paragraph, in page
        order. The first entry is the summary of the page's lead text
        (`wiki.summary`). Sections at or after the first heading containing
        "References" are ignored.
    '''
    # Lower bound handed to the summarizer; max_length is never allowed to
    # drop below it (a one-word paragraph would otherwise ask for
    # max_length=1 < min_length=2).
    min_length = 2

    # Get a Wikipedia page by its title
    wiki = wikipedia.page(wiki_title)

    # Download the rendered page and make a soup; the context manager makes
    # sure the HTTP response is closed even if reading fails.
    with urlopen(wiki.url) as response:
        source = response.read()
    soup = BeautifulSoup(source, 'lxml')

    # page will contain the list of sections in the page, as delimited by html headlines
    # Add a list with the page title as the first element and the summary as the second
    page = [[soup.find('h1').get_text(), wiki.summary]]

    for header in soup.find_all(['h2', 'h3']):
        header_text = header.get_text()
        if header_text == 'Contents':
            continue
        if 'References' in header_text:
            break
        # Every element in the section list will be a list with the name of the
        # headline as the first element and the paragraphs as the next elements
        section = [header_text.replace('[edit]', '')]

        # Newer MediaWiki output wraps each heading in <div class="mw-heading">,
        # so the section's paragraphs are siblings of that wrapper rather than
        # of the <h2>/<h3> tag itself. Walk from the wrapper when it is there.
        anchor = header
        wrapper = header.parent
        if wrapper is not None and 'mw-heading' in (wrapper.get('class') or []):
            anchor = wrapper

        for elem in anchor.next_siblings:
            if not elem.name:
                # Plain text nodes between tags have no tag name.
                continue
            # Stop at next header (bare tag or wrapped heading)
            is_wrapped_heading = (elem.name == 'div'
                                  and 'mw-heading' in (elem.get('class') or []))
            if elem.name.startswith('h') or is_wrapped_heading:
                break
            if elem.name == 'p':
                # re.sub eliminates bracketed references such as "[1]"
                paragraph = re.sub(r'\[.*?\]+', '', elem.get_text())\
                    .replace('\n', '')\
                    .replace(u'\xa0', ' ')
                section.append(paragraph)
        page.append(section)

    print('The page is being summarized.')
    summaries = []
    for i, section in enumerate(page):
        # Skip paragraphs with no words: they would request max_length=0 and
        # contribute nothing to the audio.
        paragraphs = [text for text in section[1:] if text.split()]
        for j, paragraph in enumerate(paragraphs):
            # The word count is used as a cheap upper bound so the summary is
            # never asked to be longer than its source paragraph.
            word_count = len(paragraph.split())
            summary = summarizer(paragraph,
                                 max_length=max(word_count, min_length),
                                 min_length=min_length,
                                 do_sample=False,
                                 # Cut inputs that exceed the model's maximum
                                 # input size instead of failing on them.
                                 truncation=True)
            summaries.append(summary[0]['summary_text'])
            print(f'\rSection: {i+1}/{len(page)}\tParagraph: {j+1}/{len(paragraphs)}', end='')
    print('\rSummary successfully completed.')
    return summaries

Because the text-to-speech model cannot pronounce digits or measurement abbreviations, the text must be preprocessed first: numbers are spelled out as words and abbreviations are expanded to their full names.

In [None]:
import re
import num2words

def parse_numbers(text):
    '''
    Takes a string text as input, and returns a modified version of the input
    string where the numbers in the text have been converted to their textual
    form and measurement abbreviations have been replaced for their full name.
    '''
    # Common measurement abbreviations to replace for its full name
    measurements = {
    "mm": "milimeters",
    "cm": "centimeters",
    "m": "meters",
    "km": "kilometers",
    "mg": "milligrams",
    "g": "grams",
    "kg": "kilograms",
    "ml": "milliters",
    "l": "liters",
    "L": "liters",
    "in": "inches",
    "ft": "feet",
    "yd": "yards",
    "mi": "miles",
    "oz": "ounces",
    "lb": "pounds",
    "gal": "gallons"
}
    
    # Find numbers folowed by any of the previous measurement abbreviations and
    # replace the abbreviation for the full name
    pattern = r'(\d+)\s*(' + '|'.join(measurements.keys()) + r')\b'
    text = re.sub(pattern,
                  lambda m: m.group(1) + ' ' + measurements[m.group(2)],
                  text)

    # Eliminate the leading zeroes in a decimal number
    text = re.sub(r"(\.\d*?[1-9])0+\b", r"\1", text)

    # Find the numbers with comma separators and deletes the comma
    matches = re.findall(r"(\d+,\d+)", text)
    for match in matches:
        text = text.replace(match, match.replace(",", ""))

    # Find the decimal numbers and replaces the dot character for the word "point"
    matches = re.findall(r"(\d+.\d+)", text)
    for match in matches:
        text = text.replace(match, match.replace(".", " point "))

    # Replace numbers for words
    text = re.sub(r"(\d+)", lambda x: num2words.num2words(int(x.group(0))), text)

    return text

Now the text is ready to be converted into audio.
Because the text-to-speech model supports up to 600 words, it is necessary to create several audio files and then combine them into the final output.

In [None]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import soundfile as sf

# Load the SpeechT5 processor, text-to-speech model and HiFi-GAN vocoder
# checkpoints once; text2audio() uses these module-level objects as the
# default values of its parameters.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# load xvector containing speaker's voice characteristics from a dataset
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# Entry 7306 of the dataset is the voice used; unsqueeze(0) adds a leading
# dimension (presumably the batch axis generate_speech expects - TODO confirm).
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

def text2audio(text,
               output_name='output.wav',
               processor=processor,
               model=model,
               vocoder=vocoder,
               embeddings_dataset=embeddings_dataset,
               speaker_embeddings=speaker_embeddings):
    '''
    Synthesizes the given text as speech and saves it as an audio file.
    Parameters:
        text: the string to be spoken.
        output_name: path of the audio file to write.
        processor: turns the text into the model's input tensors.
        model: text-to-speech model that generates the speech.
        vocoder: passed to the model to produce the audio waveform.
        embeddings_dataset: accepted for compatibility; not used here.
        speaker_embeddings: tensor describing the voice to imitate.
    Output:
        None. The audio is written to `output_name` at 16000 Hz.
    '''
    # Encode the text as PyTorch tensors and keep only the token ids.
    encoded = processor(text=text, return_tensors="pt")
    token_ids = encoded["input_ids"]

    # Generate the waveform for those tokens with the chosen voice.
    waveform = model.generate_speech(token_ids,
                                     speaker_embeddings,
                                     vocoder=vocoder)

    samples = waveform.numpy()
    sf.write(output_name, samples, samplerate=16000)



In [None]:
import os
import wave

def merge_audio(list_of_text, out_dir=''):
    '''
    Takes a list of text strings as input and outputs a merged audio file
    containing the spoken versions of the text.
    Parameters:
        list_of_text (List[str]): A list of text strings to be converted to
        speech and merged together. It must not be empty.
        out_dir (str): The destination directory for the audio file.
        If not specified, it will be created in the current directory.
        The directory is created when it does not exist yet.
    Return:
        None. The merged audio file is saved in the specified directory.
    Raises:
        ValueError: if list_of_text is empty.
    '''
    if not list_of_text:
        raise ValueError('list_of_text is empty: there is nothing to convert to audio.')

    # Resolve and prepare the destination first, so a bad out_dir is reported
    # before any time is spent synthesizing speech.
    outfile = os.path.join(os.getcwd(), out_dir, "output.wav")
    os.makedirs(os.path.dirname(outfile), exist_ok=True)

    n_audios = len(list_of_text)
    # Only the files created by this call are merged and deleted; unrelated
    # or stale "tmp_output_*" files in the working directory are left alone.
    tmp_files = []
    try:
        for i, line in enumerate(list_of_text):
            tmp_name = f'tmp_output_{i:03d}.wav'
            # Registered before synthesis so a partially written file is
            # still cleaned up if text2audio fails.
            tmp_files.append(tmp_name)
            text2audio(text=line, output_name=tmp_name)
            print(f'\r{(i+1)/n_audios*100:.0f} % of the audio file completed.', end='')
        print('')

        with wave.open(outfile, 'wb') as output:
            for position, tmp_name in enumerate(tmp_files):
                with wave.open(tmp_name, 'rb') as chunk:
                    if position == 0:
                        # The first chunk defines channels, sample width and
                        # frame rate for the whole merged file.
                        output.setparams(chunk.getparams())
                    output.writeframes(chunk.readframes(chunk.getnframes()))
    finally:
        # Remove the temporary chunks whether or not the merge succeeded.
        for tmp_name in tmp_files:
            if os.path.exists(tmp_name):
                os.remove(tmp_name)

    print('Process completed. File path:')
    print(f'{outfile}')

In [None]:
if __name__ == '__main__':
    # Ask for the page title, trim it and put underscores in place of spaces.
    typed_title = input('Type the full title of a Wikipedia page: ')
    wiki_title = '_'.join(typed_title.strip().split(' '))

    # Summarize the page, then spell out numbers and measurement
    # abbreviations in every summary before it is turned into speech.
    summarized_text = list(map(parse_numbers, summarize_page(wiki_title)))

    # Convert all summaries to audio and merge them into a single file.
    merge_audio(summarized_text)