In [1]:
!pip install PyPDF2 transformers pyttsx3 googletrans==4.0.0-rc1



In [2]:
!pip install tqdm



In [3]:
!pip install gtts



In [5]:
# Install required libraries
!pip install transformers torch soundfile datasets PyPDF2 googletrans==4.0.0-rc1

import sys
import os
import re
import torch
import soundfile as sf
from PyPDF2 import PdfReader
from transformers import pipeline
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
from googletrans import Translator
import numpy as np
from concurrent.futures import ThreadPoolExecutor

# Set device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load models and datasets once (caching for reuse)
def load_models():
    """Load and cache all models and datasets."""
    print("Loading models...")
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0 if torch.cuda.is_available() else -1)
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    translator = Translator()
    print("✓ Models loaded")
    return summarizer, processor, model, vocoder, embeddings_dataset, translator

# Load models and datasets globally
summarizer, processor, model, vocoder, embeddings_dataset, translator = load_models()

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    try:
        reader = PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
        return text.strip()
    except Exception as e:
        raise RuntimeError(f"Failed to extract text from PDF: {str(e)}")

def preprocess_text(text):
    """Clean and preprocess the text for better TTS output."""
    # Remove special characters and symbols
    text = re.sub(r'[^\w\s.,!?]', '', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def chunk_text(text, max_chars=600):
    """Split text into smaller chunks for TTS processing."""
    chunks = []
    current_chunk = ""
    for sentence in text.split(". "):  # Split by sentences
        if len(current_chunk) + len(sentence) + 2 <= max_chars:  # +2 for ". "
            current_chunk += sentence + ". "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + ". "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

def calculate_summary_length(input_length):
    """Calculate appropriate max_length for summarization based on input length."""
    # Aim for a summary that's about 40-60% of the input length
    max_length = max(30, min(int(input_length * 0.6), 130))
    min_length = max(20, int(max_length * 0.4))
    return max_length, min_length

def summarize_text(text):
    """Summarize text using a pre-trained model with dynamic length handling."""
    if not text.strip():
        return "No text content to summarize."

    try:
        # Handle long texts by chunking and summarizing separately
        if len(text) > 1000:
            chunks = chunk_text(text)
            summaries = []
            for chunk in chunks:
                if chunk.strip():
                    # Calculate appropriate lengths for this chunk
                    input_length = len(chunk.split())
                    max_length, min_length = calculate_summary_length(input_length)

                    summary = summarizer(chunk,
                                      max_length=max_length,
                                      min_length=min_length,
                                      do_sample=False,
                                      truncation=True)[0]['summary_text']
                    summaries.append(summary)
            return " ".join(summaries)
        else:
            # Calculate appropriate lengths for the entire text
            input_length = len(text.split())
            max_length, min_length = calculate_summary_length(input_length)

            summary = summarizer(text,
                               max_length=max_length,
                               min_length=min_length,
                               do_sample=False,
                               truncation=True)[0]['summary_text']
            return summary
    except Exception as e:
        raise RuntimeError(f"Failed to summarize the text: {str(e)}")

def generate_voiceover(text, language='en'):
    """Generate voiceover from text using a pre-trained TTS model from Hugging Face."""
    if not text.strip():
        raise ValueError("No text provided for voiceover generation")

    # Preprocess the text
    text = preprocess_text(text)
    print("Processed Text for TTS:", text)  # Debugging output

    try:
        # Load a speaker embedding (e.g., for English)
        speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)

        # Split the text into smaller chunks
        chunks = chunk_text(text, max_chars=600)  # Increased chunk size
        print(f"Number of chunks: {len(chunks)}")  # Debugging output

        # Generate speech for each chunk
        speech_chunks = []
        for i, chunk in enumerate(chunks):
            print(f"Processing chunk {i + 1}/{len(chunks)}...")
            inputs = processor(text=chunk, return_tensors="pt", truncation=True, max_length=600)
            inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to the correct device
            with torch.no_grad():
                speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
            speech_chunks.append(speech.cpu().numpy())

        # Concatenate the speech chunks
        speech = np.concatenate(speech_chunks)

        # Save the audio to a file
        sf.write("output.wav", speech, samplerate=16000)
        print("✓ Voiceover generated using Hugging Face TTS model")
    except Exception as e:
        raise RuntimeError(f"Failed to generate voiceover: {str(e)}")

def translate_text(text, target_language='es'):
    """Translate text to target language."""
    if not text.strip():
        return text

    try:
        translation = translator.translate(text, dest=target_language)
        return translation.text
    except Exception as e:
        raise RuntimeError(f"Failed to translate text: {str(e)}")

def main():
    try:
        # Get inputs with better error handling
        pdf_path = input("Enter the path to the PDF file: ").strip('"').strip()
        if not pdf_path:
            raise ValueError("PDF path cannot be empty")

        language_code = input("Enter the language code (e.g., 'en', 'es'): ").strip().lower()
        if not language_code:
            language_code = 'en'  # Default to English

        # Validate PDF file
        if not os.path.isfile(pdf_path):
            raise ValueError(f"The path '{pdf_path}' is not a valid file")
        if not pdf_path.lower().endswith('.pdf'):
            raise ValueError(f"The file '{pdf_path}' is not a PDF file")

        # Process the PDF
        print("\nProcessing PDF...")
        extracted_text = extract_text_from_pdf(pdf_path)
        print("✓ Text extraction complete")
        print("Extracted Text Preview:", extracted_text[:500])  # Debugging output

        print("\nGenerating summary...")
        summary = summarize_text(extracted_text)
        print("✓ Summary complete:")
        print("-" * 50)
        print(summary)
        print("-" * 50)

        # Handle translation if needed
        if language_code != 'en':
            print(f"\nTranslating to {language_code}...")
            summary = translate_text(summary, language_code)
            print("✓ Translation complete:")
            print("-" * 50)
            print(summary)
            print("-" * 50)

        print("\nGenerating audio...")
        generate_voiceover(summary, language_code)
        print("✓ Audio generation complete - saved as 'output.wav'")

    except Exception as e:
        print(f"\nError: {str(e)}")
        sys.exit(1)

if __name__ == "__main__":
    main()

Using device: cuda
Loading models...


Device set to use cuda:0


✓ Models loaded
Enter the path to the PDF file: /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/MINeD Hackathon/Sample PDFs/Addressing_the_Productivity_Paradox_in_Healthcare_with_Retrieval_Augmented_Generative_AI_Chatbots.pdf
Enter the language code (e.g., 'en', 'es'): en


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Processing PDF...
✓ Text extraction complete
Extracted Text Preview: Addressing the Productivity Paradox in Healthcare
with Retrieval Augmented Generative AI Chatbots
Sajani Ranasinghe∗, Daswin De Silva∗, Nishan Mills∗,
Damminda Alahakoon∗, Milos Manic‡, Yen Lim†, Weranja Ranasinghe†
∗Centre for Data Analytics and Cognition, La Trobe University, Melbourne, Australia
†Department of Urology, Monash Health, Melbourne, Australia
‡Department of Computer Science, Virginia Commonwealth University, Richmond, USA
Abstract —Artificial Intelligence (AI) is reshaping the hea

Generating summary...


Your max_length is set to 30, but your input_length is only 26. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=13)


✓ Summary complete:
--------------------------------------------------
Artificial Intelligence (AI) is reshaping the health-care landscape through diverse innovations, personalisations and decision-making capabilities. Addressing the Productivity Paradox in Healthcare with Retrieval Aug The human-like intelligence of Gen-erative AI has been fundamental in driving this transformation. Despite large investments and some early successes, several studies have signalled the emergence of a productivity paradox. We present the Retrieval Augmented GenerativeAI Chatbot framework for consultation summaries, diagnostic insights, and emotional assessments of patients. We further demonstrate the technical value of this framework in Generative Artificial Intelligence (AI) is transforming healthcare. It is leading a paradigm shift in the application of AI across its spectrum of functions. State-of-the-art models have introduced innovativeappro Genera-tesquetive AI is not without significant limitatio