In [2]:
!pip install PyPDF2 transformers matplotlib

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [8]:
!pip install nest_asyncio



In [12]:
from pathlib import Path
import json
import logging
import asyncio
from PyPDF2 import PdfReader
from transformers import pipeline
import matplotlib.pyplot as plt

# Configure logging
logging.basicConfig(filename='logs/pipeline.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load summarization model globally
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

async def extract_text(pdf_path: str) -> str:
    """Extract text from a PDF file using PyPDF2."""
    try:
        print(f"Extracting text from {pdf_path}...")
        reader = PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
        if not text.strip():
            raise ValueError("No text extracted from the PDF. The PDF might be scanned or contain no text.")
        print(f"Text extraction completed for {pdf_path}.")
        return text.strip()
    except Exception as e:
        logging.error(f"Error extracting text from {pdf_path}: {e}")
        raise

async def generate_summary(text: str) -> str:
    """Generate a summary of the text using a pre-trained model."""
    try:
        print("Generating summary...")
        # Check if the text is too short
        if len(text.split()) < 10:  # Skip summarization for very short text
            return text  # Return the original text as the summary

        # Chunk the text if it's too long
        max_tokens = 1024  # Maximum tokens for the model
        chunks = [text[i:i + max_tokens] for i in range(0, len(text), max_tokens)]

        summaries = []
        for chunk in chunks:
            summary = summarizer(chunk, max_length=130, min_length=30, do_sample=False)[0]['summary_text']
            summaries.append(summary)

        return " ".join(summaries)
    except Exception as e:
        logging.error(f"Error generating summary: {e}")
        raise

async def generate_graphical_abstract(summary: str, output_path: str):
    """Generate a graphical abstract using matplotlib."""
    try:
        print("Creating graphical abstract...")
        plt.figure(figsize=(10, 6))
        plt.text(0.1, 0.5, summary, fontsize=12, wrap=True)
        plt.axis('off')  # Hide axes
        plt.savefig(output_path, bbox_inches='tight', pad_inches=0.1)
        plt.close()
        print(f"Graphical abstract saved to {output_path}.")
    except Exception as e:
        logging.error(f"Error generating graphical abstract: {e}")
        raise

async def process_paper(pdf_path: str, output_dir: str) -> dict:
    results = {}
    try:
        # Validate input
        if not Path(pdf_path).is_file():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        # Create output directory
        paper_name = Path(pdf_path).stem
        output_path = Path(output_dir) / paper_name
        output_path.mkdir(parents=True, exist_ok=True)

        # Step 1: Extract text
        logging.info(f"Extracting text from {pdf_path}")
        extracted_text = await extract_text(pdf_path)
        results['extracted_text'] = extracted_text

        # Step 2: Generate summary
        logging.info("Generating summary")
        summary = await generate_summary(extracted_text)
        results['summary'] = summary

        # Step 3: Create graphical abstract
        logging.info("Creating graphical abstract")
        image_path = output_path / 'summary.png'
        await generate_graphical_abstract(summary, str(image_path))
        results['image_path'] = str(image_path)

        # Save summary to JSON
        summary_path = output_path / 'summary.json'
        with open(summary_path, 'w') as json_file:
            json.dump({'summary': summary}, json_file)
        results['summary_path'] = str(summary_path)

        logging.info(f"Processing completed for {pdf_path}")

    except Exception as e:
        logging.error(f"Error processing {pdf_path}: {e}")
        results['error'] = str(e)

    return results

async def main(pdf_paths: list, output_dir: str):
    tasks = [process_paper(pdf_path, output_dir) for pdf_path in pdf_paths]
    return await asyncio.gather(*tasks)

if __name__ == "__main__":
    # Get user input for PDF paths
    pdf_files = input("Enter the paths to the PDF files (comma-separated): ").strip().split(',')
    pdf_files = [path.strip() for path in pdf_files]  # Clean up paths

    # Get user input for output directory
    output_directory = input("Enter the output directory: ").strip()

    # Run the pipeline
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError as e:
        if str(e).startswith("There is no current event loop"):
            # Create a new event loop if one doesn't exist
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        else:
            raise

    if loop.is_running():
        # If the loop is already running (e.g., in Jupyter Notebook), use nest_asyncio
        try:
            import nest_asyncio
            nest_asyncio.apply()
        except ImportError:
            print("Please install nest_asyncio to run this in an environment with an existing event loop.")
            print("Run: pip install nest_asyncio")
            exit(1)

    # Run the main function
    results = loop.run_until_complete(main(pdf_files, output_directory))
    print("Processing completed. Check the output directory for results.")

Device set to use cpu


Enter the paths to the PDF files (comma-separated): /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/MINeD Hackathon/Sample PDFs/Addressing_the_Productivity_Paradox_in_Healthcare_with_Retrieval_Augmented_Generative_AI_Chatbots.pdf, /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/MINeD Hackathon/Sample PDFs/Implementation_of_Retrieval-Augmented_Generation_RAG_in_Chatbot_Systems_for_Enhanced_Real-Time_Customer_Support_in_E-Commerce.pdf
Enter the output directory: /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/pdf-processing-pipeline/output
Extracting text from /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/MINeD Hackathon/Sample PDFs/Addressing_the_Productivity_Paradox_in_Healthcare_with_Retrieval_Augmented_Generative_AI_Chatbots.pdf...
Text extraction completed for /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/MINeD Hackathon/Sample PDFs/Addressing_the_Productivity_Paradox_in_Healthcare_with_Retrieval_Augmented_Gen

  self.source = Source(update_ids_from_dap=update_ids_from_dap, **source) if source.__class__ !=  Source else source


Creating graphical abstract...
Graphical abstract saved to /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/pdf-processing-pipeline/output/Implementation_of_Retrieval-Augmented_Generation_RAG_in_Chatbot_Systems_for_Enhanced_Real-Time_Customer_Support_in_E-Commerce/summary.png.
Processing completed. Check the output directory for results.


In [11]:
from pathlib import Path
import json
import logging
import asyncio
from PyPDF2 import PdfReader
from transformers import pipeline
import matplotlib.pyplot as plt

# Configure logging
logging.basicConfig(filename='logs/pipeline.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load summarization model globally
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

async def extract_text(pdf_path: str) -> str:
    """Extract text from a PDF file using PyPDF2."""
    try:
        print(f"Extracting text from {pdf_path}...")
        reader = PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
        if not text.strip():
            raise ValueError("No text extracted from the PDF. The PDF might be scanned or contain no text.")
        print(f"Text extraction completed for {pdf_path}.")
        return text.strip()
    except Exception as e:
        logging.error(f"Error extracting text from {pdf_path}: {e}")
        raise

async def generate_summary(text: str) -> str:
    """Generate a summary of the text using a pre-trained model."""
    try:
        print("Generating summary...")
        summary = summarizer(text, max_length=130, min_length=30, do_sample=False)[0]['summary_text']
        print("Summary generation completed.")
        return summary
    except Exception as e:
        logging.error(f"Error generating summary: {e}")
        raise

async def generate_graphical_abstract(summary: str, output_path: str):
    """Generate a graphical abstract using matplotlib."""
    try:
        print("Creating graphical abstract...")
        plt.figure(figsize=(10, 6))
        plt.text(0.1, 0.5, summary, fontsize=12, wrap=True)
        plt.axis('off')  # Hide axes
        plt.savefig(output_path, bbox_inches='tight', pad_inches=0.1)
        plt.close()
        print(f"Graphical abstract saved to {output_path}.")
    except Exception as e:
        logging.error(f"Error generating graphical abstract: {e}")
        raise

async def process_paper(pdf_path: str, output_dir: str) -> dict:
    results = {}
    try:
        # Validate input
        if not Path(pdf_path).is_file():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        # Create output directory
        paper_name = Path(pdf_path).stem
        output_path = Path(output_dir) / paper_name
        output_path.mkdir(parents=True, exist_ok=True)

        # Step 1: Extract text
        logging.info(f"Extracting text from {pdf_path}")
        extracted_text = await extract_text(pdf_path)
        results['extracted_text'] = extracted_text

        # Step 2: Generate summary
        logging.info("Generating summary")
        summary = await generate_summary(extracted_text)
        results['summary'] = summary

        # Step 3: Create graphical abstract
        logging.info("Creating graphical abstract")
        image_path = output_path / 'summary.png'
        await generate_graphical_abstract(summary, str(image_path))
        results['image_path'] = str(image_path)

        # Save summary to JSON
        summary_path = output_path / 'summary.json'
        with open(summary_path, 'w') as json_file:
            json.dump({'summary': summary}, json_file)
        results['summary_path'] = str(summary_path)

        logging.info(f"Processing completed for {pdf_path}")

    except Exception as e:
        logging.error(f"Error processing {pdf_path}: {e}")
        results['error'] = str(e)

    return results

async def main(pdf_paths: list, output_dir: str):
    tasks = [process_paper(pdf_path, output_dir) for pdf_path in pdf_paths]
    return await asyncio.gather(*tasks)

if __name__ == "__main__":
    # Get user input for PDF paths
    pdf_files = input("Enter the paths to the PDF files (comma-separated): ").strip().split(',')
    pdf_files = [path.strip() for path in pdf_files]  # Clean up paths

    # Get user input for output directory
    output_directory = input("Enter the output directory: ").strip()

    # Run the pipeline
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError as e:
        if str(e).startswith("There is no current event loop"):
            # Create a new event loop if one doesn't exist
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        else:
            raise

    if loop.is_running():
        # If the loop is already running (e.g., in Jupyter Notebook), use nest_asyncio
        try:
            import nest_asyncio
            nest_asyncio.apply()
        except ImportError:
            print("Please install nest_asyncio to run this in an environment with an existing event loop.")
            print("Run: pip install nest_asyncio")
            exit(1)

    # Run the main function
    results = loop.run_until_complete(main(pdf_files, output_directory))
    print("Processing completed. Check the output directory for results.")

Device set to use cpu


Enter the paths to the PDF files (comma-separated): /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/MINeD Hackathon/Sample PDFs/Addressing_the_Productivity_Paradox_in_Healthcare_with_Retrieval_Augmented_Generative_AI_Chatbots.pdf, /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/MINeD Hackathon/Sample PDFs/Implementation_of_Retrieval-Augmented_Generation_RAG_in_Chatbot_Systems_for_Enhanced_Real-Time_Customer_Support_in_E-Commerce.pdf
Enter the output directory: /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/pdf-processing-pipeline/output
Extracting text from /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/MINeD Hackathon/Sample PDFs/Addressing_the_Productivity_Paradox_in_Healthcare_with_Retrieval_Augmented_Generative_AI_Chatbots.pdf...
Text extraction completed for /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/MINeD Hackathon/Sample PDFs/Addressing_the_Productivity_Paradox_in_Healthcare_with_Retrieval_Augmented_Gen

ERROR:root:Error generating summary: index out of range in self
ERROR:root:Error processing /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/MINeD Hackathon/Sample PDFs/Addressing_the_Productivity_Paradox_in_Healthcare_with_Retrieval_Augmented_Generative_AI_Chatbots.pdf: index out of range in self


Extracting text from /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/MINeD Hackathon/Sample PDFs/Implementation_of_Retrieval-Augmented_Generation_RAG_in_Chatbot_Systems_for_Enhanced_Real-Time_Customer_Support_in_E-Commerce.pdf...


ERROR:root:Error generating summary: index out of range in self
ERROR:root:Error processing /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/MINeD Hackathon/Sample PDFs/Implementation_of_Retrieval-Augmented_Generation_RAG_in_Chatbot_Systems_for_Enhanced_Real-Time_Customer_Support_in_E-Commerce.pdf: index out of range in self


Text extraction completed for /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/MINeD Hackathon/Sample PDFs/Implementation_of_Retrieval-Augmented_Generation_RAG_in_Chatbot_Systems_for_Enhanced_Real-Time_Customer_Support_in_E-Commerce.pdf.
Generating summary...
Processing completed. Check the output directory for results.


In [10]:
from pathlib import Path
import json
import logging
import asyncio
from PyPDF2 import PdfReader
from transformers import pipeline
import matplotlib.pyplot as plt

# Configure logging
logging.basicConfig(filename='logs/pipeline.log', level=logging.INFO)

# Load summarization model globally
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

async def extract_text(pdf_path: str) -> str:
    """Extract text from a PDF file using PyPDF2."""
    try:
        reader = PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
        return text.strip()
    except Exception as e:
        logging.error(f"Error extracting text from {pdf_path}: {e}")
        raise

async def generate_summary(text: str) -> str:
    """Generate a summary of the text using a pre-trained model."""
    try:
        # Summarize the text
        summary = summarizer(text, max_length=130, min_length=30, do_sample=False)[0]['summary_text']
        return summary
    except Exception as e:
        logging.error(f"Error generating summary: {e}")
        raise

async def generate_graphical_abstract(summary: str, output_path: str):
    """Generate a graphical abstract using matplotlib."""
    try:
        # Create a simple graphical abstract
        plt.figure(figsize=(10, 6))
        plt.text(0.1, 0.5, summary, fontsize=12, wrap=True)
        plt.axis('off')  # Hide axes
        plt.savefig(output_path, bbox_inches='tight', pad_inches=0.1)
        plt.close()
    except Exception as e:
        logging.error(f"Error generating graphical abstract: {e}")
        raise

async def process_paper(pdf_path: str, output_dir: str) -> dict:
    results = {}
    try:
        # Validate input
        if not Path(pdf_path).is_file():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        # Create output directory
        paper_name = Path(pdf_path).stem
        output_path = Path(output_dir) / paper_name
        output_path.mkdir(parents=True, exist_ok=True)

        # Step 1: Extract text
        logging.info(f"Extracting text from {pdf_path}")
        extracted_text = await extract_text(pdf_path)
        results['extracted_text'] = extracted_text

        # Step 2: Generate summary
        logging.info("Generating summary")
        summary = await generate_summary(extracted_text)
        results['summary'] = summary

        # Step 3: Create graphical abstract
        logging.info("Creating graphical abstract")
        image_path = output_path / 'summary.png'
        await generate_graphical_abstract(summary, str(image_path))
        results['image_path'] = str(image_path)

        # Save summary to JSON
        summary_path = output_path / 'summary.json'
        with open(summary_path, 'w') as json_file:
            json.dump({'summary': summary}, json_file)
        results['summary_path'] = str(summary_path)

        logging.info(f"Processing completed for {pdf_path}")

    except Exception as e:
        logging.error(f"Error processing {pdf_path}: {e}")
        results['error'] = str(e)

    return results

async def main(pdf_paths: list, output_dir: str):
    tasks = [process_paper(pdf_path, output_dir) for pdf_path in pdf_paths]
    return await asyncio.gather(*tasks)

if __name__ == "__main__":
    # Get user input for PDF paths
    pdf_files = input("Enter the paths to the PDF files (comma-separated): ").strip().split(',')
    pdf_files = [path.strip() for path in pdf_files]  # Clean up paths

    # Get user input for output directory
    output_directory = input("Enter the output directory: ").strip()

    # Run the pipeline
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError as e:
        if str(e).startswith("There is no current event loop"):
            # Create a new event loop if one doesn't exist
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        else:
            raise

    if loop.is_running():
        # If the loop is already running (e.g., in Jupyter Notebook), use nest_asyncio
        try:
            import nest_asyncio
            nest_asyncio.apply()
        except ImportError:
            print("Please install nest_asyncio to run this in an environment with an existing event loop.")
            print("Run: pip install nest_asyncio")
            exit(1)

    # Run the main function
    results = loop.run_until_complete(main(pdf_files, output_directory))
    print("Processing completed. Check the output directory for results.")

Device set to use cpu


Enter the paths to the PDF files (comma-separated): /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/MINeD Hackathon/Sample PDFs/Addressing_the_Productivity_Paradox_in_Healthcare_with_Retrieval_Augmented_Generative_AI_Chatbots.pdf, /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/MINeD Hackathon/Sample PDFs/Integrating_Generative_AI_for_Enhanced_Automation_in_System_Design_Processes.json
Enter the output directory: /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/pdf-processing-pipeline/output


ERROR:root:Error generating summary: index out of range in self
ERROR:root:Error processing /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/MINeD Hackathon/Sample PDFs/Addressing_the_Productivity_Paradox_in_Healthcare_with_Retrieval_Augmented_Generative_AI_Chatbots.pdf: index out of range in self
ERROR:root:Error extracting text from /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/MINeD Hackathon/Sample PDFs/Integrating_Generative_AI_for_Enhanced_Automation_in_System_Design_Processes.json: EOF marker not found
ERROR:root:Error processing /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/MINeD Hackathon/Sample PDFs/Integrating_Generative_AI_for_Enhanced_Automation_in_System_Design_Processes.json: EOF marker not found


Processing completed. Check the output directory for results.


In [9]:
from pathlib import Path
import json
import logging
import asyncio
from PyPDF2 import PdfReader
from transformers import pipeline
import matplotlib.pyplot as plt

# Configure logging
logging.basicConfig(filename='logs/pipeline.log', level=logging.INFO)

# Load summarization model globally
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

async def extract_text(pdf_path: str) -> str:
    """Extract text from a PDF file using PyPDF2."""
    try:
        reader = PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
        return text.strip()
    except Exception as e:
        logging.error(f"Error extracting text from {pdf_path}: {e}")
        raise

async def generate_summary(text: str) -> str:
    """Generate a summary of the text using a pre-trained model."""
    try:
        # Summarize the text
        summary = summarizer(text, max_length=130, min_length=30, do_sample=False)[0]['summary_text']
        return summary
    except Exception as e:
        logging.error(f"Error generating summary: {e}")
        raise

async def generate_graphical_abstract(summary: str, output_path: str):
    """Generate a graphical abstract using matplotlib."""
    try:
        # Create a simple graphical abstract
        plt.figure(figsize=(10, 6))
        plt.text(0.1, 0.5, summary, fontsize=12, wrap=True)
        plt.axis('off')  # Hide axes
        plt.savefig(output_path, bbox_inches='tight', pad_inches=0.1)
        plt.close()
    except Exception as e:
        logging.error(f"Error generating graphical abstract: {e}")
        raise

async def process_paper(pdf_path: str, output_dir: str) -> dict:
    results = {}
    try:
        # Validate input
        if not Path(pdf_path).is_file():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        # Create output directory
        paper_name = Path(pdf_path).stem
        output_path = Path(output_dir) / paper_name
        output_path.mkdir(parents=True, exist_ok=True)

        # Step 1: Extract text
        logging.info(f"Extracting text from {pdf_path}")
        extracted_text = await extract_text(pdf_path)
        results['extracted_text'] = extracted_text

        # Step 2: Generate summary
        logging.info("Generating summary")
        summary = await generate_summary(extracted_text)
        results['summary'] = summary

        # Step 3: Create graphical abstract
        logging.info("Creating graphical abstract")
        image_path = output_path / 'summary.png'
        await generate_graphical_abstract(summary, str(image_path))
        results['image_path'] = str(image_path)

        # Save summary to JSON
        summary_path = output_path / 'summary.json'
        with open(summary_path, 'w') as json_file:
            json.dump({'summary': summary}, json_file)
        results['summary_path'] = str(summary_path)

        logging.info(f"Processing completed for {pdf_path}")

    except Exception as e:
        logging.error(f"Error processing {pdf_path}: {e}")
        results['error'] = str(e)

    return results

async def main(pdf_paths: list, output_dir: str):
    tasks = [process_paper(pdf_path, output_dir) for pdf_path in pdf_paths]
    return await asyncio.gather(*tasks)

if __name__ == "__main__":
    # Get user input for PDF paths
    pdf_files = input("Enter the paths to the PDF files (comma-separated): ").strip().split(',')
    pdf_files = [path.strip() for path in pdf_files]  # Clean up paths

    # Get user input for output directory
    output_directory = input("Enter the output directory: ").strip()

    # Run the pipeline
    asyncio.get_event_loop().run_until_complete(main(pdf_files, output_directory))
    print("Processing completed. Check the output directory for results.")

Device set to use cpu


Enter the paths to the PDF files (comma-separated): /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/MINeD Hackathon/Sample PDFs/Addressing_the_Productivity_Paradox_in_Healthcare_with_Retrieval_Augmented_Generative_AI_Chatbots.pdf, /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/MINeD Hackathon/Sample PDFs/Implementation_of_Retrieval-Augmented_Generation_RAG_in_Chatbot_Systems_for_Enhanced_Real-Time_Customer_Support_in_E-Commerce.pdf
Enter the output directory: /content/drive/MyDrive/MINeD-Hackathon-Cactus-Project/backend/pdf-processing-pipeline/output


RuntimeError: This event loop is already running

In [5]:
from pathlib import Path
import json
import logging
import asyncio
from PyPDF2 import PdfReader
from transformers import pipeline
import matplotlib.pyplot as plt

# Configure logging
logging.basicConfig(filename='logs/pipeline.log', level=logging.INFO)

# Load summarization model globally
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

async def extract_text(pdf_path: str) -> str:
    """Extract text from a PDF file using PyPDF2."""
    try:
        reader = PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
        return text.strip()
    except Exception as e:
        logging.error(f"Error extracting text from {pdf_path}: {e}")
        raise

async def generate_summary(text: str) -> str:
    """Generate a summary of the text using a pre-trained model."""
    try:
        # Summarize the text
        summary = summarizer(text, max_length=130, min_length=30, do_sample=False)[0]['summary_text']
        return summary
    except Exception as e:
        logging.error(f"Error generating summary: {e}")
        raise

async def generate_graphical_abstract(summary: str, output_path: str):
    """Generate a graphical abstract using matplotlib."""
    try:
        # Create a simple graphical abstract
        plt.figure(figsize=(10, 6))
        plt.text(0.1, 0.5, summary, fontsize=12, wrap=True)
        plt.axis('off')  # Hide axes
        plt.savefig(output_path, bbox_inches='tight', pad_inches=0.1)
        plt.close()
    except Exception as e:
        logging.error(f"Error generating graphical abstract: {e}")
        raise

async def process_paper(pdf_path: str, output_dir: str) -> dict:
    results = {}
    try:
        # Validate input
        if not Path(pdf_path).is_file():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        # Create output directory
        paper_name = Path(pdf_path).stem
        output_path = Path(output_dir) / paper_name
        output_path.mkdir(parents=True, exist_ok=True)

        # Step 1: Extract text
        logging.info(f"Extracting text from {pdf_path}")
        extracted_text = await extract_text(pdf_path)
        results['extracted_text'] = extracted_text

        # Step 2: Generate summary
        logging.info("Generating summary")
        summary = await generate_summary(extracted_text)
        results['summary'] = summary

        # Step 3: Create graphical abstract
        logging.info("Creating graphical abstract")
        image_path = output_path / 'summary.png'
        await generate_graphical_abstract(summary, str(image_path))
        results['image_path'] = str(image_path)

        # Save summary to JSON
        summary_path = output_path / 'summary.json'
        with open(summary_path, 'w') as json_file:
            json.dump({'summary': summary}, json_file)
        results['summary_path'] = str(summary_path)

        logging.info(f"Processing completed for {pdf_path}")

    except Exception as e:
        logging.error(f"Error processing {pdf_path}: {e}")
        results['error'] = str(e)

    return results

async def main(pdf_paths: list, output_dir: str):
    tasks = [process_paper(pdf_path, output_dir) for pdf_path in pdf_paths]
    return await asyncio.gather(*tasks)

if __name__ == "__main__":

    # Provide sample PDF paths and output directory
    pdf_paths = ["/path/to/your/file.pdf"]  # Replace with actual paths to your PDF files
    output_dir = ["output"]  # Replace with your desired output directory

    # Run the main function within the asyncio event loop
    asyncio.run(main(pdf_paths, output_dir))

Device set to use cpu


TypeError: main() missing 2 required positional arguments: 'pdf_paths' and 'output_dir'

In [3]:
from pathlib import Path
import json
import logging
import asyncio
from PyPDF2 import PdfReader
from transformers import pipeline
import matplotlib.pyplot as plt

# Configure logging
logging.basicConfig(filename='logs/pipeline.log', level=logging.INFO)

# Load summarization model globally
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

async def extract_text(pdf_path: str) -> str:
    """Extract text from a PDF file using PyPDF2."""
    try:
        reader = PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
        return text.strip()
    except Exception as e:
        logging.error(f"Error extracting text from {pdf_path}: {e}")
        raise

async def generate_summary(text: str) -> str:
    """Generate a summary of the text using a pre-trained model."""
    try:
        # Summarize the text
        summary = summarizer(text, max_length=130, min_length=30, do_sample=False)[0]['summary_text']
        return summary
    except Exception as e:
        logging.error(f"Error generating summary: {e}")
        raise

async def generate_graphical_abstract(summary: str, output_path: str):
    """Generate a graphical abstract using matplotlib."""
    try:
        # Create a simple graphical abstract
        plt.figure(figsize=(10, 6))
        plt.text(0.1, 0.5, summary, fontsize=12, wrap=True)
        plt.axis('off')  # Hide axes
        plt.savefig(output_path, bbox_inches='tight', pad_inches=0.1)
        plt.close()
    except Exception as e:
        logging.error(f"Error generating graphical abstract: {e}")
        raise

async def process_paper(pdf_path: str, output_dir: str) -> dict:
    results = {}
    try:
        # Validate input
        if not Path(pdf_path).is_file():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        # Create output directory
        paper_name = Path(pdf_path).stem
        output_path = Path(output_dir) / paper_name
        output_path.mkdir(parents=True, exist_ok=True)

        # Step 1: Extract text
        logging.info(f"Extracting text from {pdf_path}")
        extracted_text = await extract_text(pdf_path)
        results['extracted_text'] = extracted_text

        # Step 2: Generate summary
        logging.info("Generating summary")
        summary = await generate_summary(extracted_text)
        results['summary'] = summary

        # Step 3: Create graphical abstract
        logging.info("Creating graphical abstract")
        image_path = output_path / 'summary.png'
        await generate_graphical_abstract(summary, str(image_path))
        results['image_path'] = str(image_path)

        # Save summary to JSON
        summary_path = output_path / 'summary.json'
        with open(summary_path, 'w') as json_file:
            json.dump({'summary': summary}, json_file)
        results['summary_path'] = str(summary_path)

        logging.info(f"Processing completed for {pdf_path}")

    except Exception as e:
        logging.error(f"Error processing {pdf_path}: {e}")
        results['error'] = str(e)

    return results

async def main(pdf_paths: list, output_dir: str):
    tasks = [process_paper(pdf_path, output_dir) for pdf_path in pdf_paths]
    return await asyncio.gather(*tasks)

if __name__ == "__main__":

  main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


TypeError: main() missing 2 required positional arguments: 'pdf_paths' and 'output_dir'