In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from transformers import pipeline
import nltk
from nltk.tokenize import word_tokenize
import time

# Download necessary NLTK data
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

# Function to fetch paper content from a URL using Selenium
def fetch_paper(url):
    """
    Fetch and parse the research paper from a URL using Selenium.
    """
    try:
        # Set up the WebDriver (you may need to install a web driver)
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
        driver.get(url)

        # Wait for the page to load
        driver.implicitly_wait(10)  # Wait up to 10 seconds for elements to load

        # You can increase the wait time if the page is slow or uses JS rendering
        time.sleep(5)  # Add additional wait for dynamic content to load

        # Get the page source after JavaScript has rendered the content
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Extract the content (assuming it's inside <article> tags or similar)
        content = ''
        for p in soup.find_all('p'):
            content += p.get_text()

        driver.quit()  # Close the browser after extracting content
        return content
    except Exception as e:
        print(f"Error fetching paper: {e}")
        return None

# Function to summarize the research paper
def summarize_text(text):
    """
    Summarizes the text using a pre-trained transformer model.
    """
    try:
        summarizer = pipeline('summarization', model='facebook/bart-large-cnn')
        # Handle very large text by breaking it into smaller chunks
        chunk_size = 1000  # Process text in chunks
        text_chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
        summary = ""
        for chunk in text_chunks:
            summary += summarizer(chunk, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
        return summary
    except Exception as e:
        print(f"Error during summarization: {e}")
        return None

# Function to extract keywords (using basic NLTK methods for simplicity)
def extract_keywords(text):
    """
    Extracts basic keywords (nouns) from the text.
    """
    try:
        words = nltk.word_tokenize(text)
        pos_tags = nltk.pos_tag(words)
        keywords = [word for word, pos in pos_tags if pos in ['NN', 'NNS', 'NNP', 'NNPS']]
        return keywords
    except Exception as e:
        print(f"Error extracting keywords: {e}")
        return []

# Main function to execute the entire process
def process_paper(url):
    """
    Fetches the paper from the URL, summarizes it, and extracts keywords.
    """
    print("Fetching paper from the URL...")
    paper_content = fetch_paper(url)
    if not paper_content:
        print("Failed to fetch paper.")
        return

    print("Paper fetched. Summarizing...")
    summary = summarize_text(paper_content)
    if summary:
        print("\nSummary:")
        print(summary)
    else:
        print("Failed to summarize the paper.")

    print("\nExtracting keywords...")
    keywords = extract_keywords(paper_content)
    print("Keywords:", keywords)

# Example URL (replace with actual paper URL)
paper_url = "https://www.researchgate.net/publication/349470771_Using_Machine_Learning_for_Heart_Disease_Prediction"  # Replace with the URL of a real research paper

process_paper(paper_url)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


Fetching paper from the URL...
Paper fetched. Summarizing...



Device set to use cpu
Your max_length is set to 200, but your input_length is only 7. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)



Summary:
CNN.com will feature iReporter photos in a weekly Travel Snapshots gallery. Please submit your best shots of the U.S. for next week. Visit CNN.com/Travel next Wednesday for a new gallery of snapshots. We'll feature the best shots from across the globe.

Extracting keywords...
Keywords: ['world', 'research']


In [6]:
import pdfplumber
from transformers import pipeline
import nltk
from nltk.tokenize import word_tokenize

# Download necessary NLTK data
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

# Function to fetch paper content from a PDF using pdfplumber
def fetch_paper_from_pdf(pdf_path):
    """
    Extracts text from a PDF file using pdfplumber.
    """
    try:
        content = ''
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                content += page.extract_text()
        return content
    except Exception as e:
        print(f"Error fetching paper from PDF: {e}")
        return None

# Function to summarize the text using a pre-trained transformer model
def summarize_text(text):
    """
    Summarizes the text using a pre-trained transformer model.
    """
    try:
        summarizer = pipeline('summarization', model='facebook/bart-large-cnn')
        # Handle very large text by breaking it into smaller chunks
        chunk_size = 1000  # Process text in chunks
        text_chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
        summary = ""
        for chunk in text_chunks:
            summary += summarizer(chunk, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
        return summary
    except Exception as e:
        print(f"Error during summarization: {e}")
        return None

# Function to extract keywords (using basic NLTK methods for simplicity)
def extract_keywords(text):
    """
    Extracts basic keywords (nouns) from the text.
    """
    try:
        words = nltk.word_tokenize(text)
        pos_tags = nltk.pos_tag(words)
        keywords = [word for word, pos in pos_tags if pos in ['NN', 'NNS', 'NNP', 'NNPS']]
        return keywords
    except Exception as e:
        print(f"Error extracting keywords: {e}")
        return []

# Main function to execute the entire process
def process_pdf(pdf_path):
    """
    Fetches the paper from the PDF, summarizes it, and extracts keywords.
    """
    print("Fetching paper from the PDF...")
    paper_content = fetch_paper_from_pdf(pdf_path)
    if not paper_content:
        print("Failed to fetch paper from the PDF.")
        return

    print("Paper fetched. Summarizing...")
    summary = summarize_text(paper_content)
    if summary:
        print("\nSummary:")
        print(summary)
    else:
        print("Failed to summarize the paper.")

    print("\nExtracting keywords...")
    keywords = extract_keywords(paper_content)
    print("Keywords:", keywords)

# Example file path (replace with the path to your downloaded PDF)
pdf_path = "heart_disease.pdf"  # Replace with your PDF file path

process_pdf(pdf_path)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


Fetching paper from the PDF...
Paper fetched. Summarizing...


Device set to use cpu
Your max_length is set to 200, but your input_length is only 192. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=96)
Your max_length is set to 200, but your input_length is only 94. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)



Summary:
Edge-based Heart Disease Prediction using Federated Learning. Proceedings of 2024 International Conference on Cognitive Robotics and Intelligent Systems. IEEE Xplore Part Number: CFP24UD1-ART; ISBN: 979-8-3503-7274-8.Machine learning is being implemented to classify the severity of the heart disease. It is in the ratioof four in five cardiovascular deaths disease among the people. Methods like K-Nearest Algorithm (KNN), DecisionTree (DT) and Navie                produced from healthcare industries are being implemented.A shared model of federated learningthat makes its averaging process more efficient. Data mining method that collects all the data and store it in one place centrally. The data is from an updated version of LASSO (Least Absolute Shrinkage and Selection) algorithm.The proposed model has achieved 93.4% of accuracy levels by integrating the LASSO algorithm. Using federated averaging algorithm, the updated algorithm has been updated in recent years. The most diffic

In [4]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-win_amd64.whl.metadata (48 kB)
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   ----------- ---------------------------- 1.6/5.6 MB 8.4 MB/s eta 0:00:01
   ---------------------------- ----------- 3.9/5.6 MB 10.7 MB/s eta 0:00:01
   ---------------------------------------- 5.6/5.6 MB 10.7 MB/s eta 0:00:00
Downloading pypdfium2-4.30.0-py3-none-win_amd64.whl (2.9 MB)
   ---------------------------------------- 0.0/2.9 MB ? eta -:--:--
   -------------------------------- ------- 2.4/2.9 MB 13.4 MB/s eta 0:00:01
   ---------------------------------------- 2.9/2.

In [None]:
!pip install tf-keras

