# ResearchReader

# ROLL NO:
C236
C258
C275

In [None]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import spacy
import re
import PyPDF2
import numpy as np

def load_spacy_model(model_name='en_core_web_lg'):
    """Return the spaCy pipeline called *model_name*, fetching it first if needed.

    ``spacy.load`` is tried first. If it raises ``OSError``, a notice is
    printed, the model is downloaded through ``spacy.cli.download`` and the
    load is attempted a second time.
    """
    try:
        return spacy.load(model_name)
    except OSError:
        # Model could not be loaded; continue to the download path below.
        pass

    print(f"Downloading spaCy model '{model_name}'. This may take a moment.")
    from spacy.cli import download
    download(model_name)
    return spacy.load(model_name)

def extract_pdf_text(filepath):
    """Extract the text of a PDF file as one whitespace-normalised string.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        The text of every page that yielded any text, joined with spaces,
        with each run of whitespace (line breaks included) replaced by a
        single space. ``None`` if the file does not exist or reading it
        fails; a message is printed in both cases.
    """
    try:
        with open(filepath, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            # Call extract_text() exactly once per page: filtering on a
            # second call would make every page be extracted twice.
            page_texts = [page.extract_text() for page in reader.pages]
        # Drop pages that produced no text (None or empty string).
        text_parts = [page_text for page_text in page_texts if page_text]
        full_text = " ".join(text_parts)
        return re.sub(r'\s+', ' ', full_text)
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.")
        return None
    except Exception as e:
        # Best-effort boundary: any reader/parsing failure is reported to the
        # user and signalled to the caller as None rather than raised.
        print(f"An error occurred while reading the PDF: {e}")
        return None

def cosine_similarity(vec1, vec2):
    """Calculates the cosine similarity between two vectors.

    Args:
        vec1: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either
        vector has zero length (the ratio would otherwise be 0/0 and
        evaluate to NaN with a runtime warning).
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        # A zero vector has no direction; treat it as "not similar".
        return 0.0
    return np.dot(vec1, vec2) / denominator

def generate_summary(doc, num_sentences=3):
    """Builds an extractive summary from the sentences closest to the document's mean vector.

    Returns a ``(summary, sentences)`` pair: ``summary`` is the text of the
    top-scoring sentences joined with spaces (highest score first), and
    ``sentences`` is the list of all sentences in ``doc``. When no token
    qualifies for the document vector, a fixed message and an empty list
    are returned instead.
    """
    # Collect vectors of tokens that are neither stop words nor punctuation.
    content_vectors = []
    for tok in doc:
        if tok.is_stop or tok.is_punct or not tok.has_vector:
            continue
        content_vectors.append(tok.vector)

    if len(content_vectors) == 0:
        return "Document is too short to summarize.", []

    centroid = np.mean(content_vectors, axis=0)

    # Score every sentence that has a vector against the centroid.
    sentences = list(doc.sents)
    scores = {}
    for sent in sentences:
        if sent.has_vector:
            scores[sent] = cosine_similarity(sent.vector, centroid)

    # Rank by score, best first, and keep the requested number.
    ranked = sorted(scores, key=scores.get, reverse=True)
    chosen = ranked[:num_sentences]
    summary = " ".join(sent.text for sent in chosen)

    return summary, sentences

def find_answer(question_doc, sentences, relevance_threshold=0.6):
    """Returns the text of the sentence most similar to the question, if similar enough.

    A fixed message is returned when the question has no vector, and another
    when no sentence scores above ``relevance_threshold``.
    """
    if not question_doc.has_vector:
        return "Could not understand the question."

    top_match, top_score = None, -1.0
    for candidate in sentences:
        if not candidate.has_vector:
            continue
        score = cosine_similarity(question_doc.vector, candidate.vector)
        # Strict comparison: on ties the earliest sentence is kept.
        if score > top_score:
            top_match, top_score = candidate, score

    if top_score > relevance_threshold and top_match:
        return top_match.text
    return "No confident answer found in the document."

def extract_section(text, section_title):
    """Extracts the body of a section identified by its heading line.

    The heading is ``section_title`` on a line of its own, optionally
    preceded by a number such as ``1`` or ``1.``; the title is matched
    case-insensitively. The body runs up to the next line that looks like a
    heading (a numbered line starting with a capital letter, or a line
    starting with at least two capital letters) or, failing that, to the end
    of ``text``.

    Because headings are recognised by line breaks, ``text`` must still
    contain its newlines; text whose whitespace has been collapsed into
    single spaces cannot match.

    Args:
        text: Document text with line breaks intact.
        section_title: Heading to look for; regex metacharacters are escaped.

    Returns:
        The stripped section body, or ``"Section '<title>' not found."``.
    """
    pattern = re.compile(
        # Ignore case for the title ONLY. A global (?i) would also make the
        # [A-Z] heading detectors below match lowercase letters, ending the
        # section at the first line that merely starts with two letters.
        r'^\s*(\d+\.?\s*)?(?i:' + re.escape(section_title) + r')\s*?\n(.*?)'
        # Stop at the next heading-like line, or at end of text so that the
        # final section of a document can be extracted too.
        r'(?=\n\s*\d+\.?\s+[A-Z]|\n\s*[A-Z]{2,}|\Z)',
        re.DOTALL | re.MULTILINE
    )
    match = pattern.search(text)
    return match.group(2).strip() if match else f"Section '{section_title}' not found."

def display_menu():
    """Writes the four numbered menu choices, preceded by a blank line, to stdout."""
    menu_lines = (
        "\n-- Menu --",
        "1. Show Summary",
        "2. Ask a Question",
        "3. Extract Section (e.g., Introduction, Methods)",
        "4. Exit",
    )
    print("\n".join(menu_lines))

def main():
    """Main application loop for the research paper analyzer.

    Loads the spaCy model, asks for a PDF path, extracts and processes its
    text, prints a summary of the requested length, then serves a menu
    (show summary / ask a question / extract a section / exit) until the
    user chooses to exit. Returns early, without a menu, when no text could
    be extracted from the PDF.
    """
    nlp = load_spacy_model()

    pdf_path = input("Enter the path to the research paper PDF: ")
    raw_text = extract_pdf_text(pdf_path)
    if not raw_text:
        # None (read error, already reported) or empty text: nothing to analyze.
        return

    print("Processing document...")
    doc = nlp(raw_text)

    try:
        summary_length = int(input("Enter number of sentences for summary: "))
    except ValueError:
        summary_length = None
    # Reject zero and negative counts as well as non-numbers: the count is
    # used as a slice bound in generate_summary, where a negative value
    # would silently drop sentences from the end instead of limiting them.
    if summary_length is None or summary_length < 1:
        print("Invalid number. Defaulting to 3 sentences.")
        summary_length = 3

    summary, sentences = generate_summary(doc, summary_length)
    print("\n--- Document Summary ---")
    print(summary)

    while True:
        display_menu()
        # Tolerate stray spaces around the typed menu number.
        choice = input("Your choice (1-4): ").strip()

        if choice == '1':
            print("\n--- Document Summary ---")
            print(summary)
        elif choice == '2':
            question = input("Your question: ")
            question_doc = nlp(question)
            answer = find_answer(question_doc, sentences)
            print(f"\nAnswer: {answer}")
        elif choice == '3':
            section_name = input("Enter section title to extract: ")
            # NOTE(review): extract_pdf_text collapses all whitespace
            # (including newlines) to single spaces, while extract_section
            # locates headings by line breaks - so this lookup is unlikely
            # to match on raw_text. Consider keeping a newline-preserving
            # copy of the text for section extraction.
            section_content = extract_section(raw_text, section_name.strip())
            print(f"\n--- Extracted: {section_name.title()} ---")
            print(section_content)
        elif choice == '4':
            print("Exiting.")
            break
        else:
            print("Invalid choice. Please enter a number from 1 to 4.")

# Start the interactive analyzer only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Enter the path to the research paper PDF: /content/Introduction of Research Papers.pdf




Processing document...
Enter number of sentences for summary: 23

--- Document Summary ---
[Stating Research Focus and Objectives] This study thus fits within existing literature concerning the foundations and outcomes of indigenous forest and wildlife management. Education Most educational research papers involve critical analysis of teaching methods and pedagogical theory as they apply to real-life teaching situations. Most social science introductions follow the same structure outlined in this resource. Introductions for medical research papers should first broadly review relevant background information on the research topic, then narrow to a focused research question(s), thesis statement, and study objective. [Stating Research Question] That is, have the land-based values and practices of indigenous peoples in Wisconsin led to significantly dif ferent ecological Introduction Section for Research Papers, Winter 2023.6of8 conditions on their forestlands relative to neighboring lands?