In [1]:
from langchain_ollama import ChatOllama # type: ignore
from langchain_groq import ChatGroq # type: ignore
from langchain.prompts import ChatPromptTemplate # type: ignore
from typing import List, Dict, Any
from typing_extensions import TypedDict # type: ignore
from langchain_core.output_parsers import StrOutputParser # type: ignore
from langgraph.graph import StateGraph, START, END # type: ignore
from IPython.display import Image, display # type: ignore
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage # type: ignore
from langchain_core.output_parsers import PydanticOutputParser # type: ignore
from langchain_text_splitters import TokenTextSplitter # type: ignore
import os
from dotenv import load_dotenv # type: ignore
from typing import List, Dict, Any, Optional
import fitz # type: ignore
from pydantic import BaseModel, Field # type: ignore
load_dotenv()

False

In [2]:
os.environ["LANGSMITH_PROJECT"] = f"MineD 2025"

In [3]:
llm = ChatGroq(model="llama-3.2-1b-preview", api_key="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")

In [4]:
#base model to hold the metadata, and slide summaries that the llm will extract
class ResPaperText(BaseModel):
    authors: str = Field(..., description="List of authors of the research paper")
    title: str = Field(..., description="Title of the research paper")
    submission_date: str = Field(..., description="Submission date of the research paper")
    keywords: List[str] = Field(..., description="List of keywords associated with the research paper")
    references: List[str] = Field(..., description="List of references cited in the research paper")
    abstract: str = Field(..., description="Abstract of the research paper")
    conclusion: str = Field(..., description="Conclusion of the research paper")
    body: List[str] = Field(..., description="Body content of the research paper, organized as a list of sections or paragraphs")

In [5]:
class ResPaperExtractState(TypedDict):
    pdf_path: Optional[str] = None  # Path to the PDF file
    extracted_text: Optional[str] = None  # Full extracted text from the PDF
    extracted_images: Optional[List[str]] = None  # Paths to extracted images
    slides_content: Optional[List[Dict[str, str]]] = None  # Prepared content for PowerPoint slides
    condensed_text: ResPaperText

In [6]:
def load_pdf(state: ResPaperExtractState):
    pdf_path = state["pdf_path"]
    doc = fitz.open(pdf_path)  # Load the PDF only once
    
    extracted_text = []
    extracted_images = []
    output_folder = "extracted_images"
    os.makedirs(output_folder, exist_ok=True)

    # Iterate through each page
    for page_number, page in enumerate(doc):
        # Extract text
        text = page.get_text("text")
        extracted_text.append(text)

        # Extract images
        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            img_filename = f"{output_folder}/page_{page_number+1}_img_{img_index+1}.{image_ext}"
            
            with open(img_filename, "wb") as img_file:
                img_file.write(image_bytes)
            
            extracted_images.append(img_filename)

    # Combine text from all pages
    full_text = "\n".join(extracted_text)

    # Update state
    return {"extracted_text": full_text, "extracted_images": extracted_images}

In [7]:
condenser_instruction = """ 
You are an AI assistant specialized in processing research papers. 

Here is the text extracted from a research paper: {extracted_text}

When tasked with extracting information from the provided text, follow these guidelines, and structure the content accordingly:
1. **Metadata Extraction:** Identify and extract:
   - Authors  
   - Title  
   - Submission Date  
   - Keywords  
   - References (return as a list) 

2. **Text Structuring:** Organize the content into:
   - Abstract  
   - Conclusion  
   - Body (as a list of sections or paragraphs)  

3. **Slide Summaries:** If applicable, condense key points into structured slide content, ensuring clarity and coherence.

Ensure the extracted content is well-structured, concise, and retains essential details.

"""
parser = PydanticOutputParser(pydantic_object=ResPaperText)

condenser_template = ChatPromptTemplate(
   messages=[("system", condenser_instruction),
   ("human", "Extract the details from the given text")],
   input_variables=["extracted_text"],
   partial_variables={"format_instructions": parser.get_format_instructions()},
)

def get_data(state: ResPaperExtractState):
    extracted_text = state["extracted_text"]
    structured_llm = llm.with_structured_output(ResPaperText)
    condenser_prompt = condenser_template.format(extracted_text=extracted_text)
    response = structured_llm.invoke(condenser_prompt)
    return {"metadata": response}

In [9]:
condenser_instruction = """ 
You are an AI assistant specialized in processing research papers. 

Here is the text extracted from a research paper: {extracted_text}

When tasked with extracting information from the provided text, follow these guidelines, and structure the content accordingly:
1. **Metadata Extraction:** Identify and extract:
   - Authors  
   - Title  
   - Submission Date  
   - Keywords  
   - References (return as a list) 

2. **Text Structuring:** Organize the content into:
   - Abstract  
   - Conclusion  
   - Body (as a list of sections or paragraphs)  

3. **Slide Summaries:** If applicable, condense key points into structured slide content, ensuring clarity and coherence.

Ensure the extracted content is well-structured, concise, and retains essential details.

"""
parser = PydanticOutputParser(pydantic_object=ResPaperText)

condenser_template = ChatPromptTemplate(
   messages=[("system", condenser_instruction),
   ("human", "Given are the extracted details from a research paper, try to make an ppt from the given extracted text, you can add some basic topic from the related format. {extracted_text}")],
   input_variables=["extracted_text"],
   partial_variables={"format_instructions": parser.get_format_instructions()},
)

def get_ppt_text(state: ResPaperExtractState):
    extracted_text = state["metadata"]
    structured_llm = llm.with_structured_output(ResPaperText)
    condenser_prompt = condenser_template.format(extracted_text=extracted_text)
    response = structured_llm.invoke(condenser_prompt)
    return {"metadata": response}

In [10]:
builder = StateGraph(ResPaperExtractState)

builder.add_node("pdf-2-text", load_pdf)
builder.add_node("text-condensation", get_data)
builder.add_node("make-ppt-text", get_ppt_text)

builder.add_edge(START, "pdf-2-text")
builder.add_edge("pdf-2-text", "text-condensation")
builder.add_edge("text-condensation", END)

graph = builder.compile()

In [None]:
state_output = graph.invoke({"pdf_path": "C:\\Users\\Mihir Patel\\Downloads\\1706.03762v7.pdf"})

APIStatusError: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.2-1b-preview` in organization `org_01jjve6bv7fhjsqmspe7a3fpmr` service tier `on_demand` on tokens per minute (TPM): Limit 7000, Requested 20071, please reduce your message size and try again. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}

: 

In [47]:
for key, value in state_output.items():
    print(f"Node: {key}")
    if isinstance(value, str) or isinstance(value, list):
        print(value)
    else:
        print(state_output[key])

Node: pdf_path
C:\Users\milap\OneDrive\Desktop\CLG\3rd YR\SEM VI\mined_2025\lib\server\Milap_Tathya_ICC_June_2025.pdf
Node: extracted_text
ConvNeXt-based Multi-Class Hydrocarbon Spill
Classification in Hyperspectral Imagery
Milap Patel, Tathya Patel, Anuja Nair, Member, IEEE, Tarjni Vyas, Shivani Desai,
Sudeep Tanwar, Senior Member, IEEE
Department of Computer Science and Engineering, School of Technology, Nirma University, Ahmedabad, Gujarat, India
Emails: 22bce186@nirmauni.ac.in, 22bce352@nirmauni.ac.in, anuja.nair@nirmauni.ac.in,
tarjni.vyas@nirmauni.ac.in, shivani.desai@nirmauni.ac.in, sudeep.tanwar@nirmauni.ac.in
Abstract—This paper proposes a new approach of hydrocarbon
spill detection using hyperspectral imaging (HSI) and fine-tuning
ConvNeXt convolutional neural network (CNN). Hydrocarbon
spill hyperspectral dataset (HSHD) containing 124 HSIs into four
classes-cleans, gasoline, motor oil, and thinner is used in the
training as well as testing phase. To overcome the computationa