In [1]:
from langchain_ollama import ChatOllama
from langchain_groq import ChatGroq
from langchain.prompts import ChatPromptTemplate, PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from typing import List, Dict, Any
from typing_extensions import TypedDict
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langgraph.graph import StateGraph, START, END
from IPython.display import Image, display
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langchain_core.output_parsers import PydanticOutputParser
import os
from dotenv import load_dotenv # type: ignore
from typing import List, Dict, Any, Optional
import fitz # type: ignore
from pydantic import BaseModel, Field # type: ignore
import regex as re
import string
load_dotenv()

False

In [2]:
os.environ["LANGSMITH_PROJECT"] = f"MineD 2025"
os.environ["GOOGLE_API_KEY"] = f"AIzaSyBQU_iCwA34u1XbnJcekcNEOkgFb3PHJZM"


In [75]:
from langchain_google_genai import ChatGoogleGenerativeAI
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0.1,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [76]:
class ResPaperText(BaseModel):
    authors: str = Field(..., description="List of authors of the research paper")
    title: str = Field(..., description="Title of the research paper")
    submission_date: str = Field(..., description="Submission date of the research paper")
    keywords: List[str] = Field(..., description="List of keywords associated with the research paper")
    references: List[str] = Field(..., description="List of references cited in the research paper")
    abstract: str = Field(..., description="Abstract of the research paper")
    conclusion: str = Field(..., description="Conclusion of the research paper")
    summary: str = Field(..., description="Summary of the research paper")

class SlideContent(BaseModel):
    title: str = Field(..., description="Title of the particular slide")
    bullet_points: Optional[List[str]] = Field(None, description="Content in bullet points form for the slide")
    notes: Optional[str] = Field(None, description="Additional notes for the slide")
    images: Optional[List[str]] = Field(None, description="List of relevant image paths for the slide")

class PPTPresentation(BaseModel):
    title: str = Field(..., description="Title of the presentation")
    authors: List[str] = Field(..., description="List of authors of the presentation")
    institution: str = Field(..., description="Institution associated with the presentation")
    slides: List[SlideContent] = Field(..., description="List of slides, in the presentation,which are SlideContent schemas.")
    

class Dialogue(BaseModel):
    text: str = Field(..., description="The text of dialogue")

class Conversation(BaseModel):
    katherine: List[Dialogue] = Field(..., description="Katherine's dialogues")
    clay: List[Dialogue] = Field(..., description="Clay's dialogues")
    order: List[str] = Field(..., description="The order of dialogues denoted by the names of the speaker")

In [77]:
class ResPaperExtractState(TypedDict):
    pdf_path: Optional[str] = None  # Path to the PDF file
    extracted_text: Optional[str] = None  # Full extracted text from the PDF
    extracted_images: Optional[List[str]] = None  # Paths to extracted images
    slides_content: Optional[List[Dict[str, str]]] = None  # Prepared content for PowerPoint slides
    metadata: str
    ppt_object: PPTPresentation
    convo: Conversation

In [78]:
def load_pdf(state: ResPaperExtractState):
    pdf_path = state["pdf_path"]
    doc = fitz.open(pdf_path)  # Load the PDF only once
    
    extracted_text = []
    extracted_images = []
    output_folder = "extracted_images"
    os.makedirs(output_folder, exist_ok=True)

    # Iterate through each page
    for page_number, page in enumerate(doc):
        # Extract text
        text = page.get_text("text")
        extracted_text.append(text)

        # Extract images
        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            img_filename = f"{output_folder}/page_{page_number+1}_img_{img_index+1}.{image_ext}"
            
            with open(img_filename, "wb") as img_file:
                img_file.write(image_bytes)
            
            extracted_images.append(img_filename)

    # Combine text from all pages
    full_text = "\n".join(extracted_text)

    # Update state
    return {"extracted_text": full_text, "extracted_images": extracted_images}

In [79]:
system_message_condensation = SystemMessagePromptTemplate.from_template(
    """You are an expert AI based researcher, your task is to find out key innovations and the overall summary from a given research paper, you should include the brief information about the authors of the paper,
    The type of paper, the domain of the research paper,title, author, submission date, summary, literature review, methods used, results, discussion, conclusion and references of the given paper.
    
    Try to quantify the summary wherever necessary, also include number of result sections in the conclusion
    
    The summary should contain the simplified summary of the research paper, not the actual abstract, make sure it is lengthy enough to cover all the ideas discussed in the paper, under 2000 words. 
    
    Additional information: 
        - Dont leave out any ideas from the paper
        - Use your knowledge to connects the dots for very hard concepts
        - Give a summary in under 2000 words
        - Also present any innovative ideas to carry out this work
        - Give output in json format
    
    The format for the extraction is as follows: {format_instructions}
    """
)

# Human Message: Supplies extracted text from the research paper
human_message_condensation = HumanMessagePromptTemplate.from_template("Here is the extracted text:\n\n{extracted_text}")

parser_metadata = JsonOutputParser(pydantic_object=ResPaperText)
# Combine into a structured chat prompt
chat_prompt_metadata = ChatPromptTemplate(
    messages=[system_message_condensation, human_message_condensation],
    partial_variables={"format_instructions": parser_metadata.get_format_instructions()}
)

def condense_data(state):
    extracted_text = state["extracted_text"]
    
    prompt = chat_prompt_metadata.invoke({"extracted_text":extracted_text})
    llm_out = llm.invoke(prompt)
    llm_out.content = llm_out.content.replace("```json", "```")
    parsed = parser_metadata.invoke(llm_out)
    
    return {"metadata": parsed}

In [80]:
system_message_ppt = SystemMessagePromptTemplate.from_template(
    """You are an expert in creating PowerPoint presentations. Generate a structured PowerPoint (PPT) presentation 
    that summarizes a research paper based on the provided extracted text. Follow these instructions:
    
    Remember that the objective of this PPT is for a third party to understand the key points of the research paper, and 
    give them a gist of the research paper.

    - Title Slide: Include the research paper title, authors, and institution.
    - Introduction Slide: Summarize the problem, objectives, and motivation.
    - Methods Slide: Briefly explain the methodology, datasets, and experimental setup.
    - Results Slide: Summarize key findings with bullet points. Mention any visuals (graphs, tables) found from the extracted text. You should definetly mention in the presentation any figures related to a performance metric or tables that are mentioned in the extracted text.
    - Discussion Slide: Explain the significance of results and compare with prior work.
    - Conclusion Slide: Summarize key takeaways and potential future work.
    - References Slide: Include citations if available.

    Additional Guidelines:
    - Keep slides concise (use bullet points).
    - Maintain a professional and visually appealing slide design.
    - Give the text in markdown format.
    - Each slide should have rich information content, summarizing the information related to the particular slide heading, 
    and also include some content that is related to the slide heading but not directly mentioned in the extracted text.
    - Also keep in mind that the text for each slide should not be too lengthy, and should be concise and to the point.

    {format_instructions}
    """
)

# Human Message: Supplies extracted text from the research paper
human_message_ppt = HumanMessagePromptTemplate.from_template("Here is the summary of the research paper:\n\n{metadata}")
parser_ppt = JsonOutputParser(pydantic_object=PPTPresentation)

# Combine into a structured chat prompt
chat_prompt_ppt = ChatPromptTemplate(
    messages=[system_message_ppt, human_message_ppt],
    partial_variables={"format_instructions": parser_ppt.get_format_instructions()}
)

def get_ppt_data(state):
    metadata = state["metadata"]
    prompt = chat_prompt_ppt.invoke({"metadata": metadata, "tone": "formal"})
    llm_out = llm.invoke(prompt)
    parsed = parser_ppt.invoke(llm_out)
    return {"ppt_object": parsed}

In [81]:
system_message_podcast = SystemMessagePromptTemplate.from_template(
    """You are an expert in creating/writing scripts for podcast, consider the given scenario, Two people one girl and one boy who are completing their B.Tech degree this year are discussing the given research paper to create an podcast of this research paper
    
    Boy's Name: Clay
    Girl's Name: Katherine
    
    The Girl has complete knowledge about this paper, while the boy doesn't know anything about the paper.
    
    Write a script for a podcast, wherein firstly the girl introduces the paper, but the boy seems clueless, so the boy ask the girl many questions about the paper.
    
    The boy's question should cover all the possible doubt that one can have regarding the paper, and the girl should answer that questions correctly.

    General Guideline:
    - Intro must include the name, application and the authors (and their institution)
    - Consider the audience to be technically sound, so you can ue jargons
    - The boys questions should cover all the aspects from methodology, results, literature review, etc
    - Dont make it too obvious that they are discussing about the paper
    - Make the order such that the question asked by clay in previous dialogue is answered by katherine in this dialogue.

    Additional Guidelines:
    - Output in JSON format, this JSON should have two keys, names of boys and girls, in lower case.
    - Each key corresponds to a list, their dialogues in sequential manner
    - Consider that the girl always starts first
    - Also give the order of dialogues, that are to be taken in a sequence
    - Make sure that the number of dialogues in the order and in the lists add up.
    - Both of them dont have to speak alternatively, they can heave continuous dialogues
    - Each and every question asked by clay has to be answered by katherine

    {format_instructions}
    """
)

# Human Message: Supplies extracted text from the research paper
human_message_podcast = HumanMessagePromptTemplate.from_template("Here is the summary of research paper:\n\n{metadata}. \nMake sure the tone is {tone}")

parser_podcast = JsonOutputParser(pydantic_object=Conversation)
# Combine into a structured chat prompt
chat_prompt_podcast = ChatPromptTemplate(
    messages=[system_message_podcast, human_message_podcast],
    partial_variables={"format_instructions": parser_podcast.get_format_instructions()}
)

def get_data_podcast(state):
    metadata = state["metadata"]
    prompt = chat_prompt_podcast.invoke({"metadata": metadata, "tone": "formal"})
    llm_out = llm.invoke(prompt)
    # llm_out.content = llm_out.content.replace("```json", "```")
    parsed = parser_podcast.invoke(llm_out)
    
    return {"convo": parsed}

In [82]:
builder = StateGraph(ResPaperExtractState)

builder.add_node("pdf-2-text", load_pdf)
builder.add_node("text-condensation", condense_data)
# builder.add_node("make-ppt-text", get_ppt_data)
builder.add_node("make-podcast-text", get_data_podcast)

builder.add_edge(START, "pdf-2-text")
builder.add_edge("pdf-2-text", "text-condensation")
# builder.add_edge("text-condensation", "make-ppt-text")
builder.add_edge("text-condensation", "make-podcast-text")
builder.add_edge("make-podcast-text", END)
# builder.add_edge( "make-ppt-text", END)

graph = builder.compile()

In [83]:
path = r"C:\\Users\\Mihir Patel\\Downloads\\1706.03762v7.pdf"
state_output = graph.invoke({"pdf_path": path})

In [84]:
print(state_output["metadata"]['abstract'])
print(state_output["metadata"]['summary'])

The paper introduces the Transformer, a novel neural network architecture based solely on attention mechanisms, eliminating recurrence and convolutions.  Evaluated on machine translation tasks, the Transformer surpasses existing models in quality, parallelization, and training speed.  It achieves a 28.4 BLEU score on the WMT 2014 English-to-German task and a 41.8 BLEU score on the WMT 2014 English-to-French task, setting new state-of-the-art results.  The model's generalizability is demonstrated through successful application to English constituency parsing.
This research paper, authored by a team of researchers primarily from Google Brain and Google Research, introduces the Transformer, a groundbreaking neural network architecture for sequence transduction tasks.  The key innovation lies in its reliance solely on attention mechanisms, abandoning the traditional recurrent and convolutional layers used in previous state-of-the-art models like recurrent neural networks (RNNs) and convolu

In [85]:
convo = state_output["convo"]
print(convo['katherine'].__len__())
print(convo['clay'].__len__())
print(convo['order'])

kat_index = 0
clay_index = 0

for speaker in convo['order']:
    dialogue = None
    if speaker == 'katherine':
        dialogue = convo['katherine'][kat_index]
        kat_index += 1
    else:
        dialogue = convo['clay'][kat_index]
        clay_index += 1
    print(f"{speaker}: {dialogue['text']}")

7
7
['katherine', 'clay', 'katherine', 'clay', 'katherine', 'clay', 'katherine', 'clay', 'katherine', 'clay', 'katherine', 'clay', 'katherine']
katherine: Hey Clay, ready to dive into this week's podcast topic?  It's a pretty groundbreaking paper, 'Attention is All You Need,' by Vaswani et al. from Google Brain and Google Research. It introduces the Transformer, a neural network architecture for sequence transduction tasks, focusing entirely on attention mechanisms.
clay: Okay, I get the encoder-decoder structure, but this multi-head self-attention... can you explain that in more detail? How does it work, and why is it so crucial?
katherine: The core idea is to replace recurrence and convolutions entirely with attention. This allows for significantly greater parallelization during training, leading to massive speedups.  They achieved state-of-the-art results on machine translation benchmarks like WMT 2014 English-to-German and English-to-French, and even showed promising results on Eng

In [None]:
ppt_content["authors"]

In [None]:
from pptx import Presentation
from pptx.util import Pt, Inches
from pptx.enum.text import PP_ALIGN, MSO_ANCHOR, MSO_AUTO_SIZE

def create_ppt_from_dict(ppt_data: dict, output_file: str = "presentation.pptx"):
    prs = Presentation()
    slide_width = prs.slide_width
    slide_height = prs.slide_height

    # Title Slide Fix
    title_slide_layout = prs.slide_layouts[0]  # Title slide layout
    title_slide = prs.slides.add_slide(title_slide_layout)
    title = title_slide.shapes.title
    main_title = title_slide.placeholders[0]
    main_title.top = Inches(2.5)  # Adjust the value as needed
    main_title.width = Inches(9)  # Set the width to a desired value
    main_title.left = (slide_width - main_title.width) // 2
    subtitle = title_slide.placeholders[1]
    subtitle.top = Inches(5.5)  # Adjust the value as needed
    subtitle.width = Inches(10)  # Set the width to a desired value

    # Title Formatting
    title.text = ppt_data.get("title", "Presentation Title")
    title.text_frame.paragraphs[0].font.size = Pt(40)  # Adjust title font size
    title.text_frame.paragraphs[0].alignment = PP_ALIGN.CENTER  # Ensure center alignment

    # Subtitle Formatting (Author & Institution)
    subtitle.text = ", ".join(ppt_data.get("authors", [])) + "\n" + ", ".join(ppt_data.get("institution", []))
    subtitle.text_frame.paragraphs[0].font.size = Pt(18)  # Reduce subtitle font size
    subtitle.text_frame.paragraphs[0].alignment = PP_ALIGN.CENTER

    # Adjust Subtitle Positioning
    left = subtitle.left
    top = int(subtitle.top * 0.7)  # Move subtitle slightly up
    width = subtitle.width
    height = subtitle.height
    subtitle.text_frame.auto_size = MSO_AUTO_SIZE.SHAPE_TO_FIT_TEXT
    subtitle.text_frame.word_wrap = True

    subtitle.text_frame.margin_top = Pt(15)  # Reduce top margin
    subtitle.text_frame.margin_bottom = Pt(5)  # Reduce bottom margin
    subtitle.text_frame.vertical_anchor = MSO_ANCHOR.MIDDLE  # Center text in box

    # Add content slides
    for i in range(1, len(ppt_data["slides"])):
        slide_data = ppt_data["slides"][i]
        # Use a different layout for content slides to avoid duplicating the title slide layout
        slide_layout = prs.slide_layouts[1]  # Title & Content layout for content slides
        slide = prs.slides.add_slide(slide_layout)
        title = slide.shapes.title
        content = slide.placeholders[1]

        title.text = slide_data.get("title", "Slide Title")
        
        bullet_points = slide_data.get("bullet_points", [])
        if bullet_points:
            text_frame = content.text_frame
            text_frame.clear()  # Remove default placeholder text
            text_frame.word_wrap = True  # Enable text wrapping
            text_frame.auto_size = MSO_AUTO_SIZE.SHAPE_TO_FIT_TEXT  # Enable auto size for content

            # Set default font size based on slide type
            is_references = "references" in slide_data.get("title", "").lower()
            DEFAULT_FONT_SIZE = 14 if is_references else 24

            # Add bullet points
            for point in bullet_points:
                p = text_frame.add_paragraph()
                p.text = point
                p.font.size = Pt(DEFAULT_FONT_SIZE)

            # Adjust font size dynamically for non-references slides
        

    # Save PowerPoint file
    prs.save(output_file)
    print(f"PowerPoint presentation saved as {output_file}")



create_ppt_from_dict(ppt_content, "sufia.pptx")