## Import Required Libraries


In [29]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import requests
import xml.etree.ElementTree as ET
from pdfminer.high_level import extract_text
import os
from peft import PeftModel, PeftConfig
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer
)

## Set Search Query

In [30]:
# Free-text search phrase; passed to fetch_arxiv_papers() in the fetch cell below.
query = "semantic segmentation"

## Fetching Papers
Function to Fetch ArXiv Papers

In [31]:
def fetch_arxiv_papers(query, max_results=1):
    """
    Fetches the PDF links of arXiv papers matching the given query.

    Args:
        query (str): The query to search arXiv for papers
        max_results (int): Maximum number of papers to request (default 1)

    Returns:
        pdf_links (list): List of pdf links for the papers; empty when the
            API does not answer with HTTP 200 or no entry carries a pdf link
    """
    atom_ns = "{http://www.w3.org/2005/Atom}"

    # Hand the query to requests via `params` so it is URL-encoded; characters
    # such as "&" or "#" in the search phrase can no longer corrupt the URL.
    response = requests.get(
        "http://export.arxiv.org/api/query",
        params={"search_query": query, "start": 0, "max_results": max_results},
        timeout=30,  # requests waits indefinitely unless a timeout is given
    )
    if response.status_code != 200:
        print("Failed to fetch data from arXiv")
        return []

    # Parse the raw bytes so the XML parser applies the feed's own encoding
    # declaration instead of relying on requests' decoded text.
    root = ET.fromstring(response.content)
    return [
        link.attrib["href"]
        for entry in root.findall(f"{atom_ns}entry")
        for link in entry.findall(f"{atom_ns}link")
        if link.attrib.get("title") == "pdf"
    ]

Fetch PDF Links

In [32]:
# Search arXiv for the configured query and report the first PDF link, if any.
pdf_links = fetch_arxiv_papers(query)
print(f"Found PDF link: {pdf_links[0]}" if pdf_links else "No PDF links found.")

Found PDF link: http://arxiv.org/pdf/2304.10326v1


Download PDF File


In [33]:
# Local filename the downloaded paper is written to; the cleanup cell at the
# end of the notebook removes it again.
pdf_file = "1.pdf"

# The search may legitimately return nothing; fail with a clear message here
# instead of an IndexError on pdf_links[0].
if not pdf_links:
    raise RuntimeError("No PDF links found; cannot download a paper.")

# requests has no default timeout, so bound the download explicitly.
response = requests.get(pdf_links[0], timeout=60)

if response.status_code == 200:
    with open(pdf_file, "wb") as f:
        f.write(response.content)
    print(f"PDF saved to {pdf_file}")
else:
    print("Failed to fetch PDF")

PDF saved to 1.pdf


Extract Text from PDF

In [34]:
# Same file the download cell wrote (it stores the literal "1.pdf" in `pdf_file`).
pdf_path = "1.pdf"
# extract_text comes from pdfminer.high_level; `text` is the input passed to
# both summarization functions further down.
text = extract_text(pdf_path)

## Model 1: T5-small
Loading the fine-tuned T5-small model for summarization

In [35]:
# The tokenizer is loaded from the stock "t5-small" checkpoint, while the model
# weights are loaded from the fine-tuned copy under ../practice.
model_checkpoint1 = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint1)
model1 = T5ForConditionalGeneration.from_pretrained("../practice/t5-small-finetuned-arxiv")

Summarization Function for T5-small

In [36]:
def test_summary_t5_small(input_text):
    """
    Generates a summary for the given input text using the T5 small model

    The input is prefixed with "summarize: " and truncated to 512 tokens;
    generation uses nucleus sampling (do_sample=True, top_p=0.9).

    Args:
        input_text (str): The input text to summarize

    Returns:
        str: The generated summary
    """
    # Resolve the T5-small tokenizer here rather than reading the global
    # `tokenizer`: a later cell rebinds that global to the FLAN-T5 tokenizer,
    # and globals are looked up when the function is called, not when it is
    # defined. The loaded tokenizer is cached on the function object so it is
    # only loaded on the first call.
    t5_small_tokenizer = getattr(test_summary_t5_small, "_tokenizer", None)
    if t5_small_tokenizer is None:
        t5_small_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint1)
        test_summary_t5_small._tokenizer = t5_small_tokenizer

    inputs = t5_small_tokenizer(
        "summarize: " + input_text,
        max_length=512,
        truncation=True,
        return_tensors="pt"
    )

    outputs = model1.generate(
        inputs.input_ids,
        max_new_tokens=512,
        do_sample=True,
        top_p=0.9,
    )

    # generate() returns a batch; only one sequence was submitted.
    return t5_small_tokenizer.decode(outputs[0], skip_special_tokens=True)

## Model 2: FLAN-T5-base
Load the PEFT-tuned FLAN-T5-base model

In [37]:
# Directory holding the PEFT adapter; its config names the base checkpoint.
peft_model_id = "../practice/lora-flan-t5-base/model"
config = PeftConfig.from_pretrained(peft_model_id)

# NOTE: this rebinds the notebook-wide `tokenizer` name that the T5-small
# cell also assigns.
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the base seq2seq model, then wrap it with the adapter weights.
model2 = PeftModel.from_pretrained(
    AutoModelForSeq2SeqLM.from_pretrained(
        config.base_model_name_or_path,
        device_map="auto",
    ),
    peft_model_id,
)

  adapters_weights = torch.load(


Define Summarization Function for FLAN-T5-base

In [38]:
def test_summary_FLAN_T5_base(input_text):
    """
    Summarizes the given text with the PEFT-wrapped FLAN T5 base model (`model2`).

    The text is prefixed with "summarize: ", truncated to 1024 tokens and
    moved to the model's device; generation uses nucleus sampling
    (do_sample=True, top_p=0.9) with up to 512 new tokens.

    Args:
        input_text (str): The input text to summarize

    Returns:
        str: The generated summary
    """
    prompt = "summarize: " + input_text
    encoded = tokenizer(
        prompt,
        max_length=1024,
        truncation=True,
        return_tensors="pt",
    )
    encoded = encoded.to(model2.device)

    generation_kwargs = dict(max_new_tokens=512, do_sample=True, top_p=0.9)
    generated = model2.generate(input_ids=encoded.input_ids, **generation_kwargs)

    # Only one sequence was submitted, so decode the first (and only) result.
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

Generate Summaries Using Both Models

In [39]:
# Summarize the extracted PDF text with the T5-small model; the function
# truncates its input to 512 tokens.
summary_t5_small = test_summary_t5_small(text)

In [40]:
# Summarize the same extracted text with the FLAN-T5-base model; the function
# truncates its input to 1024 tokens.
summary_FLAN_T5_base = test_summary_FLAN_T5_base(text)

Print Generated Summaries

In [41]:
# Show the T5-small summary produced above.
print(f"Summary from T5-small: {summary_t5_small}")

Summary from T5-small: in this technical report, we demonstrate our solution for 2019 COCO panoptic segmentation task. we use instance segmentation and semantic segmen- tation as a method which performs the s c in- stance and semantic segmen- tation separately, then combines the two to generate panop-tic segmentation results. our technique performs the instance segmentation and semantic segmen- tation separately, then combines the two to generate panop-tic segmentation results. to facilitate the results in this results, we use the same way as we have achieved an impressive re- sult, we compare the results of


In [42]:
# Show the FLAN-T5-base summary produced above.
print(f"Summary from FLAN-T5-base: {summary_FLAN_T5_base}")

Summary from FLAN-T5-base: **Main Research Objective:** To develop a joint COCO/Mapillary Workshop strategy for the 2019 COCO panoptic segmentation challenge. **Methodology:** * Created a dynamic model, Mask R-CNN, to extract the pixel-level category labels of a given image. * Recommendations to a network architecture for mask segmentation and semantic segmentation. * Contested several combinations of in-stance and semantic segmentation to achieve a strong performance. **Key Findings:** * Mask R-CNN was the best model for in-stance segmentation. * Mask R-CNN extended fast R-CNN (Fast R-CNN) to achieve the highest possible performance. * HTC model provided the best in-stance segmentation results for both the mask segmentation task and the semantic segmentation task. * The combined model was optimized for the best results on the Coco-Mapillary Workshop dataset. * The model achieved an optimal performance in the Coco-Mapillary Workshop task, achieving an overall performance of 47.1 based 

Cleanup: Remove Downloaded PDF

In [43]:
# Guard the delete so that re-running this cell, or running it after a failed
# download, does not raise FileNotFoundError.
if os.path.exists(pdf_file):
    os.remove(pdf_file)