In [1]:
import re
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from langchain.document_loaders import PyPDFLoader
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Spacer, Image as ReportLabImage, Paragraph
from reportlab.lib.enums import TA_CENTER
from llmsherpa.readers import LayoutPDFReader
from bs4 import BeautifulSoup
import requests
from PIL import Image as PILImage
from spire.pdf import PdfDocument
from spire.pdf.common import ImageFormat

In [2]:
pdf_url = 'https://arxiv.org/pdf/1706.03762.pdf'
loader = PyPDFLoader(pdf_url)
text = str(loader.load())
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

In [3]:
llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
pdf_reader = LayoutPDFReader(llmsherpa_api_url)
doc = pdf_reader.read_pdf(pdf_url)
doc = doc.to_html()

In [4]:
soup = BeautifulSoup(doc, 'html.parser')

headers = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td'])
headers_text = [header.get_text(strip=True).strip() for header in headers]
headers_text = list(dict.fromkeys(headers_text))

pattern = re.compile(r'^\d+(\.\d+)*\s')

numbered_headers = [header for header in headers_text if pattern.match(header)]
intro_present = any(re.match(r'^1\s+Introduction', header, re.IGNORECASE) for header in numbered_headers)

if not intro_present:
    index_for_intro = next((i for i, header in enumerate(numbered_headers) if header.startswith('2 ')), 0)
    numbered_headers.insert(index_for_intro, '1 Introduction')


In [5]:
updated_pattern = re.compile(r'''
    ^                         # Начало строки
    (\d+(\.\d+)*)             # Номер раздела или подраздела (например, "3", "3.2", "3.2.1")
    (\s+[A-Za-z].*)           # Пробел и название раздела, начинающееся с буквы
    $                         # Конец строки
    |                         # ИЛИ
    ^\d+\s\d+\.\d+\s          # Начинается с чисел, разделенных пробелами, с точкой между числами
    (\d+\.\d+\s+)*            # Продолжается серией чисел с точками и пробелами
    \d+K?\.\d+                # Заканчивается на число с десятичной частью, возможно с "K"
    (\s\d+)*                  # За которым следуют пробелы и числа
    $                         # Конец строки
''', re.VERBOSE)
headers = [header for header in numbered_headers if updated_pattern.match(header)]
headers.append("References") if "References" in text else "REFERENCES"

In [6]:
response = requests.get(pdf_url)

if response.status_code == 200:
    with open('arXiv_doc.pdf', 'wb') as file:
        file.write(response.content)
    doc = PdfDocument()
    doc.LoadFromFile('arXiv_doc.pdf')
    
images = []

for i in range(doc.Pages.Count):
    page = doc.Pages.get_Item(i)
    for image in page.ExtractImages():
        images.append(image)
        
index = 0

for image in images:
    imageFileName = r'images\image_{0}.png'.format(index).format(index)
    index += 1
    image.Save(imageFileName, ImageFormat.get_Png())
    
doc.Close()

In [7]:
def summerize(text):
    inputs_no_trunc = tokenizer(text, max_length=None, return_tensors='pt', truncation=False)
    
    chunk_start = 0
    chunk_end = tokenizer.model_max_length  # == 1024 for Bart
    inputs_batch_lst = []
    space_token_id = tokenizer.encode(' ', add_special_tokens=False)[0]
    
    while chunk_start <= len(inputs_no_trunc['input_ids'][0]):
        try:
            current_chunk = inputs_no_trunc['input_ids'][0][chunk_start:chunk_end].tolist()
            end_index = len(current_chunk) - 1 - current_chunk[::-1].index(space_token_id)
            chunk_end = chunk_start + end_index
        except ValueError:
            pass
        inputs_batch = inputs_no_trunc['input_ids'][0][chunk_start:chunk_end] # get batch of n tokens
        inputs_batch = torch.unsqueeze(inputs_batch, 0)
        inputs_batch_lst.append(inputs_batch)
        chunk_start = chunk_end + 1
        chunk_end = min(chunk_start + tokenizer.model_max_length, len(inputs_no_trunc['input_ids'][0]))
    
    summary_ids_lst = [model.generate(inputs, num_beams=4, max_length=500, early_stopping=True) for inputs in inputs_batch_lst]
                                                                    # тут 
    summary_batch_lst = []
    for summary_id in summary_ids_lst:
        summary_batch = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_id]
        summary_batch_lst.append(summary_batch[0])
    summary_all = '\n'.join(summary_batch_lst)
    return summary_all

In [8]:
patternForBrackets = re.compile(r'\[\s*\d+(?:,\s*\d+)*\s*\]')

pdf_path = "summary.pdf"
doc = SimpleDocTemplate(pdf_path, pagesize=A4)
styles = getSampleStyleSheet()
style = styles['Normal']
elements = []

for header in range(len(headers)-1):
    startHeader = headers[header]
    endHeader = headers[header + 1]
    print(endHeader)
    dot_count = startHeader.count('.')
    if dot_count == 0:
        header_style = styles['Heading2']
    elif dot_count != 0:
        header_style = styles['Heading5']
    
    pattern = re.compile(re.escape(startHeader) + "(.*?)" + re.escape(endHeader), re.DOTALL)
    match = pattern.search(text)
    if match:
        text_between = match.group(1)
        text_without_brackets = patternForBrackets.sub('', text_between)
        text_final = text_without_brackets.replace("\\n","")
        text_final = text_final.replace("\n","")

        summary_text = summerize(text_final)
        
        elements.append(Paragraph(startHeader, header_style))
        elements.append(Paragraph(summary_text, style))
        elements.append(Spacer(1, 12))
        

styles = getSampleStyleSheet()

for img_num in range(len(images)):
    image_path = f'images/image_{img_num}.png'
    pil_image = PILImage.open(image_path)
    real_width, real_height = pil_image.size

    dpi = 72  # или используйте реальное DPI изображения, если оно вам известно
    width_in_points = real_width / dpi * 10
    height_in_points = real_height / dpi * 10

    # Убедитесь, что размеры являются числами, а не строками
    width_in_points = float(width_in_points)
    height_in_points = float(height_in_points)

    # Добавление изображения с использованием ReportLab Image
    img = ReportLabImage(image_path, width_in_points, height_in_points)
    elements.append(img)
    
    # Добавление заголовка
    centered_style = ParagraphStyle(name='CenteredStyle', parent=styles['Normal'], alignment=TA_CENTER)
    elements.append(Paragraph(f"Figure {img_num}", centered_style))
    elements.append(Spacer(1, 12))
    
doc.build(elements)


2 Background
3 Model Architecture
3.2.1 Scaled Dot-Product Attention
3.2.2 Multi-Head Attention
3.2.3 Applications of Attention in our Model
3.3 Position-wise Feed-Forward Networks
3.4 Embeddings and Softmax
3.5 Positional Encoding
4 Why Self-Attention
5 Training
5.1 Training Data and Batching
5.2 Hardware and Schedule
5.3 Optimizer
5.4 Regularization
6 Results


Token indices sequence length is longer than the specified maximum sequence length for this model (1656 > 1024). Running this sequence through the model will result in indexing errors


7 Conclusion
References
