In [2]:
# Import necessary libraries
import os  # builds and checks the PDF file path
from pypdf import PdfReader  # opens the PDF and exposes its pages for text extraction
from langchain.text_splitter import RecursiveCharacterTextSplitter  # splits the extracted text into overlapping chunks

# Simple confirmation that every import above resolved without error
print("Libraries imported successfully")

Libraries imported successfully


In [None]:
# Load the source PDF and extract its text into `full_document_text`.
#
# The path is relative to the current working directory, so this assumes the
# notebook kernel is running in the 'backend' directory.
pdf_file_path = os.path.join('..', 'data', 'Attention Is All You Need.pdf')

# Fail fast with a clear error when the file is missing: nothing below can
# run without it. (No `else` needed: the raise ends the cell.)
if not os.path.exists(pdf_file_path):
    raise FileNotFoundError(f"The file {pdf_file_path} does not exist.")

print(f"Loading PDF from: {pdf_file_path}")
try:
    # Initialize the PDF reader
    reader = PdfReader(pdf_file_path)

    # Collect the text of each page and join once at the end instead of
    # growing a string with repeated `+=`. Pages that yield no text are
    # skipped; every kept page is terminated with a newline.
    page_texts = []
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            page_texts.append(page_text + "\n")
    full_document_text = "".join(page_texts)

    if full_document_text:
        print(f"Successfully loaded document with {len(full_document_text)} characters.")
    else:
        # The file opened fine but no page produced text. Say so explicitly
        # rather than reporting a "successful" 0-character load; the empty
        # string is falsy, so later `if full_document_text:` guards skip.
        print("Warning: the PDF was read but no text could be extracted from any page.")

except Exception as e:
    # Deliberate best-effort: report the problem and leave a defined,
    # falsy value so later cells can detect the failure and skip.
    print(f"An error occurred while reading the PDF: {e}")
    full_document_text = None  # Ensure variable exists but is None on error

Loading PDF from: ../data/Attention Is All You Need.pdf
Successfully loaded document with 39602 characters.


In [None]:
# Split the loaded document text into overlapping chunks (`text_chunks`).
# Runs only when the previous cell produced non-empty text; otherwise
# `text_chunks` is set to an empty list so the name is always defined.
if full_document_text:
    print("Initializing text splitter...")
    # Initialize the Recursive Character Text Splitter.
    # Sizes are measured with `length_function`; with `len` on a str that
    # means characters, not tokens.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,  # TODO: Might change to tokenizer later
        separators=["\n\n", "\n", " ", ""]  # Prioritize splitting by paragraph, then line, then space
    )

    print("Splitting document into chunks...")
    # Split the document into chunks
    text_chunks = text_splitter.split_text(full_document_text)

    print(f"Document split into {len(text_chunks)} chunks.")

    # Examine the first few chunks. `zip` stops at the shorter input, so a
    # short document that yields fewer than three chunks is previewed
    # without raising IndexError.
    for ordinal, chunk in zip(("First", "Second", "Third"), text_chunks):
        print(f"\n--- {ordinal} Chunk ---")
        print(chunk)

else:
    print("Skipping chunking because document text was not loaded.")
    text_chunks = []  # Ensure variable exists (empty) when chunking is skipped

Initializing text splitter...
Splitting document into chunks...
Document split into 50 chunks.

--- First Chunk ---
Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗ †
University of Toronto
aidan@cs.toronto.edu
Łukasz Kaiser∗
Google Brain
lukaszkaiser@google.com
Illia Polosukhin∗ ‡
illia.polosukhin@gmail.com
Abstract
The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architectu