In [9]:
import os
import re
import PyPDF2
import requests
from io import BytesIO
from nltk.tokenize import sent_tokenize

def read_pdf_from_local(file_path):
    """Extract the text of every page of a PDF stored on disk.

    Args:
        file_path: Path of the PDF file to open (read in binary mode).

    Returns:
        A single string in which each page contributes
        ``"Page <n>:\\n<page text>\\n\\n"``, with ``n`` counted from 1.
    """
    page_blocks = []
    with open(file_path, 'rb') as handle:
        reader = PyPDF2.PdfReader(handle)
        # Pages are extracted while the file is still open.
        for number, page in enumerate(reader.pages, start=1):
            page_blocks.append(f"Page {number}:\n{page.extract_text()}\n\n")
    return ''.join(page_blocks)

def read_pdf_from_url(url, timeout=30):
    """Download a PDF over HTTP(S) and extract the text of every page.

    Args:
        url: Address of the PDF document.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A single string in which each page contributes
        ``"Page <n>:\\n<page text>\\n\\n"``, with ``n`` counted from 1.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail fast on an error status instead of handing an HTML error page
    # to the PDF parser, which would only produce a confusing parse error.
    response.raise_for_status()
    pdf_reader = PyPDF2.PdfReader(BytesIO(response.content))
    return ''.join(
        f"Page {number}:\n{page.extract_text()}\n\n"
        for number, page in enumerate(pdf_reader.pages, start=1)
    )

def create_tokens(text, max_chars=300, sentence_splitter=None):
    """Group the sentences of page-marked text into page-labelled chunks.

    The text is first cut at every line that starts with a ``Page <digits>:``
    marker. The text following a marker (up to the next marker) belongs to
    that page; anything before the first marker gets the page ``None``.
    Each page's text is then split into sentences, and consecutive sentences
    are packed into a chunk while the chunk stays within ``max_chars``.

    Unlike a sentence-level marker check, this keeps the sentence that shares
    a line/sentence with the marker, finds markers that are not at the start
    of a tokenized sentence, and never lets a chunk span two pages, so the
    page label of every chunk is accurate.

    Args:
        text: Text containing ``Page <n>:`` markers at the start of a line.
        max_chars: Size limit for a chunk. The running length includes the
            single space appended after each sentence. A sentence longer than
            the limit is emitted as a chunk of its own (it is not cut).
        sentence_splitter: Optional callable ``str -> iterable of str`` used
            to split a page into sentences. Defaults to ``sent_tokenize``.

    Returns:
        A list of ``(page, chunk)`` tuples in document order; ``page`` is an
        ``int`` or ``None``, ``chunk`` is a non-empty stripped string.
    """
    split_sentences = sent_tokenize if sentence_splitter is None else sentence_splitter

    # With one capture group, re.split yields
    # [text_before_first_marker, page_no, page_text, page_no, page_text, ...].
    parts = re.split(r'^Page (\d+):', text, flags=re.MULTILINE)
    sections = [(None, parts[0])]
    sections.extend(
        (int(parts[i]), parts[i + 1]) for i in range(1, len(parts), 2)
    )

    tokens = []
    for page, body in sections:
        current_token = ""
        for sentence in split_sentences(body):
            sentence = sentence.strip()
            if not sentence:
                continue
            if len(current_token) + len(sentence) <= max_chars:
                current_token += sentence + ' '
            else:
                # Nothing is pending when the very first sentence is already
                # over the limit; skip the flush rather than emit "".
                if current_token:
                    tokens.append((page, current_token.strip()))
                current_token = sentence + ' '
        if current_token:
            tokens.append((page, current_token.strip()))
    return tokens

def save_tokens_to_file(tokens, file_path):
    """Write page-labelled chunks to a single UTF-8 text file.

    Each ``(page, token)`` pair is written as ``"Page <page>:\\n<token>\\n\\n"``.
    An existing file at ``file_path`` is overwritten.

    Args:
        tokens: Iterable of ``(page, token)`` pairs.
        file_path: Destination file. Missing parent directories are created.
    """
    parent_dir = os.path.dirname(file_path)
    # os.makedirs('') raises FileNotFoundError, so only create a directory
    # when the path actually names one (a bare file name has none).
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(f"Page {page}:\n{token}\n\n" for page, token in tokens)

def main():
    """Interactively convert a PDF into page-labelled chunks saved on disk.

    Prompts for a PDF location (a local path, or a URL starting with
    ``http://`` / ``https://``), extracts and chunks its text, prompts for an
    output file name, and writes the chunks into the ``knowledge_base`` folder.
    """
    source = input("Enter the path to the local PDF file or the URL of the PDF: ")

    # Anything that does not look like an HTTP(S) URL is treated as a local path.
    is_remote = source.startswith(('http://', 'https://'))
    load_pdf = read_pdf_from_url if is_remote else read_pdf_from_local
    page_chunks = create_tokens(load_pdf(source))

    # The output name is requested only after the PDF has been processed.
    file_name = input("Enter the desired file name (including extension, e.g., all_tokens.txt): ")

    destination = os.path.join("knowledge_base", file_name)
    save_tokens_to_file(page_chunks, destination)

    print(f"Tokens saved to the knowledge_base folder in the file: {file_name}")

# Start the interactive workflow when this code runs as the top-level module
# (importing it from elsewhere only defines the functions).
if __name__ == "__main__":
    main()


Tokens saved to the knowledge_base folder in the file: AI


In [11]:
import os
import re
import PyPDF2
import requests
from io import BytesIO
from nltk.tokenize import sent_tokenize

def read_pdf_from_local(file_path):
    """Extract the text of every page of a PDF stored on disk.

    Args:
        file_path: Path of the PDF file to open (read in binary mode).

    Returns:
        A single string with one line-prefixed entry per page, each of the
        form ``"Page <n>: <page text>\\n"``, with ``n`` counted from 1.
    """
    with open(file_path, 'rb') as stream:
        pages = PyPDF2.PdfReader(stream).pages
        # The join consumes the generator before the file is closed.
        return ''.join(
            f"Page {index}: {page.extract_text()}\n"
            for index, page in enumerate(pages, start=1)
        )

def read_pdf_from_url(url, timeout=30):
    """Download a PDF over HTTP(S) and extract the text of every page.

    Args:
        url: Address of the PDF document.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A single string with one entry per page, each of the form
        ``"Page <n>: <page text>\\n"``, with ``n`` counted from 1.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail fast on an error status instead of handing an HTML error page
    # to the PDF parser, which would only produce a confusing parse error.
    response.raise_for_status()
    pdf_reader = PyPDF2.PdfReader(BytesIO(response.content))
    return ''.join(
        f"Page {number}: {page.extract_text()}\n"
        for number, page in enumerate(pdf_reader.pages, start=1)
    )

def create_tokens(text, max_chars=300):
    """Pack the sentences of ``text`` into chunks of limited size.

    Sentences are found by splitting on a whitespace character that follows
    ``.`` or ``?``, except where the preceding characters look like an
    abbreviation (``x.y.``-style or a capitalised two-letter form such as
    ``Mr.``). Consecutive sentences are appended to the current chunk while
    it stays within ``max_chars``; otherwise the chunk is closed and a new one
    is started.

    Args:
        text: The text to split.
        max_chars: Size limit for a chunk. The running length includes the
            single space appended after each sentence. A sentence longer than
            the limit is emitted as a chunk of its own (it is not cut).

    Returns:
        A list of non-empty, stripped chunk strings in document order. Empty
        or whitespace-only input yields an empty list.
    """
    sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", text)
    tokens = []
    current_token = ""
    for sentence in sentences:
        if len(current_token) + len(sentence) <= max_chars:
            current_token += sentence + ' '
        else:
            # When the very first sentence is already over the limit there is
            # nothing pending yet; skip the flush rather than emit "".
            if current_token.strip():
                tokens.append(current_token.strip())
            current_token = sentence + ' '
    # The trailing separator makes current_token truthy even for empty input,
    # so test the stripped content.
    if current_token.strip():
        tokens.append(current_token.strip())
    return tokens

def save_tokens_to_file(tokens, file_path):
    """Write chunks to a single UTF-8 text file, one chunk per line.

    The chunks are joined with ``"\\n"`` (no trailing newline). An existing
    file at ``file_path`` is overwritten.

    Args:
        tokens: Iterable of chunk strings.
        file_path: Destination file. Missing parent directories are created.
    """
    parent_dir = os.path.dirname(file_path)
    # os.makedirs('') raises FileNotFoundError, so only create a directory
    # when the path actually names one (a bare file name has none).
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write('\n'.join(tokens))

def main():
    """Interactively convert a PDF into text chunks saved on disk.

    Prompts for a PDF location (a local path, or a URL starting with
    ``http://`` / ``https://``), extracts and chunks its text, prompts for an
    output file name, and writes the chunks into the ``knowledge_base`` folder.
    """
    pdf_source = input("Enter the path to the local PDF file or the URL of the PDF: ")

    # Only an explicit HTTP(S) scheme selects the download path.
    url_schemes = ('http://', 'https://')
    if not pdf_source.startswith(url_schemes):
        raw_text = read_pdf_from_local(pdf_source)
    else:
        raw_text = read_pdf_from_url(pdf_source)

    chunks = create_tokens(raw_text)

    # The output name is requested only after the PDF has been processed.
    file_name = input("Enter the desired file name (including extension, e.g., all_tokens.txt): ")

    output_path = os.path.join("knowledge_base", file_name)
    save_tokens_to_file(chunks, output_path)

    print(f"Tokens saved to the knowledge_base folder in the file: {file_name}")

# Start the interactive workflow when this code runs as the top-level module
# (importing it from elsewhere only defines the functions).
if __name__ == "__main__":
    main()


Tokens saved to the knowledge_base folder in the file: AI
