In [1]:
# Źródło funkcji: https://github.com/GymMachineLearning/ml_model_alt/blob/develop/notebooks/nlp.ipynb

In [2]:
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt
import json

from whoosh import index
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
from transformers import pipeline

# Wczytywanie pliku pdf

In [3]:
# Schema with a single "content" text field whose value is stored in the
# index, so it can be read back from search hits.
schema = Schema(content=TEXT(stored=True))
index_dir = "indexdir"

# index.create_in() needs the target directory to exist already; create it
# first so this cell also works on a fresh checkout.
os.makedirs(index_dir, exist_ok=True)
# NOTE(review): create_in() starts a new index in the directory -- presumably
# replacing one left by an earlier run; confirm against the Whoosh docs.
ix = index.create_in(index_dir, schema)

# NOTE(review): per the Whoosh docs an open writer holds the index write lock
# until commit() or cancel() is called; nothing visible here does either for
# this writer -- confirm it is still needed.
writer = ix.writer()

In [4]:
def convert(writer, fname, pages=None):
    """Extract the text of a PDF file and return it as one string.

    Args:
        writer: Not used by this function; kept so existing calls that pass
            it keep working.
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers forwarded to
            ``PDFPage.get_pages``. A falsy value results in an empty set
            being passed.
            NOTE(review): pdfminer appears to treat an empty set as "all
            pages" and the numbers as zero-based indices -- confirm against
            the installed version.

    Returns:
        The text of every processed page, concatenated in processing order.
    """
    pagenums = set(pages) if pages else set()

    # StringIO is a context manager, so the buffer is closed on every path.
    # The original wrote `output.close` without parentheses, which never
    # actually closed it.
    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # The file is closed even if a page fails to parse.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer before the enclosing `with` closes it.
        return output.getvalue()

In [6]:
# Path of the PDF to process
filename = "Starting_Strength"
pdf_file = "../data/pdf/{}.pdf".format(filename)

In [19]:
# Build the schema and obtain the index
schema = Schema(content=TEXT(stored=True))
index_dir = "index"

# Create the index directory when it is not there yet
if not os.path.exists(index_dir):
    os.mkdir(index_dir)

# Open the index if the directory already holds one, otherwise create it
ix = (
    index.open_dir(index_dir)
    if index.exists_in(index_dir)
    else index.create_in(index_dir, schema)
)

# Add documents to the index
def add_documents(writer, documents):
    """Add each text in ``documents`` as one index document, then commit.

    Args:
        writer: Index writer exposing ``add_document(content=...)``,
            ``commit()`` and ``cancel()``.
        documents: Iterable of text values; each becomes the ``content``
            field of its own document.

    Raises:
        Exception: Whatever ``writer.add_document`` raises. The writer is
            cancelled first so it is not left open and uncommitted.
    """
    try:
        for content in documents:
            writer.add_document(content=content)
    except Exception:
        # Abandon the partial batch instead of leaving the writer dangling.
        writer.cancel()
        raise
    writer.commit()

# Convert a PDF to text page by page and add the pages to the index
def convert_and_index(writer, fname, pages=None):
    """Extract text from a PDF one page at a time and index every page.

    Args:
        writer: Index writer; handed to ``add_documents``, which commits it.
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers forwarded to
            ``PDFPage.get_pages``. A falsy value results in an empty set
            being passed.
            NOTE(review): pdfminer appears to treat an empty set as "all
            pages" and the numbers as zero-based indices -- confirm against
            the installed version.

    Returns:
        A list with the text of each processed page, in processing order.

    Raises:
        Exception: Whatever opening or parsing the PDF raises. The writer is
            cancelled first so it is not left open and uncommitted.
    """
    pagenums = set(pages) if pages else set()

    manager = PDFResourceManager()
    laparams = LAParams()
    output_text = []  # one entry per processed page

    try:
        # The context managers close the file and each buffer on every path.
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                # A fresh buffer and converter per page keeps each list entry
                # limited to that page's text.
                with StringIO() as output:
                    converter = TextConverter(manager, output, laparams=laparams)
                    try:
                        interpreter = PDFPageInterpreter(manager, converter)
                        interpreter.process_page(page)
                        output_text.append(output.getvalue())
                    finally:
                        converter.close()
    except Exception:
        # Nothing was indexed; release the writer before propagating.
        writer.cancel()
        raise

    # Add the extracted pages to the index (add_documents also commits)
    add_documents(writer, output_text)
    return output_text

# Run the conversion and indexing
writer = ix.writer()
text_pages = convert_and_index(writer=writer, fname=pdf_file, pages=[1, 2, 3])


In [18]:
# Parse the query against the "content" field, then print the stored text
# of every returned hit.
query = QueryParser("content", ix.schema).parse("The Squat")
with ix.searcher() as searcher:
    results = searcher.search(query)
    for hit in results:
        print(hit["content"])

Contents 

Strength: Why and How .............................................................................................................. 2 

The Squat ................................................................................................................................ 8 

The Bench Press ......................................................................................................................... 66 

The Deadlift ............................................................................................................................. 104 

The Press .................................................................................................................................. 148 

The Power Clean ......................................................................................................... 168 

Useful Assistance Exercises ....................................................................................................... 208 

Programm

# Zapisz wczytaną książkę 

In [32]:
# Preview the entries at list positions 1 and 2 of the extracted pages
text_pages[1:3]

['2nd Edition (revised) \n\nCopyright © 2007 by Mark Rippetoe and Lon Kilgore \n\nEditor - Stef Bradford, Ph.D. \n\nAll rights reserved No pan of this publication may be reproduced, stored in a retrieval system or \ntransmitted in a form by means, electronic, mechanical, photocopied, recorded, or otherwise without the \npoor written  consent of the publisher. The  authors  and publisher disclaim  any responsibility  for  any \nadverse effects or consequences  from  the misapplication  or injudicious use of the information presented \nin this text. \n\nISBN  0-9768054-2-1 \n\nPrinted in  the United States of America  10  9  8  7 \n\nThe Aasgaard  Company \n\n3118 Buchanan, Wichita Falls, TX 76308, USA \n\nii \n\n\x0c',
 'Contents \n\nStrength: Why and How .............................................................................................................. 2 \n\nThe Squat ............................................................................................................

In [28]:
# Destination of the JSON dump for the extracted pages
pdf_output_path = "../data/decoded_pdfs/{}.json".format(filename)

In [31]:
# Build a dict keyed "page-1", "page-2", ... by position in text_pages
pages_dict = {f"page-{number}": text for number, text in enumerate(text_pages, start=1)}

# open(..., "w") does not create missing parent directories, so make sure the
# output directory exists before writing.
output_dir = os.path.dirname(pdf_output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the dict to the JSON file
with open(pdf_output_path, "w", encoding="utf-8") as file:
    json.dump(pages_dict, file, ensure_ascii=False, indent=4)

# Odczyt pliku

In [34]:
# Load the pages back from the JSON file
with open(pdf_output_path, encoding="utf-8") as file:
    pages_dict = json.loads(file.read())

# Look up one page by key; fall back to a message when the key is absent
page_number = "page-2"
print(pages_dict[page_number] if page_number in pages_dict else "Strona nie istnieje")

2nd Edition (revised) 

Copyright © 2007 by Mark Rippetoe and Lon Kilgore 

Editor - Stef Bradford, Ph.D. 

All rights reserved No pan of this publication may be reproduced, stored in a retrieval system or 
transmitted in a form by means, electronic, mechanical, photocopied, recorded, or otherwise without the 
poor written  consent of the publisher. The  authors  and publisher disclaim  any responsibility  for  any 
adverse effects or consequences  from  the misapplication  or injudicious use of the information presented 
in this text. 

ISBN  0-9768054-2-1 

Printed in  the United States of America  10  9  8  7 

The Aasgaard  Company 

3118 Buchanan, Wichita Falls, TX 76308, USA 

ii 


