In [1]:
# Źródło funkcji: https://github.com/GymMachineLearning/ml_model_alt/blob/develop/notebooks/nlp.ipynb

In [2]:
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt
import json

from whoosh import index
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
from transformers import pipeline

# Wczytywanie pliku pdf

In [3]:
# Schema with a single stored full-text field named `content`
schema = Schema(content=TEXT(stored=True))
index_dir = "indexdir"

# create_in() writes its files into an existing directory, so make sure the
# directory is there before creating the index
os.makedirs(index_dir, exist_ok=True)
ix = index.create_in(index_dir, schema)


# Open a writer on the freshly created index
writer = ix.writer()

In [4]:
def convert(writer, fname, pages=None):
    """Extract the text of a PDF file as one string.

    Args:
        writer: Not used by this function; kept so existing calls keep working.
        fname: Path of the PDF file to read.
        pages: Optional iterable of zero-based page indices handed to
            ``PDFPage.get_pages``. ``None`` or an empty value results in an
            empty set, which pdfminer treats as "all pages".

    Returns:
        str: The text written by the converter for all processed pages.

    Raises:
        OSError: If ``fname`` cannot be opened.
    """
    pagenums = set(pages) if pages else set()

    # The buffer is closed by the context manager even when a page fails
    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, converter)

        try:
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()

        # Read the text while the buffer is still open
        return output.getvalue()

In [5]:
# Base name (without extension) and relative path of the PDF to process
filename = "Starting_Strength"
pdf_file = f"../data/pdf/{filename}.pdf"

In [7]:
# Index layout: a single stored TEXT field named `content`
schema = Schema(content=TEXT(stored=True))
index_dir = "index"

# Make sure the index directory is present before using it
if not os.path.exists(index_dir):
    os.mkdir(index_dir)

# Reuse an index already present in the directory; otherwise create a new one
ix = (
    index.open_dir(index_dir)
    if index.exists_in(index_dir)
    else index.create_in(index_dir, schema)
)

def add_documents(writer, documents):
    """Add every text in ``documents`` to the index and commit the writer.

    Args:
        writer: Index writer providing ``add_document``, ``commit`` and
            ``cancel``.
        documents: Iterable of strings; each one is stored as the ``content``
            field of a separate document.

    Raises:
        Whatever ``writer.add_document`` raises. In that case the writer is
        cancelled first, so nothing is committed and the writer does not stay
        open.
    """
    try:
        for content in documents:
            writer.add_document(content=content)
    except BaseException:
        # Cancel instead of leaving the writer open: an unfinished writer keeps
        # the index write lock and blocks the next ix.writer() call.
        writer.cancel()
        raise
    writer.commit()

def convert_and_index(writer, fname, pages=None):
    """Extract a PDF page by page and add each page's text to the index.

    Args:
        writer: Index writer passed on to ``add_documents``, which adds one
            document per page and commits.
        fname: Path of the PDF file to read.
        pages: Optional iterable of zero-based page indices handed to
            ``PDFPage.get_pages``. ``None`` or an empty value results in an
            empty set, which pdfminer treats as "all pages".

    Returns:
        list[str]: The extracted text of every processed page, in the order
        the pages were read.

    Raises:
        OSError: If ``fname`` cannot be opened. Extraction errors propagate
        before anything has been added to ``writer``.
    """
    pagenums = set(pages) if pages else set()

    manager = PDFResourceManager()
    laparams = LAParams()
    output_text = []  # text of each processed page, one entry per page

    # `with` guarantees the PDF handle is released even when a page fails
    with open(fname, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums):
            # A fresh buffer and converter per page keep the pages' text separate
            with StringIO() as output:
                converter = TextConverter(manager, output, laparams=laparams)
                try:
                    PDFPageInterpreter(manager, converter).process_page(page)
                    output_text.append(output.getvalue())
                finally:
                    converter.close()

    # Index the collected pages only after the whole file was read successfully
    add_documents(writer, output_text)
    return output_text

# Open a writer on the index, then extract the selected PDF pages and index them
writer = ix.writer()
page_selection = list(range(10, 320))
text_pages = convert_and_index(writer, pdf_file, pages=page_selection)


In [8]:
# Parse the phrase against the `content` field and print every returned hit
query_parser = QueryParser("content", ix.schema)
with ix.searcher() as searcher:
    query = query_parser.parse("The Squat")
    results = searcher.search(query)
    for hit in results:
        print(hit['content'])

Useful Assistance Exercises 

Figure  7-12.  The  effect  of tibia  angle  on  hamstring  tightness. 
The more acute the knee,  the  shorter the hamstring;  this is  the 
function of the hamstring in  knee  flexion.  But when  the  knee 
angle  becomes  more  acute  without an  active  hamstring 
contraction - as  it does when  the  knees passively  travel  forward 
on  the  way  to  the  bottom  of the squat -  the  hamstrings  lose  the 
opportunity  to  contribute  to  hip  extension,  because  they  are 
already  shortened. 

So  the  primary  difference  between  the  back  squat 
and  front  squat  is  one  of degree  in  terms  of the  amount  of 
involvement  from  the  contributing  muscle  groups.  But 
the  primary  reason  for  the  difference  is  the  position  in 
which  the  system  is  in  balance  -  the  bar  in  both  cases 
must  be  over  the  middle  of the  foot,  and  the  correct  back 
angle  is the one that keeps it there. 

Learning  the  front  squat  is  

# Zapisz wczytaną książkę 

In [9]:
# Preview the second and third entries of the extracted page list
text_pages[1:3]

["Strength:  Why and How \n\nindividual  biomechanics  of the  human  using  it.  Barbells  permit the  minute  adjustments  during the \nmovement that allow  individual  anthropometry  to  be  expressed. \n\nFurthermore,  barbells  require the  individual  to  make  these  adjustments,  and  any  other ones \nthat might  be  necessary  to  retain  control  over the  movement  of the weight.  This  aspect of exercise \ncannot  be  overstated  -  the  control  of the  bar,  and  the  balance  and  coordination  demanded  of the \ntrainee,  are  unique  to  barbell  exercise  and  completely  absent  in  machine-based  training.  Since \nevery  aspect  of  the  movement  of  the  load  is  controlled  by  the  trainee,  every  aspect  of  that \nmovement is  being trained. \n\nThere  are  other  benefits  as  well.  All  of the  exercises  described  in  this  book involve varying \ndegrees  of skeletal  loading.  After  all,  the  bones  are  what  ultimately  support  the  weight  on  

In [10]:
# Path of the JSON file for this PDF, named after `filename`
pdf_output_path = f"../data/decoded_pdfs/{filename}.json"

In [11]:
# Build a dictionary keyed "page-1", "page-2", ... by position in `text_pages`.
# The number is the position within the extracted list, which need not equal
# the page number in the source PDF.
pages_dict = {f"page-{i}": text for i, text in enumerate(text_pages, start=1)}

# Create the output directory if it is missing, so opening the file for
# writing does not fail on a fresh checkout
os.makedirs(os.path.dirname(pdf_output_path) or ".", exist_ok=True)

# Write the dictionary as UTF-8 JSON, keeping non-ASCII characters unescaped
with open(pdf_output_path, "w", encoding="utf-8") as file:
    json.dump(pages_dict, file, ensure_ascii=False, indent=4)

# Odczyt pliku

In [12]:
# Load the page dictionary back from the JSON file
with open(pdf_output_path, "r", encoding="utf-8") as file:
    pages_dict = json.loads(file.read())

# Print one page by key, or the fallback message when the key is absent
page_number = "page-2"
page_text = pages_dict.get(page_number, "Strona nie istnieje")
print(page_text)

Strength:  Why and How 

individual  biomechanics  of the  human  using  it.  Barbells  permit the  minute  adjustments  during the 
movement that allow  individual  anthropometry  to  be  expressed. 

Furthermore,  barbells  require the  individual  to  make  these  adjustments,  and  any  other ones 
that might  be  necessary  to  retain  control  over the  movement  of the weight.  This  aspect of exercise 
cannot  be  overstated  -  the  control  of the  bar,  and  the  balance  and  coordination  demanded  of the 
trainee,  are  unique  to  barbell  exercise  and  completely  absent  in  machine-based  training.  Since 
every  aspect  of  the  movement  of  the  load  is  controlled  by  the  trainee,  every  aspect  of  that 
movement is  being trained. 

There  are  other  benefits  as  well.  All  of the  exercises  described  in  this  book involve varying 
degrees  of skeletal  loading.  After  all,  the  bones  are  what  ultimately  support  the  weight  on  the  bar. 
Bone