In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): a later cell reads 'drive/MyDrive/...' as a relative path, which
# presumably relies on the working directory being /content - confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!apt install libtesseract-dev libleptonica-dev

In [None]:
!pip install tesserocr

In [None]:
!pip install pdfminer.six PyPDF2 pymupdf wrapt_timeout_decorator

# Parse PDF files

In [None]:
from io import StringIO
import re
import os
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
import fitz
from wrapt_timeout_decorator import timeout
from logging import getLogger
from tqdm import tqdm

# Minimum fraction of a word's bounding-box area that must overlap a highlight
# rectangle for the word to count as highlighted (used by `_check_contain`).
_threshold_intersection = 0.3

# No name is passed, so this is the root logger.
logger = getLogger()
def normalize_spaces(text: str) -> str:
    """Collapse soft line wraps and repeated spaces, keeping paragraph breaks.

    A newline that is neither preceded nor followed by another newline is
    replaced by a space; runs of two or more newlines are left untouched.
    Runs of spaces are then squeezed to a single space and the result is
    stripped of leading and trailing whitespace.
    """
    unwrapped = re.sub("(?<!\n)\n(?!\n){1,}", " ", text)
    squeezed = re.sub("[ ]{1,}", " ", unwrapped)
    return squeezed.strip()


def _check_contain(r_word, points):
    """Decide whether a word belongs to one quad of a highlight.

    The quad's bounding rectangle is clipped to the word's rectangle; the
    word counts as contained when the clipped area is at least
    `_threshold_intersection` times the word's own area.

    Args:
        r_word (fitz.Rect): bounding box of a single word.
        points (list): points describing one quad of a highlight.

    Returns:
        bool: True when the overlap is large enough, False otherwise.
    """
    overlap = fitz.Quad(points).rect
    # `intersect` narrows `overlap` in place to the area shared with the word.
    overlap.intersect(r_word)
    required_area = r_word.get_area() * _threshold_intersection
    return overlap.get_area() >= required_area


def _extract_annot(annot, words_on_page):
    """Extract words in a given highlight.

    Args:
        annot (fitz.Annot): [description]
        words_on_page (list): [description]

    Returns:
        str: words in the entire highlight.
    """

    quad_points = annot.vertices
    quad_count = int(len(quad_points) / 4)
    sentences = ['' for i in range(quad_count)]
    for i in range(quad_count):
        points = quad_points[i * 4: i * 4 + 4]
        words = [
            w for w in words_on_page if
            _check_contain(fitz.Rect(w[:4]), points)
        ]
        sentences[i] = ' '.join(w[4] for w in words)
    sentence = ' '.join(sentences)

    return sentence


@timeout(5, use_signals=False)
def extract_annots(path: str):
    """Collect the text under every annotation of a PDF, page by page.

    Args:
        path: location of the PDF file, opened with `fitz.open`.

    Returns:
        dict: maps a zero-based page index to the list of phrases extracted
            from that page's annotations. Pages that yielded no phrase have
            no entry.

    A failure on one page is logged and does not stop the remaining pages;
    phrases collected on that page before the failure are kept.
    """
    highlights_d = {}
    # The context manager closes the document even if iteration fails.
    with fitz.open(path) as doc:
        for page_num, page in enumerate(doc):
            highlights = []
            try:
                wordlist = page.get_text("words")
                annot = page.first_annot
                while annot:
                    highlights.append(_extract_annot(annot, wordlist))
                    annot = annot.next
            except Exception as ex:
                # WARNING rather than INFO so the failure is visible with the
                # default logging configuration.
                logger.warning(f'Error in extract_annots {ex}')
            if highlights:
                highlights_d[page_num] = highlights
    return highlights_d

@timeout(2, use_signals=False)
def read_pdf_page(page, codec: str):
    """Render a single pdfminer page to text.

    Args:
        page: page object as yielded by `PDFPage.get_pages`.
        codec: codec name forwarded to `TextConverter`.

    Returns:
        str: everything the converter wrote for this page.
    """
    resource_manager = PDFResourceManager()
    text_buffer = StringIO()
    converter = TextConverter(resource_manager,
                              text_buffer,
                              codec=codec,
                              laparams=LAParams())
    PDFPageInterpreter(resource_manager, converter).process_page(page)
    return text_buffer.getvalue()


@timeout(120, use_signals=False)
def convert_pdf_to_txt(path: str, codec: str = "utf-8"):
    """Extract per-page text and annotation phrases from a PDF file.

    Args:
        path: PDF file to read.
        codec: codec name forwarded to `read_pdf_page`.

    Returns:
        dict: page index -> {"text": normalized page text,
            "annots": phrases returned by `extract_annots` for that page}.
            A page whose extraction raised is stored with empty text and an
            empty annotation list. Processing stops after the first page
            whose text starts with "Графическая часть"; that page itself is
            still included in the result.
    """
    with open(path, "rb") as pdf_file:
        page_iter = PDFPage.get_pages(pdf_file,
                                      set(),
                                      maxpages=0,
                                      password="",
                                      caching=True,
                                      check_extractable=True)

        # Annotations are optional: fall back to an empty mapping on failure.
        try:
            annots = extract_annots(path)
        except Exception as ex:
            logger.error(f'Cant get annotations from  {path} because {str(ex)}')
            annots = {}

        parsed_result = {}
        for page_num, page in enumerate(page_iter):
            try:
                text = normalize_spaces(read_pdf_page(page, codec))
                page_record = {"text": text,
                               "annots": annots.get(page_num, [])}
                parsed_result[page_num] = page_record
                if text.startswith("Графическая часть"):
                    # skip the pictures: stop reading further pages
                    logger.warning(f'Skip image page {page_num} from {path}')
                    break
            except Exception as ex:
                # skip malformed pages with images
                logger.error(f'Skip image page {page_num} from {path} becouse {str(ex)}')
                parsed_result[page_num] = {"text": '',
                                           "annots": []}

    return parsed_result

def parse_pdf_bucket(folder: str):
    """Parse every entry of `folder` and group the results under its name.

    Args:
        folder: directory whose entries are passed to `convert_pdf_to_txt`.

    Returns:
        dict: {group_name: {file_name: result}} where `group_name` is the
            last path component of `folder` and `result` is the value
            returned by `convert_pdf_to_txt`, or the string
            'long time parsing' when processing that file raised any
            exception (not only a timeout).
    """
    # normpath drops a trailing separator, so "data/doc1/" still yields
    # "doc1" instead of the empty string that split("/")[-1] would give.
    group_name = os.path.basename(os.path.normpath(folder))
    parsed_files = {}
    documents = {group_name: parsed_files}
    # Sorted so the processing order (and the resulting dict order) is
    # reproducible; os.listdir returns entries in arbitrary order.
    for fn in tqdm(sorted(os.listdir(folder))):
        print('FILE:', fn)
        filename = os.path.join(folder, fn)
        try:
            parsed_files[fn] = convert_pdf_to_txt(filename)
        except Exception as ex:
            logger.error(f'Error while document {filename} processing {str(ex)}')
            # Sentinel value kept unchanged for whatever consumes the result.
            parsed_files[fn] = 'long time parsing'
    return documents

In [None]:
%%time
# Folders to parse; the paths are relative to the current working directory.
folders = ['drive/MyDrive/jetfork_2023_dataset/doc1',
           'drive/MyDrive/jetfork_2023_dataset/doc2',
           'drive/MyDrive/jetfork_2023_dataset/ПД для ИИ'
           ]

# Merge the per-folder results into one dict. parse_pdf_bucket keys its result
# by the folder's last path component, so folders sharing that name would
# overwrite each other here.
all_documents = {}
for folder in folders:
  documents = parse_pdf_bucket(folder)
  all_documents.update(documents)

In [None]:
# Quick check: one key per parsed folder (its last path component).
all_documents.keys()

In [None]:
import joblib

In [None]:
# Persist the parsed documents with joblib.
# NOTE(review): the path is relative, so the file is written to the current
# working directory - confirm it should not go under the mounted Drive instead.
joblib.dump(all_documents, 'all_documents_v3.pkl')