# 🧾 Extract Text from Ukrainian Dissertation PDF
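This notebook extracts text from an uploaded PDF document page by page using PyMuPDF (`fitz`), cleans it (whitespace normalization, removal of figure/table caption lines), keeps only pages with more than 50 words, and saves the result to `chunks.txt`, one page per line.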

In [None]:
# Install PyMuPDF, the package that provides the `fitz` module imported below.
!pip install pymupdf

In [None]:
from google.colab import files
# Keep the result of the Colab upload call: a later cell takes its first key
# and uses it as the PDF path (presumably a mapping keyed by uploaded file
# name -- confirm against the google.colab documentation).
uploaded = files.upload()

In [None]:
import fitz  # PyMuPDF
import re

def clean_text(text):
    """Normalize whitespace in extracted text and drop caption lines.

    The steps run in this order:

    1. Collapse runs of horizontal whitespace (spaces, tabs, ...) into a
       single space, leaving line breaks untouched.
    2. Delete everything from a "Рис." (figure) or "Табл." (table) marker
       up to the end of that line.
    3. Collapse each run of line breaks, together with any whitespace
       around it, into a single newline.

    Args:
        text: Raw text, e.g. the text of one PDF page.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # Only horizontal whitespace is collapsed here. Line boundaries must
    # survive until the caption regex has run, otherwise a caption that was
    # merged with the following line would take that line's text with it.
    text = re.sub(r'[^\S\r\n]+', ' ', text)
    # '.' does not match a newline, so removal stops at the end of the line.
    text = re.sub(r'Рис\..*|Табл\..*', '', text)
    # Also removes the empty lines left behind by deleted captions and any
    # spaces that were adjacent to a line break.
    text = re.sub(r'\s*\n\s*', '\n', text)
    return text.strip()

def extract_text_from_pdf(pdf_path, min_words=50):
    """Extract cleaned text from a PDF, one chunk per sufficiently long page.

    Each page's text is passed through ``clean_text``; pages whose cleaned
    text has ``min_words`` or fewer whitespace-separated words are skipped.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        min_words: A page is kept only if its cleaned text contains strictly
            more than this many words. Defaults to 50.

    Returns:
        A list of cleaned page texts, in page order.
    """
    chunks = []
    # The context manager closes the document even if a page fails to parse,
    # so the underlying file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = clean_text(page.get_text())
            if len(text.split()) > min_words:
                chunks.append(text)
    return chunks

# The first key of `uploaded` is used as the PDF path. Fail with a clear
# message instead of a bare StopIteration when nothing was uploaded.
if not uploaded:
    raise ValueError("No file was uploaded; run the upload cell first.")
pdf_path = next(iter(uploaded))
chunks = extract_text_from_pdf(pdf_path)

# Save the chunks to a text file, one chunk per line (line breaks inside a
# chunk are replaced by spaces so that each chunk stays on a single line).
with open("chunks.txt", "w", encoding="utf-8") as f:
    for chunk in chunks:
        f.write(chunk.strip().replace("\n", " ") + "\n")

print(f"Extracted {len(chunks)} cleaned chunks.")
# Preview the first chunk only when there is one; every page may have been
# filtered out (for example, a scanned PDF without a text layer).
if chunks:
    print(chunks[0][:500])