# In[0]:
import os
import re
import unicodedata

from docx import Document  # reading docx
from PyPDF2 import PdfReader  # reading pdf

def clean_text(text: str, *, keep_newlines: bool = False) -> str:
    """Normalize extracted document text.

    Steps, in order: Unicode NFKC normalization, lower-casing, rejoining of
    words hyphenated across a line break, whitespace collapsing, and
    stripping of leading/trailing whitespace.

    Args:
        text: Raw text to clean.
        keep_newlines: When False (the default), every whitespace run,
            including line breaks, becomes a single space, so the result is
            one line. When True, each whitespace run that contains a line
            break becomes a single ``"\n"`` and all other whitespace runs
            become a single space.

    Returns:
        The cleaned text.
    """
    text = unicodedata.normalize("NFKC", text)  # e.g. ligatures -> plain letters
    text = text.lower()

    # Rejoin words split by a hyphen at a line break (common in PDFs).
    # The match must contain an actual line break; otherwise suspended
    # hyphens such as "pre- and post-" would be glued into "preand".
    text = re.sub(r'(\w)-[^\S\r\n]*[\r\n]+\s*(\w)', r'\1\2', text)

    if keep_newlines:
        # Whitespace runs containing a line break collapse to one newline ...
        text = re.sub(r'\s*[\r\n]\s*', '\n', text)
        # ... and the remaining whitespace runs collapse to one space.
        text = re.sub(r'[^\S\n]+', ' ', text)
    else:
        text = re.sub(r'\s+', ' ', text)

    return text.strip()

def extract_text(file_path: "str | os.PathLike[str]") -> str:
    """Read a PDF, DOCX or TXT file and return its cleaned text.

    The format is chosen from the file extension, case-insensitively.

    * ``pdf``  - the text of every page, joined with newlines.
    * ``docx`` - the text of all body paragraphs, followed by the text of
      every table cell (tables are appended after the paragraphs, not
      interleaved in document order).
    * ``txt``  - the file content decoded as UTF-8.

    Args:
        file_path: Path of the file to read (``str`` or path-like object).

    Returns:
        The extracted text after being passed through ``clean_text``.

    Raises:
        ValueError: If the extension is not ``pdf``, ``docx`` or ``txt``.
    """
    path = os.fspath(file_path)  # accept pathlib.Path as well as str

    # splitext only inspects the last path component, so a dot in a
    # directory name or a file without extension is not misread.
    ext = os.path.splitext(path)[1].lower().lstrip('.')

    if ext == 'pdf':
        reader = PdfReader(path)
        # `or ''` guards against a falsy result from page.extract_text()
        # (presumably pages without a text layer - TODO confirm).
        text = '\n'.join(page.extract_text() or '' for page in reader.pages)

    elif ext == 'docx':
        doc = Document(path)
        parts = [p.text for p in doc.paragraphs]
        # NOTE(review): python-docx may yield the same cell more than once
        # for merged cells - confirm whether duplicated text matters here.
        parts.extend(
            cell.text
            for table in doc.tables
            for row in table.rows
            for cell in row.cells
        )
        text = '\n'.join(parts)

    elif ext == 'txt':
        # utf-8-sig reads plain UTF-8 too, but drops a leading BOM that
        # would otherwise survive cleaning as a stray U+FEFF character.
        with open(path, 'r', encoding='utf-8-sig') as f:
            text = f.read()

    else:
        raise ValueError(f"Unsupported file format: {ext or '(no extension)'}")

    return clean_text(text)

# Example: extract a document and preview the first 500 characters.
if __name__ == "__main__":
    # Guarded so that importing this module does not read the sample
    # document; running it as a script or notebook cell still does.
    text = extract_text("/Volumes/ekp_lm/employee_portal/documents/Priručnik zaštite na radu.docx")
    print(text[:500])  # preview only the first 500 characters