In [1]:
import fitz
import re
import os

In [2]:
# change working directory to the /Users/tildeidunsloth/Desktop/DatSci_25/DatSci_25
os.chdir('/Users/tildeidunsloth/Desktop/DatSci_25/DatSci25')

In [3]:
def extract_text_from_pdf_fitz(pdf_path):
    doc = fitz.open(pdf_path)
    raw_text = "\n".join(page.get_text() for page in doc)
    doc.close()

    # Fix hyphenated line breaks: "beslut-\nningsforslaget" → "beslutningsforslaget"
    text = re.sub(r'-\n(\w+)', r'\1', raw_text)

    # Convert single line breaks to spaces (but keep paragraph breaks)
    # Detect double line breaks or lines starting with indent (paragraphs)
    lines = text.splitlines()
    new_lines = []
    paragraph = []

    for line in lines:
        stripped = line.strip()
        if not stripped:
            # Empty line → paragraph break
            if paragraph:
                new_lines.append(" ".join(paragraph))
                paragraph = []
        elif re.match(r'^\s', line):
            # Indented line → likely a new paragraph
            if paragraph:
                new_lines.append(" ".join(paragraph))
                paragraph = [stripped]
            else:
                paragraph.append(stripped)
        else:
            paragraph.append(stripped)

    # Append any remaining text as a final paragraph
    if paragraph:
        new_lines.append(" ".join(paragraph))

    # Join paragraphs with two newlines
    cleaned_text = "\n\n".join(new_lines)

    return cleaned_text

def convert_all_pdfs(input_folder, output_folder):
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        if filename.lower().endswith('.pdf'):
            pdf_path = os.path.join(input_folder, filename)
            txt_filename = os.path.splitext(filename)[0] + '.txt'
            txt_path = os.path.join(output_folder, txt_filename)

            print(f"Converting {filename}...")
            try:
                text = extract_text_from_pdf_fitz(pdf_path)
                with open(txt_path, 'w', encoding='utf-8') as f:
                    f.write(text)
            except Exception as e:
                print(f"❌ Failed to process {filename}: {e}")

    print(f"\n✅ All PDFs converted to .txt in: {output_folder}")



In [4]:
input_folder = 'data/folketing_referater'
output_folder = 'data/txt'

convert_all_pdfs(input_folder, output_folder)

Converting 20161_m13.pdf...
Converting 20091_m79.pdf...
Converting 20171_m13.pdf...
Converting 20081_m79.pdf...
Converting 20081_m51.pdf...
Converting 20211_m56.pdf...
Converting 20201_m107.pdf...
Converting 20141_m83.pdf...
Converting 20091_m45.pdf...
Converting 20201_m42.pdf...
Converting 20151_m97.pdf...
Converting 20201_m56.pdf...
Converting 20151_m83.pdf...
Converting 20091_m51.pdf...
Converting 20201_m113.pdf...
Converting 20211_m42.pdf...
Converting 20141_m97.pdf...
Converting 20081_m45.pdf...
Converting 20012_m75.pdf...
Converting 20151_m68.pdf...
Converting 20012_m61.pdf...
Converting 20141_m68.pdf...
Converting 20231_m11.pdf...
Converting 20131_m101.pdf...
Converting 20231_m39.pdf...
Converting 20141_m40.pdf...
Converting 20091_m6.pdf...
Converting 20211_m95.pdf...
Converting 20081_m92.pdf...
Converting 20151_m54.pdf...
Converting 20201_m81.pdf...
Converting 20012_m49.pdf...
Converting 20091_m86.pdf...
Converting 20091_m92.pdf...
Converting 20151_m40.pdf...
Converting 20191_m