In [72]:
import re
import os
import shutil

In [73]:
# change working directory to the /Users/tildeidunsloth/Desktop/DatSci_25/DatSci_25
os.chdir('/Users/tildeidunsloth/Desktop/DatSci_25/DatSci25')
# Set the path to the folder containing the transcript files
folder_path = 'data/txt'

In [None]:
def clean_text_formatting(text):
    """
    Removes ONLY:
    - Dates like "Tirsdag den 2. april 2024"
    - Page numbers like "(L 123) 5" 
    - Times like "Kl. 14.13"
    - Fix minister titles with hyphens
    """
    # 1. Remove dates (strict patterns)
    date_patterns = [
        r'[A-Za-z]+ den \d+\. [A-Za-z]+ \d{4}\b',  # "Tirsdag den 2. april 2024"
        r'[A-Za-z]+, \d+\. [A-Za-z]+ \d{4}\b',     # "Tirsdag, 2. april 2024"
        r'\d+\. [A-Za-z]+ \d{4}\b',                # "2. april 2024"
    ]

    # 2. Remove page references
    page_patterns = [
        r'\([A-Z] \d+\) \d+\b',  # "(L 123) 5"
        r'\([A-Z] \d+\)\b',      # "(L 123)"
    ]
    
    # 3. Remove times like "Kl. 14.13", "13:06", "kl 13:06", or "kl. 13.06"
    time_patterns = [
        r'\b[Kk]l\.?\s?\d{1,2}[:\.]\d{2}\b',  # Matches "Kl. 14.13", "kl 13:06", "kl. 13.06"
        r'\b\d{1,2}[:\.]\d{2}\b'               # Matches "13:06" or "13.06"
    ]

    # Apply removal patterns
    for pattern in date_patterns + page_patterns + time_patterns:
        text = re.sub(pattern, '', text)

    # 4. Fix hyphenated word splits, but keep "- og"
    text = re.sub(r'(\w+)\s*-\s*(?!og\b)(\w+)', r'\1\2', text)
    
    def normalize_minister_title(match):
        # Split and capitalize the words before "og"
        parts = re.split(r'-\s*,?\s*', match.group(1))
        parts = [p.capitalize() for p in parts if p]

        # Capitalize the word after "og"
        last = match.group(2).capitalize()
        name_and_colon = match.group(3)

        # Recombine all with capitalized "Og"
        title = ' '.join(parts) + ' Og ' + last
        return f"{title}{name_and_colon}"

    text = re.sub(
        r'\b([\wæøåÆØÅ\-]+(?:-\s*,?\s*[\wæøåÆØÅ\-]+)*)\s*-\s*og\s+([\wæøåÆØÅ\-]+)(\s*\([^)]+\):)',
        normalize_minister_title,
        text
    )

    # 6. Clean up formatting artifacts
    text = re.sub(r'\s+', ' ', text)  # Collapse multiple spaces
    text = re.sub(r'\s([,\.])', r'\1', text)  # Fix punctuation spacing
    
    return text.strip()


def clean_all_txt_files(directory_path, output_directory=None):
    """
    Apply clean_text_formatting to all .txt files in a directory.

    Args:
        directory_path (str): Path to directory with .txt files.
        output_directory (str, optional): Where to save cleaned files. If None, overwrites input files.
    """
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    for filename in os.listdir(directory_path):
        if filename.endswith('.txt'):
            input_path = os.path.join(directory_path, filename)
            with open(input_path, 'r', encoding='utf-8') as f:
                raw_text = f.read()
            cleaned_text = clean_text_formatting(raw_text)
            
            output_path = (
                os.path.join(output_directory, filename)
                if output_directory else input_path
            )
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(cleaned_text)

In [75]:
def clean_single_txt_file(file_path):
    """
    Apply clean_text_formatting to a single .txt file.

    Args:
        file_path (str): Path to the .txt file.
        
    Returns:
        str: Cleaned text from the file.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        raw_text = f.read()
    
    cleaned_text = clean_text_formatting(raw_text)
    
    return cleaned_text

In [76]:
# test
test = '''kl 13:05 For desværre finder vi 6 Tirsdag den 2. april 2024 (L 123) rester af pesticider i halvdelen af de danske drikkevandsboringer.. Kl. 13:01 Forhandling Formanden'''
print(clean_text_formatting(test))

test2 = '''hyppig- hed og Miljø- og sundhedsministeren (Lise): men jeg ved ikke med miljø- og sundhedsministeren. Klima-, energi- og Bygningsministeren (Name):'''
print(clean_text_formatting(test2))

For desværre finder vi 6 (L 123) rester af pesticider i halvdelen af de danske drikkevandsboringer.. Forhandling Formanden
hyppighed og Miljø Og Sundhedsministeren (Lise): men jeg ved ikke med miljø- og sundhedsministeren. Klima Energi Og Bygningsministeren (Name):


In [77]:
clean_all_txt_files("data/txt", 'data/cleaned')

In [78]:
# change file structure
def sort_txt_files_by_year(input_folder):
    """
    Sorts .txt files into subfolders based on the 4-digit year in the filename.
    
    Args:
        input_folder (str): Path to the folder containing the .txt files.
    """
    for filename in os.listdir(input_folder):
        if filename.lower().endswith('.txt'):
            try:
                year = filename[0:4]  # take first 4 chars (year)
                if year.isdigit():
                    year_folder = os.path.join(input_folder, year)
                    os.makedirs(year_folder, exist_ok=True)

                    old_path = os.path.join(input_folder, filename)
                    new_path = os.path.join(year_folder, filename)

                    shutil.move(old_path, new_path)
                    print(f"Moved {filename} to {year_folder}")
                else:
                    print(f"⚠️ Year not valid in: {filename}")
            except IndexError:
                print(f"⚠️ Filename too short: {filename}")

    print("\n✅ All files sorted by year!")


In [79]:
sort_txt_files_by_year('data/cleaned')

Moved 20231_m71.txt to data/cleaned/2023
Moved 20012_m15.txt to data/cleaned/2001
Moved 20231_m65.txt to data/cleaned/2023
Moved 20141_m34.txt to data/cleaned/2014
Moved 20161_m98.txt to data/cleaned/2016
Moved 20191_m106.txt to data/cleaned/2019
Moved 20151_m20.txt to data/cleaned/2015
Moved 20151_m34.txt to data/cleaned/2015
Moved 20171_m98.txt to data/cleaned/2017
Moved 20012_m29.txt to data/cleaned/2001
Moved 20141_m20.txt to data/cleaned/2014
Moved 20231_m59.txt to data/cleaned/2023
Moved 20191_m112.txt to data/cleaned/2019
Moved 20081_m19.txt to data/cleaned/2008
Moved 20161_m67.txt to data/cleaned/2016
Moved 20171_m73.txt to data/cleaned/2017
Moved 20171_m67.txt to data/cleaned/2017
Moved 20241_m2.txt to data/cleaned/2024
Moved 20091_m19.txt to data/cleaned/2009
Moved 20161_m73.txt to data/cleaned/2016
Moved 20211_m22.txt to data/cleaned/2021
Moved 20081_m25.txt to data/cleaned/2008
Moved 20201_m36.txt to data/cleaned/2020
Moved 20091_m31.txt to data/cleaned/2009
Moved 20091_m25