In [2]:
import os
import re
from collections import Counter

folder_path = "./"
cleaned_path = "./covid19_corpus/cleaned/"
os.makedirs(cleaned_path, exist_ok=True)

def clean_text(text):
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove HTML entities like &nbsp; &lt; etc.
    text = re.sub(r'&[a-zA-Z]+;', '', text)
    # Remove repeated separators or banners
    text = re.sub(r'[-_=]{3,}', '', text)
    return text

# Step 1: Collect all lines across files
all_lines = []
file_texts = {}

for file in os.listdir(folder_path):
    if file.endswith(".txt"):
        with open(os.path.join(folder_path, file), "r", encoding="utf-8") as f:
            lines = [clean_text(line) for line in f if len(line.strip()) > 0]
            file_texts[file] = lines
            all_lines.extend(lines)

# Step 2: Identify duplicate lines across files (e.g., headers)
line_counts = Counter(all_lines)
common_lines = {line for line, count in line_counts.items() if count > len(file_texts) * 0.5}

print(f"Detected {len(common_lines)} common lines (likely headers/footers).")

# Step 3: Remove them and save cleaned files
for file, lines in file_texts.items():
    filtered = [line for line in lines if line not in common_lines]
    cleaned_text = "\n".join(filtered)
    with open(os.path.join(cleaned_path, file), "w", encoding="utf-8") as f:
        f.write(cleaned_text)


Detected 601 common lines (likely headers/footers).
