In [2]:
import pandas as pd
import re
import os

# --- 1. Resolve the input file and choose the output file ---
# The candidate folders are probed in order; the first one that contains an
# "input.txt" wins and the remaining folders are not checked.
folders = ["participant_input", "Cuba"]
candidate_paths = (os.path.join(name, "input.txt") for name in folders)
input_path = next(
    (path for path in candidate_paths if os.path.exists(path)),
    None,
)

# Fail early with a clear message rather than letting open() fail later.
if input_path is None:
    raise FileNotFoundError("Could not find input.txt in any of the expected folders")

# The cleaned text is written relative to the current working directory.
output_path = "output.txt"

# --- 2. Load the input file ---
# Read the whole file in one go, then split it on line boundaries so the
# individual entries carry no trailing newline characters.
with open(input_path, "r", encoding="utf-8") as source:
    raw_text = source.read()
lines = raw_text.splitlines()

# One DataFrame row per input line, held in a single "text" column.
df = pd.DataFrame(data=lines, columns=["text"])

# --- 3. Clean text ---
def clean_text(text):
    """Return *text* with "stop" words and basic punctuation removed.

    The substitutions are applied in order:

    1. delete every standalone word "stop", ignoring case;
    2. delete all commas and periods;
    3. collapse each run of whitespace into a single space.

    Leading and trailing whitespace is stripped from the final result.
    """
    # (pattern, replacement, regex flags) -- order matters: whitespace is
    # collapsed last so gaps left by the earlier deletions are squeezed too.
    substitutions = (
        (r"\bstop\b", "", re.IGNORECASE),
        (r"[.,]", "", 0),
        (r"\s+", " ", 0),
    )
    result = text
    for pattern, replacement, flags in substitutions:
        result = re.sub(pattern, replacement, result, flags=flags)
    return result.strip()

# Run the cleaner over every input line and keep the result next to the
# original text.
df["cleaned"] = df["text"].map(clean_text)

# --- 4. Write the cleaned text to output.txt (one line per row) ---
with open(output_path, "w", encoding="utf-8") as sink:
    # Every row becomes exactly one newline-terminated line in the file.
    sink.writelines(cleaned + "\n" for cleaned in df["cleaned"])

print(f"✅ Cleaned text saved to '{output_path}'")

✅ Cleaned text saved to 'output.txt'
