# In [None]:
import json
import re
from tqdm import tqdm

# Patterns used by clean_content, compiled once at import time.

# Numeric reference markers such as [1], [23].
_NUMERIC_REF_RE = re.compile(r'\[\d+\]')

# Bracketed notes containing "rujukan" (e.g. "[perlu rujukan]"). The negated
# character classes keep the match inside a single bracket pair, so an
# unrelated "[" earlier on the line cannot pair up with a "]" that happens to
# follow the word "rujukan" and swallow the prose in between.
_RUJUKAN_RE = re.compile(r'\[[^\[\]]*rujukan[^\[\]]*\]', re.IGNORECASE)

_CITATION_NEEDED_RE = re.compile(r'\[citation needed\]', re.IGNORECASE)

# Two or more consecutive DMS components, e.g. 1°27′27.0″N 103°45′08.1″E.
# The optional "(?:\.\d+)?" sits BEFORE the seconds mark so that decimal
# seconds written as 27.0″ are recognised; the trailing "\d*\.?\d*" is kept
# so inputs where the fraction follows the mark still match.
_DMS_COORD_RE = re.compile(
    r'(\d{1,3}°\d{1,3}(?:′|\'|’)?\d{0,3}(?:\.\d+)?(?:″|")?\d*\.?\d*[NSEW]?\s*){2,}'
)

# Decimal coordinate pairs, e.g. 3.35000°N 101.82000°E or 3.35000; 101.82000.
_DECIMAL_COORD_RE = re.compile(r'\d+\.\d+°?[NSEW]?\s*,?\s*\d+\.\d+°?[NSEW]?')
_DECIMAL_SEMICOLON_COORD_RE = re.compile(r'\d+\.\d+\s*;\s*\d+\.\d+')

# A lowercase ASCII letter immediately followed by an uppercase ASCII letter.
_GLUED_WORDS_RE = re.compile(r'([a-z])([A-Z])')

# Zero-width / directional formatting characters. U+00A0 (no-break space) is
# deliberately NOT listed here: it separates words, so deleting it would glue
# them together. It is matched by "\s" and collapsed to a normal space below.
_INVISIBLE_CHARS_RE = re.compile(r'[\u200e\u200f\u202a-\u202e\u2060]')

_WHITESPACE_RUN_RE = re.compile(r'\s+')
_DOUBLE_PERIOD_RE = re.compile(r'\.\s*\.')
_SPACE_BEFORE_PUNCT_RE = re.compile(r'\s+([.,;])')


def clean_content(text):
    """Strip reference markers, coordinates and stray characters from text.

    Args:
        text: The value to clean. Anything that is not a ``str`` is returned
            unchanged.

    Returns:
        The cleaned string with leading/trailing whitespace removed, or the
        original object when ``text`` is not a ``str``.

    The steps run in a fixed order: reference markers, coordinates, glued-word
    splitting, invisible characters, then whitespace/punctuation tidy-up.
    """
    if not isinstance(text, str):
        return text

    # Remove [1], [2], etc.
    text = _NUMERIC_REF_RE.sub('', text)

    # Remove "citation needed" / "rujukan" bracket notes.
    text = _RUJUKAN_RE.sub('', text)
    text = _CITATION_NEEDED_RE.sub('', text)

    # Remove DMS coordinates like 1°27′27.0″N 103°45′08.1″E
    text = _DMS_COORD_RE.sub('', text)

    # Remove decimal coordinates like 3.35000°N 101.82000°E or 3.35000; 101.82000
    text = _DECIMAL_COORD_RE.sub('', text)
    text = _DECIMAL_SEMICOLON_COORD_RE.sub('', text)

    # Fix spacing issues (e.g. JohorBahru → Johor Bahru).
    # NOTE: this is a heuristic; it also splits intentional mixed-case words
    # such as "McDonald".
    text = _GLUED_WORDS_RE.sub(r'\1 \2', text)

    # Remove invisible Unicode formatting characters.
    text = _INVISIBLE_CHARS_RE.sub('', text)

    # Normalize whitespace (including no-break spaces) and punctuation.
    text = _WHITESPACE_RUN_RE.sub(' ', text)
    text = _DOUBLE_PERIOD_RE.sub('.', text)
    text = _SPACE_BEFORE_PUNCT_RE.sub(r'\1', text)

    return text.strip()

# Read the raw tourism records from disk.
with open('tourism_original.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Fields that are dropped from every record that is kept.
remove_keys = ['other_names', 'en_url', 'ms_url', 'official_website',
               'coordinates', 'location', 'area', 'daerah']

# Records that pass the English-content check, in input order.
cleaned_data = []

for item in tqdm(data):
    # Keep a record only when its English content is a non-blank string.
    english = item.get("en_content")
    if not (isinstance(english, str) and english.strip()):
        continue

    # Clean both language versions in place; a missing Malay text becomes "".
    item["ms_content"] = clean_content(item.get("ms_content", ""))
    item["en_content"] = clean_content(english)

    # Drop the unwanted fields; absent keys are ignored.
    for key in remove_keys:
        item.pop(key, None)

    cleaned_data.append(item)

# Write the cleaned records as pretty-printed, non-ASCII-escaped JSON.
with open("tourism_data.json", "w", encoding="utf-8") as f:
    json.dump(cleaned_data, f, ensure_ascii=False, indent=2)

print(f"Cleaned dataset saved with {len(cleaned_data)} entries.")