In [1]:
import pandas as pd
import re
import unicodedata

# Load the CSV file.
# 'utf-8-sig' decodes ordinary UTF-8 and additionally consumes a leading byte
# order mark (BOM) when one is present, so the BOM cannot leak into the first
# column name. For files without a BOM it behaves exactly like 'utf-8'.
file_path = 'Dataset.csv'
df = pd.read_csv(file_path, encoding='utf-8-sig')

# Initial Cleaning: Fix Column Names (Remove Byte Order Mark/BOM)
# '\ufeff' is the BOM as a correctly decoded character; 'ï»¿' is what the same
# three bytes look like when mis-decoded with a single-byte encoding. Strip
# both forms as a safety net, then trim surrounding whitespace.
df.columns = (
    df.columns
    .str.replace('\ufeff', '', regex=False)
    .str.replace('ï»¿', '', regex=False)
    .str.strip()
)

# Defining the Robust Cleaning Function
def clean_non_latin_and_normalize(text):
    """
    Reduce a value to plain ASCII text restricted to a small character whitelist.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    untouched. Every other value is converted to ``str``, decomposed with
    Unicode NFD so accents become separate combining marks, stripped of all
    non-ASCII characters, filtered down to letters, digits, whitespace and
    the punctuation ``. , - ( ) ' / & @ ! # % * ?``, and finally trimmed of
    leading/trailing whitespace.
    """
    if not pd.isna(text):
        # NFD splits e.g. "e with acute" into "e" + combining accent, so the
        # ASCII encode below drops only the accent and keeps the base letter.
        decomposed = unicodedata.normalize('NFD', str(text))

        # Characters with no ASCII representation are silently discarded.
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        ascii_only = ascii_bytes.decode('utf-8')

        # Anything outside the whitelist (e.g. ':', ';', '"', '+') is removed.
        disallowed = re.compile(r'[^a-zA-Z0-9\s\.,\-\(\)\'\/&@!#%*?]')
        return disallowed.sub('', ascii_only).strip()

    return text

# Collect the names of every object-dtype column; these are the ones cleaned below
object_cols = list(df.select_dtypes(include='object').columns)

print(f"Applying cleaning to {len(object_cols)} categorical columns: {object_cols}")

# Run each selected column through the cleaning function, value by value
for col in object_cols:
    cleaned_column = df[col].apply(clean_non_latin_and_normalize)
    df[col] = cleaned_column

# Quick visual check of the first rows after cleaning
print("\n--- Verification ---")
print(df.head())

# Write the cleaned frame to disk without the index column
output_file = 'cleaned_dataset.csv'
df.to_csv(path_or_buf=output_file, index=False)
print(f"\nCleaned data saved to {output_file}")

Applying cleaning to 13 categorical columns: ['Restaurant Name', 'City', 'Address', 'Locality', 'Locality Verbose', 'Cuisines', 'Currency', 'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu', 'Rating color', 'Rating text']

--- Verification ---
   Restaurant ID         Restaurant Name  Country Code              City  \
0        6317637        Le Petit Souffle           162       Makati City   
1        6304287        Izakaya Kikufuji           162       Makati City   
2        6300002  Heat - Edsa Shangri-La           162  Mandaluyong City   
3        6318506                    Ooma           162  Mandaluyong City   
4        6314302             Sambo Kojin           162  Mandaluyong City   

                                             Address  \
0  Third Floor, Century City Mall, Kalayaan Avenu...   
1  Little Tokyo, 2277 Chino Roces Avenue, Legaspi...   
2  Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...   
3  Third Floor, Mega Fashion Hall, SM