In [2]:
#!/usr/bin/env python3
"""
Clean food dataset by converting all-caps food names to proper title case
"""

import pandas as pd
import re
import os
from pathlib import Path

def clean_food_name(food_name):
    """
    Convert all-caps food names to proper title case
    Preserve brand names and proper nouns in their original case
    """
    # Skip if food name is not primarily uppercase or already properly formatted
    if not food_name.isupper() or len(food_name) < 3:
        return food_name
    
    # Convert to title case
    cleaned_name = food_name.title()
    
    # Handle special cases and common abbreviations
    replacements = {
        'W/': 'w/',
        'W ': 'w ',
        'Lt ': 'lt ',
        'Lt,': 'lt,',
        'Ckd': 'Cooked',
        'Cnd': 'Canned',
        'Mxd': 'Mixed',
        'Sp': 'Species',
        'Reg': 'Regular',
        'Bev': 'Beverage',
        'Crm': 'Cream',
        'Lemonade': 'Lemonade',
        'Ice Cream': 'Ice Cream',
        '(Alaska Native)': '(Alaska Native)',
        '(Alaskan Ice Cream)': '(Alaskan Ice Cream)',
        '(Southwest)': '(Southwest)',
        '(Apache)': '(Apache)',
        'Distilled': 'Distilled',
        'Alcoholic': 'Alcoholic',
        'Proof': 'Proof'
    }
    
    # Apply replacements
    for old, new in replacements.items():
        cleaned_name = cleaned_name.replace(old, new)
    
    # Fix common patterns
    cleaned_name = re.sub(r'\b([A-Za-z]+),([A-Za-z])', r'\1, \2', cleaned_name)  # Add space after comma
    cleaned_name = re.sub(r'\bAnd\b', 'and', cleaned_name)  # Lowercase 'and'
    cleaned_name = re.sub(r'\bOf\b', 'of', cleaned_name)    # Lowercase 'of'
    cleaned_name = re.sub(r'\bThe\b', 'the', cleaned_name)  # Lowercase 'the'
    cleaned_name = re.sub(r'\bWith\b', 'with', cleaned_name)  # Lowercase 'with'
    cleaned_name = re.sub(r'\bFor\b', 'for', cleaned_name)  # Lowercase 'for'
    cleaned_name = re.sub(r'\bIn\b', 'in', cleaned_name)    # Lowercase 'in'
    cleaned_name = re.sub(r'\bOn\b', 'on', cleaned_name)    # Lowercase 'on'
    
    return cleaned_name

def clean_dataset():
    """
    Clean the food dataset by fixing capitalization in food names
    """
    # File paths - using absolute paths
    input_file = r"D:\Code\Lychee\lychee-meal-planners\systems\dataset\product_dataset\food_dataset_with_ids.csv"
    output_file = r"D:\Code\Lychee\lychee-meal-planners\systems\dataset\product_dataset\food_dataset_with_ids_cleaned.csv"
    backup_file = r"D:\Code\Lychee\lychee-meal-planners\systems\dataset\product_dataset\food_dataset_with_ids_backup.csv"
    
    print("🧹 Starting food dataset cleaning...")
    print(f"📁 Input file: {input_file}")
    print(f"📁 Output file: {output_file}")
    
    try:
        # Read the dataset
        print("📖 Reading dataset...")
        df = pd.read_csv(input_file)
        print(f"📊 Dataset loaded: {len(df)} rows, {len(df.columns)} columns")
        
        # Create backup
        print("💾 Creating backup...")
        df.to_csv(backup_file, index=False)
        print(f"✅ Backup created: {backup_file}")
        
        # Count uppercase food names before cleaning
        uppercase_count = df['food_item'].apply(lambda x: x.isupper()).sum()
        print(f"🔍 Found {uppercase_count} all-caps food names to clean")
        
        # Clean food names
        print("🧽 Cleaning food names...")
        df['food_item'] = df['food_item'].apply(clean_food_name)
        
        # Count changes made
        df_check = pd.read_csv(input_file)
        changes_made = (df_check['food_item'] != df['food_item']).sum()
        print(f"✨ Successfully cleaned {changes_made} food names")
        
        # Save cleaned dataset
        print("💾 Saving cleaned dataset...")
        df.to_csv(output_file, index=False)
        print(f"✅ Cleaned dataset saved: {output_file}")
        
        # Show sample of changes
        print("\n📋 Sample of cleaned food names:")
        print("=" * 80)
        df_original = pd.read_csv(input_file)
        sample_changes = []
        
        for i in range(min(20, len(df))):
            original = df_original.iloc[i]['food_item']
            cleaned = df.iloc[i]['food_item']
            if original != cleaned:
                sample_changes.append((original, cleaned))
        
        for i, (original, cleaned) in enumerate(sample_changes[:10]):
            print(f"{i+1:2d}. Original: {original}")
            print(f"    Cleaned:  {cleaned}")
            print()
        
        if len(sample_changes) > 10:
            print(f"... and {len(sample_changes) - 10} more changes")
        
        print(f"\n🎉 Dataset cleaning completed successfully!")
        print(f"📊 Total changes made: {changes_made}")
        print(f"📁 Files created:")
        print(f"   • Cleaned dataset: {output_file}")
        print(f"   • Backup: {backup_file}")
        
    except FileNotFoundError:
        print(f"❌ Error: File '{input_file}' not found!")
        print("Make sure the file path is correct.")
    except Exception as e:
        print(f"❌ Error during cleaning: {str(e)}")
        
if __name__ == "__main__":
    clean_dataset()


🧹 Starting food dataset cleaning...
📁 Input file: D:\Code\Lychee\lychee-meal-planners\systems\dataset\product_dataset\food_dataset_with_ids.csv
📁 Output file: D:\Code\Lychee\lychee-meal-planners\systems\dataset\product_dataset\food_dataset_with_ids_cleaned.csv
📖 Reading dataset...
📊 Dataset loaded: 8681 rows, 10 columns
💾 Creating backup...
✅ Backup created: D:\Code\Lychee\lychee-meal-planners\systems\dataset\product_dataset\food_dataset_with_ids_backup.csv
🔍 Found 7525 all-caps food names to clean
🧽 Cleaning food names...
✨ Successfully cleaned 7525 food names
💾 Saving cleaned dataset...
✅ Cleaned dataset saved: D:\Code\Lychee\lychee-meal-planners\systems\dataset\product_dataset\food_dataset_with_ids_cleaned.csv

📋 Sample of cleaned food names:
 1. Original: ABALONE,MIXED SPECIES,RAW
    Cleaned:  Abalone, Mixed Speciesecies, Raw

 2. Original: ABALONE,MXD SP,CKD,FRIED
    Cleaned:  Abalone, Mixed Species, Cooked,Fried

 3. Original: ABIYUCH,RAW
    Cleaned:  Abiyuch, Raw

 4. Origina