# Dataset Column Header Conversion

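This notebook contains functions that rename the column headers of the meal planning datasets from their original formats (for example, with units in parentheses) to a simplified lowercase format without units. Columns that are not part of the mapping are dropped, each cleaned dataset is saved with a `_cleaned` suffix, and the cleaned files can then be combined into one unified dataset.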

## Original Format:

- `_id, Food_Item, Category, Calories (kcal), Protein (g), Carbohydrates (g), Fat (g), Fiber (g), Sugars (g), Sodium (mg), Cholesterol (mg), Meal_Type, Water_Intake (ml)`

## New Format:

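- `food_item, category, calories, proteins, carbohydrates, fats, fibers, sugars, sodium, cholesterol, meal_type, water_intake` (the `_id` column has no entry in the mapping, so it is dropped together with every other unmapped column)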


In [37]:
import pandas as pd
import os

def clean_dataset_with_mapping(input_file_path, output_file_path=None):
    """
    Rename a dataset's columns to the standard names and drop every other column.

    Target columns: food_item, category, calories, proteins, carbohydrates, fats,
                   fibers, sugars, sodium, cholesterol, meal_type, water_intake

    Parameters
    ----------
    input_file_path : str
        Path of the CSV file to clean.
    output_file_path : str, optional
        Where to write the cleaned CSV. Defaults to ``<input name>_cleaned.csv``
        in the same directory as the input file.

    Returns
    -------
    pandas.DataFrame
        A copy holding only the target columns found in the input, in the
        order listed above.

    Raises
    ------
    ValueError
        If the file cannot be decoded with any of the attempted encodings.

    Notes
    -----
    When several source columns map to the same target name (or the target
    name is already present in the file), the existing/first column wins and
    the later ones are skipped, so the result never has duplicate columns.
    """

    # Canonical headers (units in parentheses) -> standard names.
    # '_id' is deliberately absent, so it is dropped with the other unmapped columns.
    column_mapping = {
        'Food_Item': 'food_item',
        'Category': 'category',
        'Calories (kcal)': 'calories',
        'Protein (g)': 'proteins',
        'Carbohydrates (g)': 'carbohydrates',
        'Fat (g)': 'fats',
        'Fiber (g)': 'fibers',
        'Sugars (g)': 'sugars',
        'Sodium (mg)': 'sodium',
        'Cholesterol (mg)': 'cholesterol',
        'Meal_Type': 'meal_type',
        'Water_Intake (ml)': 'water_intake'
    }

    # Additional mappings for different dataset formats
    extended_mapping = {
        # Food_1.csv and Food_5.csv both call the item column 'name'
        'name': 'food_item',

        # Food_1.csv format
        'calories': 'calories',
        'protein': 'proteins',
        'carbohydrates': 'carbohydrates',
        'fat': 'fats',
        'fiber': 'fibers',
        'sugar': 'sugars',

        # Food_3.csv format
        'Food': 'food_item',
        'Calories-kcl': 'calories',
        'Protein-g': 'proteins',
        'Carb-g': 'carbohydrates',
        'Fiber-g': 'fibers',
        'Sugar-g': 'sugars',
        # NOTE(review): header suggests grams, while the canonical 'Sodium (mg)'
        # column is in milligrams; values are NOT converted here - confirm units.
        'Sodium-g': 'sodium',

        # Food_4.csv format
        'Shrt_Desc': 'food_item',
        'Energ_Kcal': 'calories',
        'Protein_(g)': 'proteins',
        'Carbohydrt_(g)': 'carbohydrates',
        'Lipid_Tot_(g)': 'fats',
        'Fiber_TD_(g)': 'fibers',
        'Sugar_Tot_(g)': 'sugars',
        'Sodium_(mg)': 'sodium',
        'Cholestrl_(mg)': 'cholesterol',

        # Food_5.csv format
        'nutri_energy': 'calories',
        'nutri_protein': 'proteins',
        'nutri_carbohydrate': 'carbohydrates',
        'nutri_fat': 'fats',
        'nutri_fiber': 'fibers',
        'nutri_sugar': 'sugars',
        # NOTE(review): header says salt, not sodium, and no conversion is
        # applied - confirm this column is comparable with 'Sodium (mg)'.
        'nutri_salt': 'sodium'
    }

    # Combine all mappings
    all_mappings = {**column_mapping, **extended_mapping}

    # Target columns we want to keep
    target_columns = list(column_mapping.values())

    # Read the CSV file with encoding handling
    print(f"Reading file: {input_file_path}")

    # Order matters: 'latin-1' maps every possible byte and therefore never
    # fails, so it has to be the last resort. 'cp1252' comes before it so that
    # Windows-encoded text (smart quotes, etc.) is decoded correctly.
    encodings_to_try = ['utf-8', 'cp1252', 'latin-1']
    df = None

    for encoding in encodings_to_try:
        try:
            df = pd.read_csv(input_file_path, encoding=encoding)
        except UnicodeDecodeError:
            continue
        print(f"Successfully read with encoding: {encoding}")
        break

    if df is None:
        raise ValueError(f"Could not read file {input_file_path} with any of the attempted encodings: {encodings_to_try}")

    print(f"Original columns: {list(df.columns)}")
    print(f"Dataset shape: {df.shape}")

    # Build the rename map column by column so that two source columns can
    # never end up with the same target name (which would duplicate columns).
    claimed_targets = {col for col in df.columns if col in target_columns}
    rename_map = {}
    for source_col in df.columns:
        target = all_mappings.get(source_col)
        if target is None or target == source_col:
            continue
        if target in claimed_targets:
            print(f"Skipping column '{source_col}': target '{target}' is already provided by another column")
            continue
        rename_map[source_col] = target
        claimed_targets.add(target)

    # Apply column mapping
    df_renamed = df.rename(columns=rename_map)

    # Keep only the target columns that exist in the dataset
    available_target_cols = [col for col in target_columns if col in df_renamed.columns]
    df_cleaned = df_renamed[available_target_cols].copy()

    print(f"Available target columns: {available_target_cols}")
    print(f"Cleaned dataset shape: {df_cleaned.shape}")

    # Set output file path if not provided
    if output_file_path is None:
        base_name = os.path.splitext(os.path.basename(input_file_path))[0]
        output_dir = os.path.dirname(input_file_path)
        output_file_path = os.path.join(output_dir, f"{base_name}_cleaned.csv")

    # Save the cleaned dataset
    df_cleaned.to_csv(output_file_path, index=False)
    print(f"Cleaned dataset saved to: {output_file_path}")
    print(f"Final columns: {list(df_cleaned.columns)}")
    print("="*50)

    return df_cleaned

def batch_clean_datasets(folder_path):
    """
    Clean all raw CSV files in a folder using the standardized column mapping.

    Files whose name ends with ``_cleaned.csv`` are treated as previous output
    and skipped. Each remaining ``.csv`` file is passed to
    ``clean_dataset_with_mapping``; a failure on one file is reported and does
    not stop the batch.

    Parameters
    ----------
    folder_path : str
        Directory to scan (not recursive).

    Returns
    -------
    dict
        Maps the name of every successfully cleaned file to its cleaned
        DataFrame. The dict is empty when the folder has no raw CSV files, so
        callers can always iterate it or test membership.
    """
    print(f"Starting batch cleaning of datasets in: {folder_path}")
    print("="*60)

    # Sorted so the processing order (and the resulting dict order) does not
    # depend on the file system's listing order.
    csv_files = sorted(
        f for f in os.listdir(folder_path)
        if f.endswith('.csv') and not f.endswith('_cleaned.csv')
    )

    if not csv_files:
        print("No CSV files found in the specified folder.")
        return {}

    cleaned_datasets = {}
    failed_files = []

    for csv_file in csv_files:
        input_path = os.path.join(folder_path, csv_file)
        print(f"\nProcessing: {csv_file}")

        try:
            cleaned_df = clean_dataset_with_mapping(input_path)
            cleaned_datasets[csv_file] = cleaned_df
            print(f"✓ Successfully cleaned {csv_file}")
        except Exception as e:
            # Best effort by design: keep going, but remember the failure so
            # it shows up in the summary instead of scrolling out of view.
            failed_files.append(csv_file)
            print(f"✗ Error cleaning {csv_file}: {str(e)}")

    print("\n" + "="*60)
    print("BATCH CLEANING SUMMARY:")
    print(f"Total files processed: {len(cleaned_datasets)}")
    if failed_files:
        print(f"Files failed: {len(failed_files)} ({', '.join(failed_files)})")

    for filename, df in cleaned_datasets.items():
        print(f"- {filename}: {df.shape[0]} rows, {df.shape[1]} columns")
        print(f"  Columns: {list(df.columns)}")

    return cleaned_datasets

def combine_all_cleaned_datasets(folder_path, output_filename='combined_food_dataset.csv'):
    """
    Combine all cleaned datasets into one unified dataset.

    Every ``*_cleaned.csv`` file in ``folder_path`` is read, tagged with a
    ``source_file`` column (the cleaned file name with ``_cleaned.csv``
    replaced by ``.csv``), concatenated, and written to
    ``folder_path/output_filename``.

    Parameters
    ----------
    folder_path : str
        Directory holding the ``*_cleaned.csv`` files; the combined file is
        written here as well.
    output_filename : str, optional
        File name of the combined CSV.

    Returns
    -------
    pandas.DataFrame or None
        The combined frame with ``source_file`` as the last column, or
        ``None`` when no cleaned file exists or none could be read.

    Notes
    -----
    The combined file lives in the same folder that is scanned. If the batch
    cleaning is re-run, it produces ``<output name>_cleaned.csv`` from the
    previous combined file; that file (and the output file itself) is excluded
    here so a re-run does not duplicate every row.
    """
    import glob

    # Names that are this function's own product and must not be fed back in.
    output_basename = os.path.basename(output_filename)
    own_products = {
        output_basename,
        f"{os.path.splitext(output_basename)[0]}_cleaned.csv",
    }

    # Find all cleaned CSV files (sorted for a deterministic row order)
    candidate_files = sorted(glob.glob(os.path.join(folder_path, '*_cleaned.csv')))
    cleaned_files = [
        path for path in candidate_files
        if os.path.basename(path) not in own_products
    ]
    for skipped in sorted(set(candidate_files) - set(cleaned_files)):
        print(f"Skipping {os.path.basename(skipped)} (derived from a previous combined output)")

    if not cleaned_files:
        print("No cleaned files found. Please run the cleaning process first.")
        return None

    print(f"Found {len(cleaned_files)} cleaned files to combine:")
    for file in cleaned_files:
        print(f"  - {os.path.basename(file)}")

    combined_dfs = []

    # Read and combine all cleaned files
    for file_path in cleaned_files:
        filename = os.path.basename(file_path)
        print(f"\nReading {filename}...")

        try:
            df = pd.read_csv(file_path)
            # Add source file information
            df['source_file'] = filename.replace('_cleaned.csv', '.csv')
            combined_dfs.append(df)
            print(f"  ✓ Added {len(df)} rows from {filename}")
        except Exception as e:
            # Best effort: an unreadable file is reported and left out.
            print(f"  ✗ Error reading {filename}: {str(e)}")

    if not combined_dfs:
        print("No data could be read from cleaned files.")
        return None

    # Combine all dataframes; columns missing from a source are filled with NaN
    print(f"\nCombining {len(combined_dfs)} datasets...")
    combined_df = pd.concat(combined_dfs, ignore_index=True)

    # Reorder columns to put source_file at the end
    cols = [col for col in combined_df.columns if col != 'source_file'] + ['source_file']
    combined_df = combined_df[cols]

    # Save combined dataset
    output_path = os.path.join(folder_path, output_filename)
    combined_df.to_csv(output_path, index=False)

    print(f"\n🎉 COMBINED DATASET CREATED:")
    print(f"  📁 File: {output_path}")
    print(f"  📊 Total rows: {len(combined_df):,}")
    print(f"  📋 Columns: {list(combined_df.columns)}")
    print(f"  🔗 Source files: {combined_df['source_file'].unique()}")

    # Show distribution by source file
    print(f"\n📈 DISTRIBUTION BY SOURCE:")
    source_counts = combined_df['source_file'].value_counts()
    for source, count in source_counts.items():
        print(f"  - {source}: {count:,} rows ({count/len(combined_df)*100:.1f}%)")

    return combined_df

In [38]:
# Clean every dataset in the childs folder and keep the results for later cells
childs_folder_path = '../../dataset/childs/'

# Announce what is about to happen, one banner line per print call
for banner_line in (
    "Starting batch cleaning of all datasets in the childs folder...",
    "This will apply the standardized column mapping and remove any unspecified columns.",
    "\nTarget columns: food_item, category, calories, proteins, carbohydrates, fats, fibers, sugars, sodium, cholesterol, meal_type, water_intake",
    "\n" + "="*80,
):
    print(banner_line)

# Maps each source file name to its cleaned DataFrame
cleaned_datasets = batch_clean_datasets(childs_folder_path)

Starting batch cleaning of all datasets in the childs folder...
This will apply the standardized column mapping and remove any unspecified columns.

Target columns: food_item, category, calories, proteins, carbohydrates, fats, fibers, sugars, sodium, cholesterol, meal_type, water_intake

Starting batch cleaning of datasets in: ../../dataset/childs/

Processing: Food_1.csv
Reading file: ../../dataset/childs/Food_1.csv
Successfully read with encoding: utf-8
Original columns: ['Unnamed: 0', 'name', 'cook_time_minutes', 'country', 'user_ratings', 'description', 'fiber', 'protein', 'fat', 'calories', 'sugar', 'carbohydrates']
Dataset shape: (38, 12)
Available target columns: ['food_item', 'calories', 'proteins', 'carbohydrates', 'fats', 'fibers', 'sugars']
Cleaned dataset shape: (38, 7)
Cleaned dataset saved to: ../../dataset/childs\Food_1_cleaned.csv
Final columns: ['food_item', 'calories', 'proteins', 'carbohydrates', 'fats', 'fibers', 'sugars']
✓ Successfully cleaned Food_1.csv

Processi

In [39]:
# Inspect each cleaned dataset: shape, columns, a preview, dtypes and missing values
if 'cleaned_datasets' not in locals():
    print("No cleaned datasets found. Please run the batch cleaning first.")
else:
    print("DETAILED VERIFICATION OF CLEANED DATASETS:")
    print("="*60)

    for filename, df in cleaned_datasets.items():
        print(f"\n📁 {filename}")
        print(f"Shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")

        # Preview of the first few rows
        print("\nFirst 3 rows:")
        preview = df.head(3)
        print(preview.to_string())

        # One line per column with its dtype
        print("\nData types:")
        for col in df.columns:
            col_dtype = df[col].dtype
            print(f"  {col}: {col_dtype}")

        # Report only the columns that actually contain missing values
        missing_counts = df.isnull().sum()
        columns_with_missing = missing_counts[missing_counts > 0]
        if columns_with_missing.empty:
            print("\n✓ No missing values")
        else:
            print("\nMissing values:")
            for col, count in columns_with_missing.items():
                print(f"  {col}: {count}")

        print("-" * 50)

DETAILED VERIFICATION OF CLEANED DATASETS:

📁 Food_1.csv
Shape: (38, 7)
Columns: ['food_item', 'calories', 'proteins', 'carbohydrates', 'fats', 'fibers', 'sugars']

First 3 rows:
                           food_item  calories  proteins  carbohydrates  fats  fibers  sugars
0           Tomato And Anchovy Pasta       755        24            109    22      19       9
1            Blueberry Cream Muffins       264         4             32    13       0      13
2  One-Pot Lemon Garlic Shrimp Pasta       678        38             49    37       4       2

Data types:
  food_item: object
  calories: int64
  proteins: int64
  carbohydrates: int64
  fats: int64
  fibers: int64
  sugars: int64

✓ No missing values
--------------------------------------------------

📁 Food_2.csv
Shape: (10000, 12)
Columns: ['food_item', 'category', 'calories', 'proteins', 'carbohydrates', 'fats', 'fibers', 'sugars', 'sodium', 'cholesterol', 'meal_type', 'water_intake']

First 3 rows:
        food_item category  c

In [40]:
def convert_to_api_format(df, selected_rows=None):
    """
    Convert dataframe rows to the API response format.

    Only the nutritional fields are returned; every value is a Python float.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned dataset; it may lack some of the nutritional columns.
    selected_rows : list of int, optional
        Positional row indices to convert. All rows are converted when omitted.

    Returns
    -------
    list of dict
        One dict per row with the keys fats, calories, sugars, proteins,
        fibers, sodium, cholesterol and carbohydrates (in that order).

    Notes
    -----
    A missing column and a missing value (NaN/None) are both reported as 0.0,
    so the payload never contains NaN, which is not valid JSON. Values that
    are present but not numeric still raise, exactly as ``float()`` does.
    """
    api_fields = [
        "fats", "calories", "sugars", "proteins",
        "fibers", "sodium", "cholesterol", "carbohydrates",
    ]

    if selected_rows is not None:
        df_subset = df.iloc[selected_rows]
    else:
        df_subset = df

    # reindex adds any absent nutritional column as all-NaN, so absent columns
    # and absent values go through the same NaN -> 0.0 rule below.
    nutrition = df_subset.reindex(columns=api_fields)

    return [
        {
            field: 0.0 if pd.isna(value) else float(value)
            for field, value in zip(api_fields, values)
        }
        for values in nutrition.itertuples(index=False, name=None)
    ]

# Show what the API payload looks like for the first rows of the first cleaned dataset
if 'cleaned_datasets' not in locals() or not cleaned_datasets:
    print("No cleaned datasets available. Please run the batch cleaning first.")
else:
    print("Sample API format output:")
    # The first key of the dict is the first dataset that was cleaned
    first_dataset_name = next(iter(cleaned_datasets))
    first_dataset = cleaned_datasets[first_dataset_name]

    print(f"Using dataset: {first_dataset_name}")

    # Convert at most three rows
    num_rows = min(3, len(first_dataset))
    if num_rows <= 0:
        print("No data available in the dataset.")
    else:
        sample_api_data = convert_to_api_format(first_dataset, list(range(num_rows)))
        for row_number, item in enumerate(sample_api_data, start=1):
            print(f"Row {row_number}: {item}")

Sample API format output:
Using dataset: Food_1.csv
Row 1: {'fats': 22.0, 'calories': 755.0, 'sugars': 9.0, 'proteins': 24.0, 'fibers': 19.0, 'sodium': 0.0, 'cholesterol': 0.0, 'carbohydrates': 109.0}
Row 2: {'fats': 13.0, 'calories': 264.0, 'sugars': 13.0, 'proteins': 4.0, 'fibers': 0.0, 'sodium': 0.0, 'cholesterol': 0.0, 'carbohydrates': 32.0}
Row 3: {'fats': 37.0, 'calories': 678.0, 'sugars': 2.0, 'proteins': 38.0, 'fibers': 4.0, 'sodium': 0.0, 'cholesterol': 0.0, 'carbohydrates': 49.0}


In [41]:
# Check the cleaned files that were saved to disk
import glob

childs_folder = '../../dataset/childs/'
# Sorted so the report is in the same order on every run and platform
cleaned_files = sorted(glob.glob(os.path.join(childs_folder, '*_cleaned.csv')))

print("CLEANED FILES SAVED TO DISK:")
print("="*40)

if cleaned_files:
    for file_path in cleaned_files:
        filename = os.path.basename(file_path)
        file_size = os.path.getsize(file_path)

        # Read the file to check its structure
        df_check = pd.read_csv(file_path)

        print(f"\n📄 {filename}")
        print(f"   Size: {file_size:,} bytes")
        print(f"   Shape: {df_check.shape}")
        print(f"   Columns: {list(df_check.columns)}")

        # Show sample of the data
        if len(df_check) > 0:
            print(f"   Sample row: {df_check.iloc[0].to_dict()}")
else:
    print("No cleaned files found on disk.")

print("\n" + "="*40)
# Only claim success when there is at least one cleaned file to back it up
if cleaned_files:
    print("CLEANING PROCESS COMPLETED SUCCESSFULLY! ✅")
    print("All datasets have been standardized with the specified column mapping.")
    print("Original files remain unchanged. Cleaned versions saved with '_cleaned' suffix.")
else:
    print("CLEANING PROCESS PRODUCED NO OUTPUT FILES ❌")
    print("Run the batch cleaning cell first, then re-run this check.")

CLEANED FILES SAVED TO DISK:

📄 Food_1_cleaned.csv
   Size: 1,839 bytes
   Shape: (38, 7)
   Columns: ['food_item', 'calories', 'proteins', 'carbohydrates', 'fats', 'fibers', 'sugars']
   Sample row: {'food_item': 'Tomato And Anchovy Pasta', 'calories': 755, 'proteins': 24, 'carbohydrates': 109, 'fats': 22, 'fibers': 19, 'sugars': 9}

📄 Food_2_cleaned.csv
   Size: 624,010 bytes
   Shape: (10000, 12)
   Columns: ['food_item', 'category', 'calories', 'proteins', 'carbohydrates', 'fats', 'fibers', 'sugars', 'sodium', 'cholesterol', 'meal_type', 'water_intake']
   Sample row: {'food_item': 'Eggs', 'category': 'Meat', 'calories': 173, 'proteins': 42.4, 'carbohydrates': 83.7, 'fats': 1.5, 'fibers': 1.5, 'sugars': 12.7, 'sodium': 752, 'cholesterol': 125, 'meal_type': 'Lunch', 'water_intake': 478}

📄 Food_3_cleaned.csv
   Size: 23,687 bytes
   Shape: (656, 7)
   Columns: ['food_item', 'calories', 'proteins', 'carbohydrates', 'fibers', 'sugars', 'sodium']
   Sample row: {'food_item': "Andrea'S"

In [42]:
# Run the batch cleaning a second time and confirm that Food_4.csv now goes through
print("RE-RUNNING BATCH CLEANING WITH ENCODING FIX:")
print("="*60)

# Replaces the results of the earlier run
cleaned_datasets = batch_clean_datasets(childs_folder_path)

print("\n" + "="*60)
print("CLEANING VERIFICATION:")

# Food_4.csv is only present in the results when it was cleaned without error
if 'Food_4.csv' not in cleaned_datasets:
    print("❌ Food_4.csv was not processed successfully")
else:
    print("✅ Food_4.csv was successfully cleaned!")
    food4_df = cleaned_datasets['Food_4.csv']
    food4_sample = food4_df.head(2).to_dict('records')
    print(f"   Shape: {food4_df.shape}")
    print(f"   Columns: {list(food4_df.columns)}")
    print(f"   Sample data: {food4_sample}")

RE-RUNNING BATCH CLEANING WITH ENCODING FIX:
Starting batch cleaning of datasets in: ../../dataset/childs/

Processing: Food_1.csv
Reading file: ../../dataset/childs/Food_1.csv
Successfully read with encoding: utf-8
Original columns: ['Unnamed: 0', 'name', 'cook_time_minutes', 'country', 'user_ratings', 'description', 'fiber', 'protein', 'fat', 'calories', 'sugar', 'carbohydrates']
Dataset shape: (38, 12)
Available target columns: ['food_item', 'calories', 'proteins', 'carbohydrates', 'fats', 'fibers', 'sugars']
Cleaned dataset shape: (38, 7)
Cleaned dataset saved to: ../../dataset/childs\Food_1_cleaned.csv
Final columns: ['food_item', 'calories', 'proteins', 'carbohydrates', 'fats', 'fibers', 'sugars']
✓ Successfully cleaned Food_1.csv

Processing: Food_2.csv
Reading file: ../../dataset/childs/Food_2.csv
Successfully read with encoding: utf-8
Original columns: ['Date', 'User_ID', 'Food_Item', 'Category', 'Calories (kcal)', 'Protein (g)', 'Carbohydrates (g)', 'Fat (g)', 'Fiber (g)', 'S

In [43]:
# Merge every cleaned file into one dataset and summarise the result
print("\n" + "="*80)
print("COMBINING ALL CLEANED DATASETS INTO ONE UNIFIED DATASET")
print("="*80)

# None is returned when there was nothing to combine
combined_dataset = combine_all_cleaned_datasets(childs_folder_path, 'unified_food_dataset.csv')

if combined_dataset is None:
    print("❌ Failed to create unified dataset.")
else:
    print("\n🔍 UNIFIED DATASET ANALYSIS:")
    if 'food_item' in combined_dataset.columns:
        unique_food_items = combined_dataset['food_item'].nunique()
    else:
        unique_food_items = 'N/A'
    print(f"Total unique food items: {unique_food_items}")

    # Summary statistics for whichever nutritional columns are present
    nutrition_cols = ['calories', 'proteins', 'carbohydrates', 'fats', 'fibers', 'sugars', 'sodium', 'cholesterol']
    available_nutrition_cols = [col for col in nutrition_cols if col in combined_dataset.columns]

    if available_nutrition_cols:
        nutrition_summary = combined_dataset[available_nutrition_cols].describe().round(2)
        print("\n📊 NUTRITIONAL SUMMARY STATISTICS:")
        print(nutrition_summary)

    # Preview of the unified dataset
    print("\n📄 SAMPLE OF UNIFIED DATASET (First 5 rows):")
    print(combined_dataset.head().to_string())

    print("\n✨ SUCCESS! All datasets have been cleaned and combined into one unified dataset.")


COMBINING ALL CLEANED DATASETS INTO ONE UNIFIED DATASET
Found 5 cleaned files to combine:
  - Food_1_cleaned.csv
  - Food_2_cleaned.csv
  - Food_3_cleaned.csv
  - Food_4_cleaned.csv
  - Food_5_cleaned.csv

Reading Food_1_cleaned.csv...
  ✓ Added 38 rows from Food_1_cleaned.csv

Reading Food_2_cleaned.csv...
  ✓ Added 10000 rows from Food_2_cleaned.csv

Reading Food_3_cleaned.csv...
  ✓ Added 656 rows from Food_3_cleaned.csv

Reading Food_4_cleaned.csv...
  ✓ Added 8790 rows from Food_4_cleaned.csv

Reading Food_5_cleaned.csv...
  ✓ Added 1460 rows from Food_5_cleaned.csv

Combining 5 datasets...

🎉 COMBINED DATASET CREATED:
  📁 File: ../../dataset/childs/unified_food_dataset.csv
  📊 Total rows: 20,944
  📋 Columns: ['food_item', 'calories', 'proteins', 'carbohydrates', 'fats', 'fibers', 'sugars', 'category', 'sodium', 'cholesterol', 'meal_type', 'water_intake', 'source_file']
  🔗 Source files: ['Food_1.csv' 'Food_2.csv' 'Food_3.csv' 'Food_4.csv' 'Food_5.csv']

📈 DISTRIBUTION BY SOURCE: