# Dataset Column Header Conversion

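This notebook contains functions to convert the column headers in the meal planning dataset from the original format (mixed case, with units in parentheses) to a simplified lowercase format without units. The rename is not purely mechanical: `_id` becomes `id`, and `Protein`, `Fat`, and `Fiber` are pluralized to `proteins`, `fats`, and `fibers`. The notebook also includes a helper that converts rows of the cleaned dataset into a nutrition-only API format.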

## Original Format:
- `_id, Food_Item, Category, Calories (kcal), Protein (g), Carbohydrates (g), Fat (g), Fiber (g), Sugars (g), Sodium (mg), Cholesterol (mg), Meal_Type, Water_Intake (ml)`

## New Format:
- `id, food_item, category, calories, proteins, carbohydrates, fats, fibers, sugars, sodium, cholesterol, meal_type, water_intake`

In [1]:
import pandas as pd
import os

def convert_column_headers(input_file_path, output_file_path=None):
    """
    Convert column headers from the original format to simplified format.

    Original: _id,Food_Item,Category,Calories (kcal),Protein (g),Carbohydrates (g),Fat (g),Fiber (g),Sugars (g),Sodium (mg),Cholesterol (mg),Meal_Type,Water_Intake (ml)
    New: id,food_item,category,calories,proteins,carbohydrates,fats,fibers,sugars,sodium,cholesterol,meal_type,water_intake

    Parameters
    ----------
    input_file_path : str or os.PathLike
        CSV file to read.
    output_file_path : str or os.PathLike, optional
        Destination for the renamed CSV. When omitted, "_cleaned" is inserted
        just before the input file's extension (e.g. data.csv -> data_cleaned.csv).

    Returns
    -------
    pandas.DataFrame
        The frame with renamed columns; the same data is written to
        output_file_path without the index.
    """

    # Define the column mapping
    column_mapping = {
        '_id': 'id',
        'Food_Item': 'food_item',
        'Category': 'category',
        'Calories (kcal)': 'calories',
        'Protein (g)': 'proteins',
        'Carbohydrates (g)': 'carbohydrates',
        'Fat (g)': 'fats',
        'Fiber (g)': 'fibers',
        'Sugars (g)': 'sugars',
        'Sodium (mg)': 'sodium',
        'Cholesterol (mg)': 'cholesterol',
        'Meal_Type': 'meal_type',
        'Water_Intake (ml)': 'water_intake'
    }

    # Read the CSV file
    print(f"Reading file: {input_file_path}")
    df = pd.read_csv(input_file_path)

    print(f"Original columns: {list(df.columns)}")
    print(f"Dataset shape: {df.shape}")

    # rename() silently skips mapping keys that are absent from the frame,
    # so report them instead of letting a header mismatch go unnoticed.
    missing_columns = [name for name in column_mapping if name not in df.columns]
    if missing_columns:
        print(f"Warning: expected source columns not found: {missing_columns}")

    # Rename columns
    df_renamed = df.rename(columns=column_mapping)

    print(f"New columns: {list(df_renamed.columns)}")

    # Set output file path if not provided
    if output_file_path is None:
        # Only touch the final extension. A plain str.replace('.csv', ...) would
        # also rewrite '.csv' inside directory names and, for inputs without a
        # '.csv' extension, would leave the path unchanged and overwrite the input.
        root, ext = os.path.splitext(os.fspath(input_file_path))
        output_file_path = f"{root}_cleaned{ext}"

    # Save the cleaned dataset
    df_renamed.to_csv(output_file_path, index=False)
    print(f"Cleaned dataset saved to: {output_file_path}")

    return df_renamed

In [2]:
# Execute the conversion
input_file = '../dataset/daily_food_nutrition_dataset_with_ids.csv'
output_file = '../dataset/daily_food_nutrition_dataset_cleaned.csv'

# Convert the dataset
cleaned_df = convert_column_headers(input_file, output_file)

# Display first few rows of the cleaned dataset
print("\nFirst 5 rows of cleaned dataset:")
print(cleaned_df.head())

# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would append a stray "None".
print("\nDataset info:")
cleaned_df.info()

Reading file: ../dataset/daily_food_nutrition_dataset_with_ids.csv
Original columns: ['_id', 'Food_Item', 'Category', 'Calories (kcal)', 'Protein (g)', 'Carbohydrates (g)', 'Fat (g)', 'Fiber (g)', 'Sugars (g)', 'Sodium (mg)', 'Cholesterol (mg)', 'Meal_Type', 'Water_Intake (ml)']
Dataset shape: (10000, 13)
New columns: ['id', 'food_item', 'category', 'calories', 'proteins', 'carbohydrates', 'fats', 'fibers', 'sugars', 'sodium', 'cholesterol', 'meal_type', 'water_intake']
Cleaned dataset saved to: ../dataset/daily_food_nutrition_dataset_cleaned.csv

First 5 rows of cleaned dataset:
                         id       food_item category  calories  proteins  \
0  6843fa1e7fe66773fab3281d            Eggs     Meat       173      42.4   
1  6843fa1e7fe66773fab3281e           Apple   Fruits        66      39.2   
2  6843fa1e7fe66773fab3281f  Chicken Breast     Meat       226      27.1   
3  6843fa1e7fe66773fab32820          Banana   Fruits       116      43.4   
4  6843fa1e7fe66773fab32821      

In [3]:
# Sanity-check the renamed frame: report the dtype and the first three values
# of each nutrient column, then list the distinct meal types and categories.
nutrition_cols = [
    'calories', 'proteins', 'carbohydrates', 'fats',
    'fibers', 'sugars', 'sodium', 'cholesterol',
]

print("Data types of nutritional columns:")
for col in nutrition_cols:
    # Columns absent from the frame are skipped rather than raising KeyError
    if col not in cleaned_df.columns:
        continue
    series = cleaned_df[col]
    print(f"{col}: {series.dtype} - Sample values: {series.head(3).tolist()}")

# The heading is always printed; the values only when the column exists
for heading, key in (
    ("\nUnique meal types:", 'meal_type'),
    ("\nUnique categories:", 'category'),
):
    print(heading)
    if key in cleaned_df.columns:
        print(cleaned_df[key].unique())

Data types of nutritional columns:
calories: int64 - Sample values: [173, 66, 226]
proteins: float64 - Sample values: [42.4, 39.2, 27.1]
carbohydrates: float64 - Sample values: [83.7, 13.8, 79.1]
fats: float64 - Sample values: [1.5, 3.2, 25.8]
fibers: float64 - Sample values: [1.5, 2.6, 3.2]
sugars: float64 - Sample values: [12.7, 12.2, 44.7]
sodium: int64 - Sample values: [752, 680, 295]
cholesterol: int64 - Sample values: [125, 97, 157]

Unique meal types:
['Lunch' 'Breakfast' 'Snack' 'Dinner']

Unique categories:
['Meat' 'Fruits' 'Grains' 'Vegetables' 'Snacks' 'Beverages' 'Dairy']


In [4]:
def convert_to_api_format(df, selected_rows=None):
    """
    Build the API-style payload from rows of a dataframe.

    Each row becomes a dict holding only the eight nutritional fields, every
    value cast to float. A field whose column is absent falls back to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read the nutritional columns from.
    selected_rows : optional
        Positional row selector handed to ``df.iloc``; when None every row
        of ``df`` is converted.

    Returns
    -------
    list of dict
        One dict per row, in row order.
    """
    # Tuple order fixes the key order of every emitted dict
    nutrient_keys = (
        "fats",
        "calories",
        "sugars",
        "proteins",
        "fibers",
        "sodium",
        "cholesterol",
        "carbohydrates",
    )

    rows = df if selected_rows is None else df.iloc[selected_rows]

    return [
        {key: float(record.get(key, 0)) for key in nutrient_keys}
        for _, record in rows.iterrows()
    ]

# Demo: convert the first three rows of the cleaned dataset and print each result
print("Sample API format output:")
sample_api_data = convert_to_api_format(cleaned_df, [0, 1, 2])
for row_number, item in enumerate(sample_api_data, start=1):
    print(f"Row {row_number}: {item}")

Sample API format output:
Row 1: {'fats': 1.5, 'calories': 173.0, 'sugars': 12.7, 'proteins': 42.4, 'fibers': 1.5, 'sodium': 752.0, 'cholesterol': 125.0, 'carbohydrates': 83.7}
Row 2: {'fats': 3.2, 'calories': 66.0, 'sugars': 12.2, 'proteins': 39.2, 'fibers': 2.6, 'sodium': 680.0, 'cholesterol': 97.0, 'carbohydrates': 13.8}
Row 3: {'fats': 25.8, 'calories': 226.0, 'sugars': 44.7, 'proteins': 27.1, 'fibers': 3.2, 'sodium': 295.0, 'cholesterol': 157.0, 'carbohydrates': 79.1}
