In [6]:
import pandas as pd

# Input/output locations. The input path points at the Colab working
# directory; replace it with your own file if needed.
INPUT_CSV = "/content/usda_1000_nutrition_data.csv"
OUTPUT_CSV = "usda_1000_nutrition_data_cleaned.csv"

# Nutrient columns that are filled when missing and must be non-negative.
NUM_COLS = ['energy_kcal', 'protein_g', 'fat_g', 'carbohydrates_g', 'fiber_g', 'sugar_g']

# Clearer column names used in the cleaned output.
RENAMED_COLS = {
    'energy_kcal': 'calories',
    'protein_g': 'protein (g)',
    'fat_g': 'fat (g)',
    'carbohydrates_g': 'carbs (g)',
    'fiber_g': 'fiber (g)',
    'sugar_g': 'sugar (g)',
}


def clean_nutrition_data(raw, *, verbose=False):
    """Return a cleaned copy of a nutrition table.

    Steps, in order:
      1. Strip and lower-case ``description``.
      2. Fill missing nutrient values with 0 and missing descriptions with
         "unknown food".
      3. Drop fully duplicated rows.
      4. Drop rows with a negative value in any nutrient column.
      5. Rename the nutrient columns (see ``RENAMED_COLS``) and reset the index.

    Duplicates are removed *after* normalisation and filling so that rows
    differing only by case/whitespace in ``description``, or by NaN versus 0
    in a nutrient column, do not end up as identical rows in the output.

    Args:
        raw: DataFrame containing a text ``description`` column and every
            column listed in ``NUM_COLS``. It is not modified.
        verbose: When True, print the shape after duplicate removal.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.

    Raises:
        KeyError: If ``description`` or any of ``NUM_COLS`` is missing.
    """
    # Work on a copy: the caller's frame stays untouched, and assigning
    # columns here cannot trigger SettingWithCopyWarning.
    cleaned = raw.copy()

    # Standardize text in 'description' (missing values stay missing here).
    cleaned['description'] = cleaned['description'].str.strip().str.lower()

    # Handle missing values.
    cleaned[NUM_COLS] = cleaned[NUM_COLS].fillna(0)  # Change to .mean() if needed
    cleaned['description'] = cleaned['description'].fillna("unknown food")

    # Drop duplicates now that equivalent rows compare equal.
    cleaned = cleaned.drop_duplicates()
    if verbose:
        print("✅ After removing duplicates:", cleaned.shape)

    # Filter invalid rows: keep only rows where every nutrient is >= 0.
    all_non_negative = (cleaned[NUM_COLS] >= 0).all(axis=1)
    cleaned = cleaned[all_non_negative]

    # Rename columns for clarity and reindex.
    return cleaned.rename(columns=RENAMED_COLS).reset_index(drop=True)


# In a notebook or when run as a script __name__ is "__main__", so the steps
# below still execute and `df` remains a module-level variable; importing this
# code elsewhere only defines the function and constants.
if __name__ == "__main__":
    df = pd.read_csv(INPUT_CSV)
    print("📄 Original shape:", df.shape)

    df = clean_nutrition_data(df, verbose=True)

    # Save cleaned dataset
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"💾 Cleaned dataset saved as '{OUTPUT_CSV}'")
    print("✅ Final shape:", df.shape)

    # Preview
    print("\n🔍 Sample rows:\n", df.head())


📄 Original shape: (1000, 8)
✅ After removing duplicates: (1000, 8)
💾 Cleaned dataset saved as 'usda_1000_nutrition_data_cleaned.csv'
✅ Final shape: (1000, 8)

🔍 Sample rows:
   description    fdcId  calories  protein (g)  fat (g)  carbs (g)  fiber (g)  \
0       chili  1861843     102.0         6.78     5.08       7.20        0.8   
1       chili  2128385     246.0        14.00    17.50       7.02        5.3   
2       chili  1953562     110.0         6.78     3.81      11.90        1.3   
3       chili  2115157     122.0        10.40     4.35      10.90        3.5   
4       chili  2030019      97.0         5.93     5.08       6.78        0.8   

   sugar (g)  
0          0  
1          0  
2          0  
3          0  
4          0  
