# In [None]:
import pandas as pd
import numpy as np

# Load the raw CSV into a DataFrame; the steps below read or modify `df`.
file_path = '/content/nutritionix_bulk_data.csv'  # Hard-coded absolute path; update if needed
df = pd.read_csv(file_path)

# Print the (rows, columns) shape before any cleaning so later shapes can be compared to it.
print(f"🔎 Initial dataset shape: {df.shape}")

# -------------------------------
# 1. Check & Report Missing Values
# -------------------------------

# Per-column count of missing cells, and the same count as a percentage of all rows.
missing_counts = df.isna().sum()
missing_percent = missing_counts.div(len(df)).mul(100)

# Put both measures side by side in one table indexed by column name.
missing_df = pd.DataFrame(
    {'Missing Values': missing_counts, 'Percentage (%)': missing_percent}
)

print("\n📋 Missing Values Report:")
# Only columns that have at least one missing cell are shown.
print(missing_df.loc[missing_df['Missing Values'].gt(0)])

# -------------------------------
# 2. Fill Missing Numeric Values with Random Numbers
# -------------------------------

# Columns that pandas stores with a numeric dtype.
numeric_cols = df.select_dtypes(include=[np.number]).columns

for col in numeric_cols:
    # Build the null mask once; it drives both the count and the assignment.
    missing_mask = df[col].isnull()
    num_missing = missing_mask.sum()
    if num_missing == 0:
        continue

    if missing_mask.all():
        # A column with no observed values has NaN as both min and max, so
        # uniform(NaN, NaN) would only produce NaN again while the success
        # message claimed the gaps were filled. Report it and move on.
        print(f"⚠️ Skipped numeric column '{col}': no existing values to derive a range from.")
        continue

    # min()/max() ignore NaN, so the bounds come from the observed values only.
    col_min, col_max = df[col].min(), df[col].max()

    # Draw one value per missing cell from the half-open range [col_min, col_max).
    # NOTE(review): the draws are unseeded, so every run fills different values.
    random_values = np.random.uniform(low=col_min, high=col_max, size=num_missing)
    df.loc[missing_mask, col] = random_values
    print(f"✅ Replaced {num_missing} missing values in numeric column '{col}' with random values.")

# -------------------------------
# 3. Fill Missing Categorical Values (Optional)
# -------------------------------

# Columns held with the generic object dtype are treated as categorical here.
categorical_cols = df.select_dtypes(include=['object']).columns

for col in categorical_cols:
    is_missing = df[col].isna()
    num_missing = is_missing.sum()
    if not num_missing:
        continue

    # Candidate fill values: every distinct non-missing entry already in the column.
    existing_values = df[col].dropna().unique()
    if len(existing_values) == 0:
        # Nothing to sample from, so the column is left untouched and unreported.
        continue

    # Each distinct value is equally likely, regardless of how often it occurs.
    random_choices = np.random.choice(existing_values, size=num_missing)
    df.loc[is_missing, col] = random_choices
    print(f"✅ Replaced {num_missing} missing values in categorical column '{col}' with random existing values.")

# -------------------------------
# 4. Remove Duplicate Rows
# -------------------------------

# Count rows that exactly repeat an earlier row across every column.
# NOTE(review): this runs after the random fills above, so rows that were
# identical only before imputation may no longer be flagged; confirm the
# ordering is intended.
duplicate_count = df.duplicated(keep='first').sum()
print(f"\n🔁 Duplicate rows found: {duplicate_count}")

# Keep the first occurrence of each repeated row and modify `df` in place.
if duplicate_count:
    df.drop_duplicates(keep='first', inplace=True)
    print(f"✅ Removed duplicates. New shape: {df.shape}")

# -------------------------------
# 5. Save the Cleaned Dataset
# -------------------------------

# Write the cleaned frame to its own CSV; the path differs from the input file,
# so the original data is not overwritten.
output_path = '/content/cleaned_dataset.csv'
# index=False keeps the DataFrame index out of the written file.
df.to_csv(output_path, index=False)
print(f"\n💾 Cleaned dataset saved to: {output_path}")




