# In [None]:
import pandas as pd

# Cleaning pipeline for the recipe table: read "epi_r.csv", tidy it up, and
# write the result to "epi_r_cleaned.csv".
df = pd.read_csv("epi_r.csv")

# Step 1 - normalise the header row: trim surrounding whitespace first, then
# lowercase, so the column lookups below do not depend on the file's casing.
stripped_names = df.columns.str.strip()
df.columns = stripped_names.str.lower()

# Steps 2 and 3 - keep only rows that have a value in every essential
# nutritional column, then discard rows that are exact duplicates.
essential_columns = ['calories', 'protein', 'fat', 'sodium']
df = df.dropna(subset=essential_columns).drop_duplicates()

# Step 4 - find columns that carry almost no information: for each column,
# compute the share of remaining rows whose value equals 0, and flag the
# column when that share is strictly greater than the threshold.
threshold = 0.99  # Adjust if needed
zero_fraction = df.eq(0).sum() / len(df)
low_variance_cols = zero_fraction.loc[zero_fraction.gt(threshold)].index

# Step 5 - remove the flagged columns and renumber the rows from 0, since the
# row filtering above leaves gaps in the original index.
df = df.drop(columns=low_variance_cols).reset_index(drop=True)

# Persist the cleaned table without writing the index as a column.
df.to_csv("epi_r_cleaned.csv", index=False)

print("Cleaned dataset saved as 'epi_r_cleaned.csv'")
