In [2]:
import pandas as pd

# Numeric columns that are coerced to numbers (unparseable values become NaN).
numeric_cols = ['calories', 'fat', 'protein', 'carbs', 'servings']


def clean_recipes(raw):
    """Return a cleaned copy of a recipe DataFrame.

    The input is not modified. After column-name normalization the frame
    must contain the columns ``title``, ingredients``, ``url`` and every
    name in ``numeric_cols``; a missing column raises ``KeyError``.

    Steps applied, in order:
      * column names are stripped, lower-cased, and spaces become ``_``;
      * ``title`` and ``url`` are whitespace-stripped; in ``ingredients``
        each run of non-ASCII characters is replaced by one space before
        stripping;
      * ``title`` / ``ingredients`` values that end up as the empty string
        are turned into missing values, so they are dropped below instead
        of surviving as blank text;
      * ``numeric_cols`` are converted with ``pd.to_numeric(errors='coerce')``;
      * rows missing ``title``, ``ingredients``, ``calories`` or ``servings``
        are dropped and the index is reset to 0..n-1.
    """
    cleaned = raw.copy()

    # Step 2: Clean column names (literal space replacement, not a regex)
    cleaned.columns = (
        cleaned.columns.str.strip().str.lower().str.replace(' ', '_', regex=False)
    )

    # Step 3: Strip whitespace and unwanted characters from text columns
    cleaned['title'] = cleaned['title'].str.strip()
    cleaned['ingredients'] = (
        cleaned['ingredients']
        .str.replace(r'[^\x00-\x7F]+', ' ', regex=True)
        .str.strip()
    )
    cleaned['url'] = cleaned['url'].str.strip()

    # A value that was only whitespace (or only non-ASCII characters) is now
    # '', which dropna() does not treat as missing. Mask it out explicitly.
    for text_col in ('title', 'ingredients'):
        cleaned[text_col] = cleaned[text_col].mask(cleaned[text_col].eq(''))

    # Step 4: Convert numeric columns to proper types
    for col in numeric_cols:
        cleaned[col] = pd.to_numeric(cleaned[col], errors='coerce')

    # Step 5: Drop rows with missing essential values
    cleaned = cleaned.dropna(
        subset=['title', 'ingredients', 'calories', 'servings']
    )

    # Step 6: Reset index
    return cleaned.reset_index(drop=True)


# The guard is true when this runs as a notebook cell or a script, so `df`
# is still defined at top level for any code that follows.
if __name__ == "__main__":
    # Step 1: Load the dataset
    df = clean_recipes(pd.read_csv("merged_output.csv"))  # Replace with your actual filename

    # Optional: Show clean dataframe
    print(df.head())

    # Optional: Save cleaned file
    df.to_csv("cleaned_dataset.csv", index=False)


                                       title  \
0                              Stalker Pasta   
1                Vegan Wild Mushroom Lasagna   
2  RWOP Finalist: Tantalizing Tilapia Recipe   
3             Blue Cheese Portobello Burgers   
4       Pan-Grilled Portobello Mushroom Caps   

                                         ingredients     calories         fat  \
0  3 tbsp. olive oil; 2 oz. pancetta or regular b...  1577.696629  129.212526   
1  9 sheets of oven-ready, no-boil lasagna; 1 1/2...  4751.642910   99.368593   
2  2 tsp blackened seasoning; 1 tsp lemon pepper ...  4423.251579  355.047064   
3  3 tablespoons extra-virgin olive oil, divided;...  1345.744050   69.446261   
4  4 x portobello mushroom caps, the dry stem tri...   677.321018   61.410305   

      protein       carbs                                                url  \
0   80.654905   13.045006  https://www.delish.com/cooking/recipe-ideas/re...   
1  131.580270  817.181987  https://holycowvegan.net/vegan-wild-m