In [None]:
import pandas as pd
import os

# Project input files:
#   RAW_recipes.csv      - recipe details (loaded below).
#   RAW_interactions.csv - user reviews and ratings for the recipes.
data_path = "../data/RAW_recipes.csv"

# Guard clause: stop early with an explicit message when the CSV is absent,
# so the happy path below stays un-nested.
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Dataset not found at {data_path}. Please ensure it exists.")

recipes_df = pd.read_csv(data_path)
print("Dataset loaded successfully!")

# Display dataset structure and first few rows.
print("Dataset Structure:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
recipes_df.info()
print("\nSample Data:")
print(recipes_df.head())

# Count missing (null) values per column.
missing_values = recipes_df.isnull().sum()
print("\nMissing Values in Each Column:")
print(missing_values)

# Basic statistics of numerical columns.
print("\nBasic Statistics:")
print(recipes_df.describe())

# Columns kept for the recommender; every other column is dropped.
columns_to_keep = [
    'id',
    'name',
    'minutes',
    'tags',
    'nutrition',
    'n_steps',
    'ingredients',
]
filtered_df = recipes_df.loc[:, columns_to_keep]

# Write the reduced table to disk (row index omitted) for further processing.
filtered_df.to_csv("../data/filtered_recipes.csv", index=False)
print("\nFiltered dataset saved as 'filtered_recipes.csv'.")
