## Task: prep_time_distribution

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global RNG; nothing in this cell draws random numbers,
# but the call is kept unchanged.
np.random.seed(42)

path = '/sandbox/data/cleaned_salad_data.csv'
df = pd.read_csv(path)

# Keep only the columns this task looks at, skipping any the CSV lacks.
selected_cols = [name for name in ('prep_time', 'salad_title') if name in df.columns]
df = df[selected_cols].copy()

if 'prep_time' not in df.columns:
    # Column absent: emit a labelled, empty figure at matplotlib's default size.
    plt.figure()
    chart_title = 'Prep Time Distribution (column missing)'
else:
    # Coerce to numbers; entries that cannot be parsed become NaN and are dropped.
    df['prep_time'] = pd.to_numeric(df['prep_time'], errors='coerce')
    df = df.dropna(subset=['prep_time']).copy()
    plt.figure(figsize=(8, 6))
    if df.empty:
        # Nothing left to bin: keep the axes but flag the empty result in the title.
        chart_title = 'Prep Time Distribution (no data)'
    else:
        plt.hist(df['prep_time'], bins=20, color='steelblue', edgecolor='black')
        chart_title = 'Prep Time Distribution'

# Axis labels, output file and figure teardown are the same for every branch.
plt.xlabel('Preparation Time (minutes)')
plt.ylabel('Count')
plt.title(chart_title)
plt.savefig('prep_time_distribution.png')
plt.close()

### Plot
![](sandbox/output/prep_time_distribution_0c15_prep_time_distribution.png)

### Insight for prep_time_distribution

{
  "observation": "The prep_time distribution is right-skewed: the majority of salads require short preparation times (roughly under 30 minutes), with a long tail of longer recipes extending beyond 60 minutes and even up to several hours in the dataset. There are several outliers with very long prep_times, suggesting some complex or multi-step dishes. Some columns in the sample appear as strings while others are numeric, indicating potential data cleanliness issues.",
  "insight": "Most customers appear to prefer quick-to-prepare salads, which represents a clear opportunity to expand and promote a large catalog of recipes in the sub-30-minute range. The presence of longer, more elaborate salads adds variety and the potential for premium offerings, but these should be clearly categorized and marketed to avoid confusing the quick-meal expectation.",
  "actionable": [
    "Create prep_time bins (e.g., <=15, 16-30, 31-60, >60) and analyze how ratings (n_star) and review counts (n_review) vary by bin.",
    "Clean and standardize data: convert Carbohydrates, Cholesterol, and Sodium from strings to numeric; handle or impute missing prep_time values.",
    "Prioritize adding and promoting more recipes in the <=30 minute bucket; implement a 'prep_time' filter in the product catalog.",
    "Investigate outliers (>60 minutes) to determine if they reflect data entry errors or legitimate complex recipes; decide on dropping or flagging them for special labeling.",
    "Develop a lightweight dashboard to monitor prep_time distribution and its relationship with ratings over time."
  ]
}



## Task: average_calories_by_rating

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Bar chart of mean Calories per star rating, saved to
# 'average_calories_by_rating.png'. If either column is missing, a blank
# figure is saved instead so the output file always exists.
np.random.seed(42)  # nothing in this cell draws random numbers; call kept unchanged

path = '/sandbox/data/cleaned_salad_data.csv'
df = pd.read_csv(path)

if 'Calories' in df.columns and 'n_star' in df.columns:
    df = df.copy()
    # Coerce both columns to numbers; unparseable entries become NaN and
    # those rows are excluded from the averages.
    df['n_star'] = pd.to_numeric(df['n_star'], errors='coerce')
    df['Calories'] = pd.to_numeric(df['Calories'], errors='coerce')
    df = df.dropna(subset=['n_star', 'Calories']).copy()

    # One row per distinct rating with the mean Calories for that rating.
    result = df.pivot_table(index='n_star', values='Calories', aggfunc='mean').reset_index()
    result = result.sort_values('n_star')

    # Use string labels so the bars are evenly spaced categories rather than
    # positions on a numeric axis.
    result['n_star'] = result['n_star'].astype(str)

    plt.figure(figsize=(8, 6))
    plt.bar(result['n_star'], result['Calories'])
    plt.xlabel('n_star')
    plt.ylabel('Mean Calories')
    plt.title('Average Calories by Star Rating')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('average_calories_by_rating.png')
    # Release the figure once it is on disk; pyplot otherwise keeps it open.
    plt.close()
else:
    # Required column(s) missing: still write a (blank) output file.
    plt.figure()
    plt.savefig('average_calories_by_rating.png')
    plt.close()

### Plot
![](sandbox/output/average_calories_by_rating_a267_average_calories_by_rating.png)

### Insight for average_calories_by_rating

{
  "observation": "Average calories increase with star rating: salads rated 4.0 average around 260 kcal, 4.5 around 390 kcal, and 5.0 around 410 kcal per serving, with some high-calorie outliers among 5.0-rated recipes.",
  "insight": "Higher-rated salads tend to be more caloric, suggesting that indulgent or richer ingredients may drive ratings. This could misalign with health-focused goals. There are also potential data-quality and outlier effects (string fields for some numeric attributes and extreme values) that may influence the observed trend.",
  "actionable": [
    "Clean and validate data (convert Carbohydrates, Cholesterol, Sodium to numeric; handle missing Calories or n_star).",
    "Compute the correlation between Calories and n_star and quantify variance/outliers to assess robustness.",
    "Develop and test healthier, high-rated salad options; create a 'low-calorie, high-rating' subset for marketing.",
    "Monitor and report regularly on calories vs. rating to detect shifts and outliers."
  ]
}



## Task: calories_vs_protein_scatter

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Scatter plot of Calories vs Protein with an optional linear trend line,
# saved to 'calories_vs_protein_scatter.png'. Every failure path writes a
# placeholder image with a short message so the output file always exists.
np.random.seed(42)  # nothing in this cell draws random numbers; call kept unchanged

path = '/sandbox/data/cleaned_salad_data.csv'
output_png = 'calories_vs_protein_scatter.png'
required_cols = ['Calories', 'Protein']


def _save_placeholder(message):
    """Save an axis-free figure showing *message* to the output PNG."""
    plt.figure()
    plt.text(0.5, 0.5, message, ha='center', va='center')
    plt.axis('off')
    plt.savefig(output_png)
    plt.close()


try:
    df = pd.read_csv(path)
except Exception as exc:
    # Best-effort: fall back to an empty frame (which leads to the
    # 'Required columns not found' placeholder) but report why the read failed.
    print(f'Could not read {path}: {exc!r}')
    df = pd.DataFrame()

try:
    if not all(col in df.columns for col in required_cols):
        _save_placeholder('Required columns not found')
    else:
        df_sub = df[required_cols].copy()
        # Coerce to numbers; unparseable entries become NaN and are dropped.
        for col in required_cols:
            df_sub[col] = pd.to_numeric(df_sub[col], errors='coerce')
        df_sub = df_sub.dropna(subset=required_cols)

        if df_sub.empty:
            _save_placeholder('No data after cleaning')
        else:
            plt.figure(figsize=(6, 4))
            plt.scatter(df_sub['Calories'], df_sub['Protein'], alpha=0.7)
            plt.xlabel('Calories')
            plt.ylabel('Protein')
            plt.title('Calories vs Protein')

            # The trend line is optional. A straight-line fit needs at least
            # two distinct x values, so skip it otherwise.
            if df_sub['Calories'].nunique() >= 2:
                try:
                    m, b = np.polyfit(df_sub['Calories'], df_sub['Protein'], 1)
                except (np.linalg.LinAlgError, ValueError, TypeError) as exc:
                    # Keep the scatter; only the trend line is dropped.
                    print(f'Trend line skipped: {exc!r}')
                else:
                    x_vals = np.array([df_sub['Calories'].min(), df_sub['Calories'].max()])
                    y_vals = m * x_vals + b
                    plt.plot(x_vals, y_vals, color='red', linestyle='--', label='Trend')
                    plt.legend()

            plt.tight_layout()
            plt.savefig(output_png)
            plt.close()
except Exception as exc:
    # Top-level boundary: report the failure, discard any half-built figure,
    # and still produce an output file.
    print(f'Error during processing: {exc!r}')
    plt.close('all')
    _save_placeholder('Error during processing')

### Plot
![](sandbox/output/calories_vs_protein_scatter_ea64_calories_vs_protein_scatter.png)

### Insight for calories_vs_protein_scatter

{
  "observation": "The Calories vs Protein scatter shows a general positive relationship: salads with higher calories tend to have more protein, but there is substantial dispersion and several outliers (e.g., very high-calorie salads with high or moderate protein). Additionally, some numeric fields (Carbohydrates, Cholesterol, Sodium) are stored as strings, which may affect downstream analysis.",
  "insight": "There is a moderate correlation between calories and protein, indicating protein content often rises with total calories but not consistently. From a business perspective, there is value in promoting high-protein salads that are not disproportionately high in calories (protein density = protein per calorie) to appeal to health-conscious customers.",
  "actionable": [
    "Convert all numeric columns (Calories, Protein, Carbohydrates, Cholesterol, Sodium, etc.) to proper numeric types and handle missing values.",
    "Compute protein-per-calorie (protein density) and identify the top 10 salads with highest density within moderate calorie ranges.",
    "Develop or adjust recipes to increase protein while controlling calories (e.g., lean proteins, legumes, eggs) and reduce high-fat/additional-calorie ingredients.",
    "Add a trend line and report R-squared to quantify the calories-protein relationship for better decision making.",
    "Improve data quality and validation (standardize Carbohydrates, Cholesterol, Sodium as numeric) to enable reliable analytics going forward."
  ]
}



## Task: top_salads_by_reviews

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Bar chart of the five salads with the most reviews, saved to
# 'top_salads_by_reviews.png'. A blank figure is saved when a required column
# is missing or when no usable rows remain, so the output file always exists.
df = pd.read_csv('/sandbox/data/cleaned_salad_data.csv')

required_cols = ['salad_title', 'n_review']
existing_cols = [c for c in required_cols if c in df.columns]

df_sub = df[existing_cols].copy()

if 'n_review' in df_sub.columns:
    # Coerce review counts to numbers; unparseable entries become NaN.
    df_sub['n_review'] = pd.to_numeric(df_sub['n_review'], errors='coerce')

has_required_cols = 'salad_title' in df_sub.columns and 'n_review' in df_sub.columns
if has_required_cols:
    # Rows without a title or a numeric review count cannot be ranked.
    df_sub = df_sub.dropna(subset=['salad_title', 'n_review']).copy()

if has_required_cols and not df_sub.empty:
    df_sub = df_sub.sort_values('n_review', ascending=False).copy()
    top_df = df_sub.head(5).copy()
    # String labels for the categorical x axis.
    top_df['salad_title'] = top_df['salad_title'].astype(str)
    top_df = top_df.reset_index(drop=True).copy()

    plt.figure(figsize=(8, 6))
    plt.bar(top_df['salad_title'], top_df['n_review'], color='steelblue')
    plt.xlabel('Salad Title')
    plt.ylabel('Number of Reviews')
    plt.title('Top 5 Salads by Reviews')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('top_salads_by_reviews.png')
    plt.close()
else:
    # Missing column(s) or nothing left after cleaning: still write a
    # (blank) output file instead of silently producing nothing.
    plt.figure()
    plt.savefig('top_salads_by_reviews.png')
    plt.close()

### Plot
![](sandbox/output/top_salads_by_reviews_b8ca_top_salads_by_reviews.png)

### Insight for top_salads_by_reviews

{
  "observation": "The bar chart ranks salads by the number of user reviews. Crunchy Noodle Salad has the most reviews, followed by Tomato Feta Pasta Salad, Black Bean and Corn Salad, Cafe Green Salad, and Spinach Salad with Warm Bacon Dressing; together these make up the top 5.",
  "insight": "Higher review counts indicate greater visibility and consumer engagement for these recipes. However, review count alone doesn't reveal quality or healthfulness. The top items tend to be indulgent or complex dishes, suggesting popularity is driven by variety and exposure as much as nutrition. From a business perspective, leveraging these popular items can boost traffic, but ratings and sentiment should also be monitored to ensure quality.",
  "actionable": [
    "Feature the top-5 salads in a 'Most Reviewed' or 'Popular' collection to drive engagement.",
    "Combine review counts with average ratings to assess overall quality (e.g., compute average rating and sentiment per item).",
    "Clean and standardize data types (convert numeric fields stored as strings, handle missing values) to improve downstream analytics.",
    "Run A/B tests to measure impact of promoting top-reviewed salads on user engagement and conversions.",
    "Analyze drivers of popularity (prep_time, calories, ingredients) to inform recipe development and recommendation strategies."
  ]
}

