In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.patches import Patch

In [2]:
class FoodDataManager:
    """Manages food data, properties, and categories"""
    
    def __init__(self, food_data=None):
        """
        Initialize food data manager
        
        Args:
            food_data (list): Optional custom food data, otherwise uses default
        """
        # Define food categories for grouping
        self.food_categories = {
            "meat": ["beef", "pork", "chicken", "lamb"],
            "seafood": ["salmon", "tuna", "shrimp"],
            "dairy": ["milk", "cheese", "yogurt", "butter", "cream", "eggs"],
            "grains": ["rice", "pasta", "bread", "quinoa", "oats"],
            "vegetables": ["lettuce", "spinach", "kale", "carrots", "broccoli", "cauliflower", 
                          "bell peppers", "tomatoes", "cucumber", "zucchini", "eggplant", 
                          "onions", "garlic", "mushrooms", "potatoes", "sweet potatoes"],
            "fruits": ["apples", "bananas", "oranges", "berries", "grapes", "melons", "avocados"],
            "prepared_foods": ["soup", "stew", "casserole", "pizza", "lasagna", "curry"],
            "condiments": ["tomato sauce", "mayonnaise", "salad dressing", "pesto", "hummus"],
            "plant_protein": ["tofu"]
        }
        
        # Create a mapping for quick category lookup
        self.item_to_category = {}
        for category, items in self.food_categories.items():
            for item in items:
                self.item_to_category[item] = category
        
        # Food database with properties
        self.food_database = food_data if food_data else self._get_default_food_data()
    
    def _get_default_food_data(self):
        """Return default food database if none provided"""
        return [
            # Proteins
            {"item": "beef", "typical_purchase_kg": 0.5, "typical_leftover_kg": 0.15, "spoilage_days": 3},
            {"item": "pork", "typical_purchase_kg": 0.5, "typical_leftover_kg": 0.125, "spoilage_days": 3},
            {"item": "chicken", "typical_purchase_kg": 0.75, "typical_leftover_kg": 0.2, "spoilage_days": 2},
            {"item": "lamb", "typical_purchase_kg": 0.4, "typical_leftover_kg": 0.12, "spoilage_days": 3},
            {"item": "salmon", "typical_purchase_kg": 0.3, "typical_leftover_kg": 0.09, "spoilage_days": 1},
            {"item": "tuna", "typical_purchase_kg": 0.2, "typical_leftover_kg": 0.06, "spoilage_days": 1},
            {"item": "shrimp", "typical_purchase_kg": 0.25, "typical_leftover_kg": 0.075, "spoilage_days": 1},
            {"item": "eggs", "typical_purchase_kg": 0.6, "typical_leftover_kg": 0.15, "spoilage_days": 14},
            {"item": "tofu", "typical_purchase_kg": 0.4, "typical_leftover_kg": 0.12, "spoilage_days": 5},
            
            # Dairy
            {"item": "milk", "typical_purchase_kg": 1.0, "typical_leftover_kg": 0.25, "spoilage_days": 5},
            {"item": "cheese", "typical_purchase_kg": 0.3, "typical_leftover_kg": 0.08, "spoilage_days": 14},
            {"item": "yogurt", "typical_purchase_kg": 0.5, "typical_leftover_kg": 0.15, "spoilage_days": 7},
            {"item": "butter", "typical_purchase_kg": 0.25, "typical_leftover_kg": 0.05, "spoilage_days": 30},
            {"item": "cream", "typical_purchase_kg": 0.3, "typical_leftover_kg": 0.1, "spoilage_days": 5},
            
            # Grains & Starches
            {"item": "rice", "typical_purchase_kg": 1.0, "typical_leftover_kg": 0.3, "spoilage_days": 4},
            {"item": "pasta", "typical_purchase_kg": 1.0, "typical_leftover_kg": 0.3, "spoilage_days": 3},
            {"item": "bread", "typical_purchase_kg": 0.8, "typical_leftover_kg": 0.2, "spoilage_days": 5},
            {"item": "potatoes", "typical_purchase_kg": 2.0, "typical_leftover_kg": 0.5, "spoilage_days": 14},
            {"item": "sweet potatoes", "typical_purchase_kg": 1.0, "typical_leftover_kg": 0.25, "spoilage_days": 7},
            {"item": "quinoa", "typical_purchase_kg": 0.5, "typical_leftover_kg": 0.15, "spoilage_days": 4},
            {"item": "oats", "typical_purchase_kg": 1.0, "typical_leftover_kg": 0.2, "spoilage_days": 30},
            
            # Vegetables
            {"item": "lettuce", "typical_purchase_kg": 0.3, "typical_leftover_kg": 0.1, "spoilage_days": 5},
            {"item": "spinach", "typical_purchase_kg": 0.2, "typical_leftover_kg": 0.07, "spoilage_days": 4},
            {"item": "kale", "typical_purchase_kg": 0.2, "typical_leftover_kg": 0.06, "spoilage_days": 5},
            {"item": "carrots", "typical_purchase_kg": 0.5, "typical_leftover_kg": 0.15, "spoilage_days": 14},
            {"item": "broccoli", "typical_purchase_kg": 0.4, "typical_leftover_kg": 0.12, "spoilage_days": 5},
            {"item": "cauliflower", "typical_purchase_kg": 0.6, "typical_leftover_kg": 0.18, "spoilage_days": 7},
            {"item": "bell peppers", "typical_purchase_kg": 0.4, "typical_leftover_kg": 0.12, "spoilage_days": 7},
            {"item": "tomatoes", "typical_purchase_kg": 0.6, "typical_leftover_kg": 0.15, "spoilage_days": 7},
            {"item": "cucumber", "typical_purchase_kg": 0.3, "typical_leftover_kg": 0.09, "spoilage_days": 7},
            {"item": "zucchini", "typical_purchase_kg": 0.4, "typical_leftover_kg": 0.12, "spoilage_days": 5},
            {"item": "eggplant", "typical_purchase_kg": 0.4, "typical_leftover_kg": 0.12, "spoilage_days": 5},
            {"item": "onions", "typical_purchase_kg": 1.0, "typical_leftover_kg": 0.2, "spoilage_days": 30},
            {"item": "garlic", "typical_purchase_kg": 0.2, "typical_leftover_kg": 0.05, "spoilage_days": 60},
            {"item": "mushrooms", "typical_purchase_kg": 0.3, "typical_leftover_kg": 0.1, "spoilage_days": 5},
            
            # Fruits
            {"item": "apples", "typical_purchase_kg": 1.2, "typical_leftover_kg": 0.3, "spoilage_days": 14},
            {"item": "bananas", "typical_purchase_kg": 1.0, "typical_leftover_kg": 0.25, "spoilage_days": 5},
            {"item": "oranges", "typical_purchase_kg": 1.0, "typical_leftover_kg": 0.25, "spoilage_days": 14},
            {"item": "berries", "typical_purchase_kg": 0.3, "typical_leftover_kg": 0.1, "spoilage_days": 3},
            {"item": "grapes", "typical_purchase_kg": 0.5, "typical_leftover_kg": 0.15, "spoilage_days": 5},
            {"item": "melons", "typical_purchase_kg": 1.5, "typical_leftover_kg": 0.5, "spoilage_days": 7},
            {"item": "avocados", "typical_purchase_kg": 0.4, "typical_leftover_kg": 0.1, "spoilage_days": 5},
            
            # Prepared Foods
            {"item": "soup", "typical_purchase_kg": 1.0, "typical_leftover_kg": 0.3, "spoilage_days": 3},
            {"item": "stew", "typical_purchase_kg": 1.0, "typical_leftover_kg": 0.3, "spoilage_days": 3},
            {"item": "casserole", "typical_purchase_kg": 1.2, "typical_leftover_kg": 0.4, "spoilage_days": 3},
            {"item": "pizza", "typical_purchase_kg": 0.8, "typical_leftover_kg": 0.3, "spoilage_days": 2},
            {"item": "lasagna", "typical_purchase_kg": 1.0, "typical_leftover_kg": 0.35, "spoilage_days": 3},
            {"item": "curry", "typical_purchase_kg": 0.8, "typical_leftover_kg": 0.25, "spoilage_days": 3},
            
            # Condiments & Sauces
            {"item": "tomato sauce", "typical_purchase_kg": 0.5, "typical_leftover_kg": 0.15, "spoilage_days": 5},
            {"item": "mayonnaise", "typical_purchase_kg": 0.3, "typical_leftover_kg": 0.1, "spoilage_days": 60},
            {"item": "salad dressing", "typical_purchase_kg": 0.3, "typical_leftover_kg": 0.1, "spoilage_days": 30},
            {"item": "pesto", "typical_purchase_kg": 0.2, "typical_leftover_kg": 0.07, "spoilage_days": 7},
            {"item": "hummus", "typical_purchase_kg": 0.3, "typical_leftover_kg": 0.1, "spoilage_days": 7}
        ]
    
    def get_category(self, food_item):
        """Get the category for a food item"""
        for item_category, items in self.food_categories.items():
            if food_item in items:
                return item_category
        return "other"
    
    def get_storage_method(self, food_item):
        """Get recommended storage method for a food item"""
        category = self.get_category(food_item)
        if category in ["meat", "seafood", "dairy", "prepared_foods"]:
            return "refrigerated"
        elif category in ["fruits", "condiments"]:
            return "room_temperature"
        elif category in ["grains"]:
            return "pantry"
        return "refrigerated"
    
    def get_perishability_score(self, food_item):
        """Get perishability score (1-10 scale) for a food item"""
        for item in self.food_database:
            if item["item"] == food_item:
                return 10 - min(item["spoilage_days"], 10)
        return 5  # Default if not found
    
    def get_spoilage_days(self, food_item):
        """Get typical spoilage days for a food item"""
        for item in self.food_database:
            if item["item"] == food_item:
                return item["spoilage_days"]
        return 5  # Default if not found
    
    def get_leftover_rate(self, food_item):
        """Get typical leftover rate for a food item"""
        for item in self.food_database:
            if item["item"] == food_item:
                return item["typical_leftover_kg"] / item["typical_purchase_kg"]
        return 0.25  # Default if not found

In [3]:
class CO2FootprintCalculator:
    """Calculates CO2 footprint and impact for food items"""
    
    def __init__(self, co2_data_file=None):
        """
        Initialize CO2 calculator
        
        Args:
            co2_data_file (str): Path to CO2 data CSV file
        """
        self.co2_data = self.load_co2_data(co2_data_file)
        self.food_manager = None  # Will be set when integrated with FoodWasteAnalyzer
    
    def load_co2_data(self, filepath=None):
        """Load CO2 footprint data from CSV or use default values"""
        if filepath:
            try:
                co2_data = pd.read_csv(filepath)
                # Create dictionary mapping food names to CO2 values
                co2_dict = dict(zip(co2_data['Food_normalized'].str.lower(), co2_data['CO2_kg_per_kg']))
                return co2_dict
            except Exception as e:
                print(f"Error loading CO2 data: {e}")
                # Fall back to default values
        
        # Default CO2 values if file not provided or error occurred
        return {
            'beef': 27.0, 'lamb': 39.2, 'pork': 12.1, 'chicken': 6.9, 'fish': 5.4,
            'cheese': 13.5, 'milk': 3.2, 'yogurt': 2.2, 'eggs': 4.8, 'rice': 2.7,
            'bread': 1.4, 'potatoes': 0.3, 'vegetables': 0.4, 'fruits': 0.7,
            'chocolate': 8.4, 'coffee': 16.5, 'pasta': 1.5, 'tofu': 2.0
        }
    
    def get_co2_footprint(self, food_item, fallback=2.5):
        """Get CO2 footprint per kg for a specific food item"""
        # Try exact match
        if food_item.lower() in self.co2_data:
            return self.co2_data[food_item.lower()]
        
        # Try partial matches
        for food_name in self.co2_data:
            if food_item.lower() in food_name or food_name in food_item.lower():
                return self.co2_data[food_name]
        
        # Fall back to category average if food manager is available
        if self.food_manager:
            category = self.food_manager.get_category(food_item)
            category_values = []
            
            for food_name, value in self.co2_data.items():
                food_category = None
                for cat, items in self.food_manager.food_categories.items():
                    if any(item.lower() in food_name for item in items):
                        food_category = cat
                        break
                
                if food_category == category:
                    category_values.append(value)
            
            if category_values:
                return sum(category_values) / len(category_values)
        
        # Ultimate fallback
        return fallback
    
    def calculate_impact(self, food_item, quantity_kg):
        """Calculate CO2 impact if food item is wasted"""
        # Get CO2 footprint for this food
        co2_per_kg = self.get_co2_footprint(food_item)
        
        # Calculate production emissions
        production_co2 = quantity_kg * co2_per_kg
        
        # Add waste processing emissions
        waste_processing_co2 = quantity_kg * 0.21  # kg CO2e for waste processing
        
        # Total impact
        total_impact = production_co2 + waste_processing_co2
        
        return {
            'food_item': food_item,
            'quantity_kg': quantity_kg,
            'co2_per_kg': co2_per_kg,
            'production_co2': production_co2,
            'waste_processing_co2': waste_processing_co2,
            'total_impact': total_impact
        }

In [4]:
class FoodWasteRiskModel:
    """ML model to predict food waste risk levels"""
    
    def __init__(self, food_manager):
        """
        Initialize the food waste risk prediction model
        
        Args:
            food_manager (FoodDataManager): Food data manager instance
        """
        self.food_manager = food_manager
        self.model = None
        self.feature_columns = None
        self.classes_ = None
    
    def generate_training_data_without_leakage(self, samples_per_item=15):
        """Generate training data without including risk logic in features"""
        data = []
        
        for food_item_data in self.food_manager.food_database:
            item = food_item_data["item"]
            category = self.food_manager.get_category(item)
            base_spoilage = food_item_data["spoilage_days"]
            
            for _ in range(samples_per_item):
                # Add variability
                actual_spoilage = max(1, round(base_spoilage * random.uniform(0.8, 1.2)))
                current_age = random.randint(0, actual_spoilage + 3)
                storage_quality = random.uniform(0.9, 1.1)
                
                # Calculate effective age
                effective_age = current_age / storage_quality
                
                # Define risk category based on rules, but don't include these calculations as features
                remaining_life = max(0, 1 - (effective_age / actual_spoilage))
                
                if remaining_life <= 0.1:
                    risk = "high"
                elif remaining_life <= 0.4:
                    risk = "medium"
                else:
                    risk = "low"
                
                # Add some noise to the risk categories (5% chance of being different)
                if random.random() < 0.05:
                    risk = random.choice(["high", "medium", "low"])
                
                # Features that don't directly encode the target
                item_entry = {
                    "item": item,
                    "category": category,
                    "current_age_days": current_age,
                    "spoilage_days": actual_spoilage,
                    "storage_quality": storage_quality,
                    "quantity_kg": food_item_data["typical_leftover_kg"] * random.uniform(0.8, 1.2),
                    "perishability_score": 10 - min(actual_spoilage, 10),
                    "risk_category": risk
                }
                
                data.append(item_entry)
        
        return pd.DataFrame(data)
    
    def train_model_without_leakage(self, test_size=0.25):
        """Train model avoiding data leakage across food items"""
        # Generate data
        df = self.generate_training_data_without_leakage()
        
        # Get unique food items
        unique_items = df['item'].unique()
        
        # Split FOOD ITEMS (not individual rows) into train and test
        train_items, test_items = train_test_split(unique_items, test_size=test_size, random_state=42)
        
        print(f"Training on {len(train_items)} food items, testing on {len(test_items)} food items")
        print(f"Train items: {', '.join(sorted(train_items[:5]))}, ...")
        print(f"Test items: {', '.join(sorted(test_items[:5]))}, ...")
        
        # Now create train and test datasets based on these splits
        train_df = df[df['item'].isin(train_items)]
        test_df = df[df['item'].isin(test_items)]
        
        print(f"Training dataset: {train_df.shape[0]} examples")
        print(f"Testing dataset: {test_df.shape[0]} examples")
        
        # Define features that don't directly encode the target
        features = [
            'current_age_days',
            'spoilage_days', 
            'quantity_kg',
            'storage_quality',
            'category',
            'perishability_score'
        ]
        
        # Handle categorical features
        train_X = pd.get_dummies(train_df[features], columns=['category'])
        test_X = pd.get_dummies(test_df[features], columns=['category'])
        
        # Ensure test data has same columns as training data
        for col in train_X.columns:
            if col not in test_X.columns:
                test_X[col] = 0
        
        # Match column order
        test_X = test_X[train_X.columns]
        
        # Get target variables
        train_y = train_df['risk_category']
        test_y = test_df['risk_category']
        
        # Train model
        self.model = RandomForestClassifier(
            n_estimators=50,
            max_depth=8,
            min_samples_leaf=5,
            random_state=42
        )
        self.model.fit(train_X, train_y)
        self.feature_columns = train_X.columns
        self.classes_ = self.model.classes_
        
        # Evaluate
        train_accuracy = self.model.score(train_X, train_y)
        test_accuracy = self.model.score(test_X, test_y)
        
        print(f"\nTraining accuracy: {train_accuracy:.2f}")
        print(f"Testing accuracy: {test_accuracy:.2f}")
        print(f"Difference: {train_accuracy - test_accuracy:.2f}")
        
        # Detailed evaluation on test set
        y_pred = self.model.predict(test_X)
        print("\nClassification report on test data:")
        print(classification_report(test_y, y_pred))
        
        # Feature importance
        importances = pd.Series(self.model.feature_importances_, index=train_X.columns)
        print("\nTop 10 feature importances:")
        print(importances.sort_values(ascending=False).head(10))
        
        return self.model
    
    def predict_risk(self, food_item, age_days, quantity_kg, storage_quality=1.0, package_opened=True):
        """Predict risk level for a food item using model without leakage"""
        if self.model is None:
            raise ValueError("Model not trained. Call train_model_without_leakage() first.")
        
        # Get food item properties
        spoilage_days = self.food_manager.get_spoilage_days(food_item)
        category = self.food_manager.get_category(food_item)
        perishability_score = 10 - min(spoilage_days, 10)
        
        # Calculate remaining life (for heuristic comparison only)
        effective_age = age_days / storage_quality
        remaining_life_percent = max(0, 1 - (effective_age / spoilage_days))
        
        # Calculate heuristic risk (for comparison)
        if remaining_life_percent <= 0.1:  # Less than 10% life remaining
            explicit_risk = "high"
        elif remaining_life_percent <= 0.4:  # 10-40% life remaining
            explicit_risk = "medium"
        else:  # More than 40% life remaining
            explicit_risk = "low"
        
        # Prepare input data with non-leaking features
        input_data = {
            'current_age_days': [age_days],
            'spoilage_days': [spoilage_days],
            'quantity_kg': [quantity_kg],
            'storage_quality': [storage_quality],
            'perishability_score': [perishability_score]
        }
        
        # Handle categorical features
        # Create dummy variables for category
        for col in self.feature_columns:
            if col not in input_data:
                input_data[col] = [0]
        
        # Set the appropriate food category to 1
        category_col = f"category_{category}"
        if category_col in self.feature_columns:
            input_data[category_col] = [1]
        
        # Create DataFrame with the same structure as training data
        input_df = pd.DataFrame(input_data)
        input_df = input_df[self.feature_columns]  # Ensure same column order
        
        # Make prediction
        model_risk = self.model.predict(input_df)[0]
        probabilities = self.model.predict_proba(input_df)[0]
        
        # For insights: compare model prediction vs explicit calculation
        if model_risk != explicit_risk:
            print(f"Note: Model predicts {model_risk} while heuristic suggests {explicit_risk}")
            print(f"Item: {food_item}, Age: {age_days}, Spoilage days: {spoilage_days}")
            print(f"Remaining life: {remaining_life_percent:.2%}")
        
        return {
            'risk_level': model_risk,
            'probability': {cls: prob for cls, prob in zip(self.classes_, probabilities)},
            'item': food_item,
            'age_days': age_days,
            'spoilage_days': spoilage_days,
            'remaining_life_percent': remaining_life_percent,
            'heuristic_risk': explicit_risk
        }
    
    def visualize_decision_boundaries(self):
        """Visualize the decision boundaries of the model"""
        if self.model is None:
            raise ValueError("Model not trained. Call train_model() first.")
        
        # Generate a small dataset for visualization
        df = self.generate_training_data(samples_per_item=5)
        
        # Create a figure
        plt.figure(figsize=(12, 8))
        
        # Plot the data points
        colors = {'high': 'red', 'medium': 'orange', 'low': 'green'}
        
        # Plot remaining_life_percent vs. age_to_spoilage_ratio
        plt.subplot(1, 2, 1)
        for risk in colors:
            mask = df['risk_category'] == risk
            plt.scatter(
                df.loc[mask, 'remaining_life_percent'], 
                df.loc[mask, 'age_to_spoilage_ratio'],
                c=colors[risk], 
                label=risk,
                alpha=0.7
            )
        
        plt.axvline(x=0.1, color='gray', linestyle='--', alpha=0.7, label='High/Medium boundary')
        plt.axvline(x=0.4, color='gray', linestyle='--', alpha=0.7, label='Medium/Low boundary')
        
        plt.xlabel('Remaining Life Percent')
        plt.ylabel('Age to Spoilage Ratio')
        plt.title('Risk Classification by Key Features')
        plt.legend()
        plt.grid(True, alpha=0.3)
        
        # Plot days_until_spoilage vs. current_age_days
        plt.subplot(1, 2, 2)
        for risk in colors:
            mask = df['risk_category'] == risk
            plt.scatter(
                df.loc[mask, 'days_until_spoilage'], 
                df.loc[mask, 'current_age_days'],
                c=colors[risk], 
                label=risk,
                alpha=0.7
            )
        
        plt.xlabel('Days Until Spoilage')
        plt.ylabel('Current Age (days)')
        plt.title('Risk Classification by Time Features')
        plt.legend()
        plt.grid(True, alpha=0.3)
        
        plt.tight_layout()
        return plt.gcf()
    
    def feature_importance_plot(self):
        """Create a feature importance plot"""
        if self.model is None:
            raise ValueError("Model not trained. Call train_model() first.")
        
        # Get feature importances
        importances = pd.Series(
            self.model.feature_importances_, 
            index=self.feature_columns
        ).sort_values(ascending=False)
        
        # Create plot
        plt.figure(figsize=(10, 6))
        importances.head(10).plot(kind='bar')
        plt.title('Top 10 Feature Importances')
        plt.ylabel('Importance Score')
        plt.tight_layout()
        
        return plt.gcf()

In [5]:
class RecommendationEngine:
    """Generates recommendations based on food waste analysis"""
    
    def __init__(self, food_manager):
        """
        Initialize recommendation engine
        
        Args:
            food_manager (FoodDataManager): Food data manager instance
        """
        self.food_manager = food_manager
    
    def generate_recommendations(self, analysis_results):
        """Generate recommendations based on food waste risk analysis"""
        items = analysis_results['items']
        summary = analysis_results['summary']
        
        recommendations = {
            'priority_actions': [],
            'meal_suggestions': [],
            'storage_tips': [],
            'impact_summary': {}
        }
        
        # Priority actions for high-risk items
        high_risk_items = [item for item in items if item['risk_level'] == 'high']
        if high_risk_items:
            recommendations['priority_actions'].append(
                f"Use these high-risk items first: {', '.join([item['item'] for item in high_risk_items])}"
            )
            
            # Potential CO2 savings
            # Check if co2_impact_if_wasted exists, otherwise use a default calculation
            if 'co2_impact_if_wasted' in high_risk_items[0]:
                high_risk_co2 = sum(item['co2_impact_if_wasted'] for item in high_risk_items)
            else:
                # Fallback calculation if CO2 data not directly available
                high_risk_co2 = 0
                for item in high_risk_items:
                    # Estimate CO2 impact based on item properties
                    co2_per_kg = self._estimate_co2_per_kg(item['item'])
                    quantity_kg = item.get('quantity_kg', 0.2)  # Default if not available
                    high_risk_co2 += co2_per_kg * quantity_kg
            
            recommendations['impact_summary']['high_risk_co2_savings'] = high_risk_co2
            recommendations['impact_summary']['equivalent_car_km'] = high_risk_co2 * 6
            
            # Find compatible items for a meal
            food_groups = set(self.food_manager.get_category(item['item']) for item in high_risk_items)
            if 'vegetables' in food_groups and ('meat' in food_groups or 'plant_protein' in food_groups):
                recommendations['meal_suggestions'].append(
                    "You could make a stir-fry with your high-risk vegetables and protein"
                )
            elif 'vegetables' in food_groups and 'grains' in food_groups:
                recommendations['meal_suggestions'].append(
                    "Consider making a grain bowl with your vegetables and grains"
                )
            
            # Add recommendations based on remaining life percentage
            for item in high_risk_items:
                if 'remaining_life_percent' in item and item['remaining_life_percent'] < 0.05:
                    recommendations['priority_actions'].append(
                        f"Use {item['item']} TODAY - less than 5% of shelf life remains!"
                    )
        
        # Storage tips for medium-risk items
        medium_risk_items = [item for item in items if item['risk_level'] == 'medium']
        if medium_risk_items:
            for item in medium_risk_items:
                category = self.food_manager.get_category(item['item'])
                if category == 'vegetables':
                    recommendations['storage_tips'].append(
                        f"Store {item['item']} in a humid drawer in your refrigerator to extend freshness"
                    )
                elif category == 'fruits':
                    recommendations['storage_tips'].append(
                        f"Some fruits like {item['item']} last longer when stored outside the refrigerator"
                    )
                elif category == 'bread':
                    recommendations['storage_tips'].append(
                        f"Freeze part of your {item['item']} to prevent it from going stale"
                    )
        
        # Overall impact summary
        if 'total_co2_impact_kg' in summary:
            recommendations['impact_summary']['total_potential_co2_savings'] = summary['total_co2_impact_kg']
        else:
            # Calculate from items if not provided in summary
            total_co2 = sum(self._estimate_co2_per_kg(item['item']) * item.get('quantity_kg', 0.2) 
                           for item in items)
            recommendations['impact_summary']['total_potential_co2_savings'] = total_co2
            
        if 'equivalent_tree_days' in summary:
            recommendations['impact_summary']['tree_equivalent_days'] = summary['equivalent_tree_days']
        else:
            # Calculate if not provided
            total_co2 = recommendations['impact_summary']['total_potential_co2_savings']
            recommendations['impact_summary']['tree_equivalent_days'] = total_co2 * 50
        
        return recommendations
    
    def _estimate_co2_per_kg(self, food_item):
        """Estimate CO2 per kg for a food item if not directly available"""
        # Default values for common food categories
        category = self.food_manager.get_category(food_item)
        default_values = {
            'meat': 20.0,
            'seafood': 10.0,
            'dairy': 10.0,
            'vegetables': 2.0,
            'fruits': 1.5,
            'grains': 2.5,
            'prepared_foods': 5.0,
            'condiments': 2.0,
            'plant_protein': 3.0
        }
        
        return default_values.get(category, 3.0)  # Default if category not found

In [8]:
class FoodWasteAnalyzer:
    """Main class for analyzing food waste and generating recommendations"""
    
    def __init__(self, co2_data_file=None):
        """
        Initialize the food waste analyzer
        
        Args:
            co2_data_file (str): Path to CO2 data CSV file
        """
        # Initialize components
        self.food_manager = FoodDataManager()
        self.co2_calculator = CO2FootprintCalculator(co2_data_file)
        self.risk_model = FoodWasteRiskModel(self.food_manager)
        self.recommendation_engine = RecommendationEngine(self.food_manager)
        
        # Link components
        self.co2_calculator.food_manager = self.food_manager
    
    def train_model(self, use_leakage_prevention=True):
        """
        Train the risk prediction model
        
        Args:
            use_leakage_prevention (bool): Whether to use methods that prevent data leakage
        """
        if use_leakage_prevention:
            return self.risk_model.train_model_without_leakage()
        else:
            return self.risk_model.train_model()
    
    def analyze_leftovers(self, leftover_items, quantities_kg, ages_days):
        """
        Analyze leftover food for risk and CO2 impact
        
        Args:
            leftover_items: List of food items
            quantities_kg: List of quantities in kg
            ages_days: List of ages in days
            
        Returns:
            Dictionary with risk predictions and CO2 impact analysis
        """
        if self.risk_model.model is None:
            print("Training model first...")
            self.train_model(use_leakage_prevention=True)
        
        results = []
        total_co2_impact = 0
        high_risk_co2 = 0
        
        # Process each food item
        for food_item, quantity_kg, age_days in zip(leftover_items, quantities_kg, ages_days):
            # Get risk prediction
            risk_prediction = self.risk_model.predict_risk(
                food_item, age_days, quantity_kg
            )
            
            # Calculate CO2 impact
            co2_impact = self.co2_calculator.calculate_impact(food_item, quantity_kg)
            total_impact = co2_impact['total_impact']
            
            # Add to total
            total_co2_impact += total_impact
            
            # Track high risk CO2
            if risk_prediction['risk_level'] == 'high':
                high_risk_co2 += total_impact
            
            # Store result for this item
            item_result = {
                'item': food_item,
                'quantity_kg': quantity_kg,
                'age_days': age_days,
                'risk_level': risk_prediction['risk_level'],
                'risk_probabilities': risk_prediction['probability'],
                'co2_per_kg': co2_impact['co2_per_kg'],
                'co2_impact_if_wasted': total_impact,
                'co2_impact_percentage': 0  # Will calculate after summing
            }
            
            # Include additional information from risk prediction if available
            if 'remaining_life_percent' in risk_prediction:
                item_result['remaining_life_percent'] = risk_prediction['remaining_life_percent']
            if 'spoilage_days' in risk_prediction:
                item_result['spoilage_days'] = risk_prediction['spoilage_days']
            if 'heuristic_risk' in risk_prediction:
                item_result['heuristic_risk'] = risk_prediction['heuristic_risk']
            
            results.append(item_result)
        
        # Calculate percentage of total for each item
        for result in results:
            if total_co2_impact > 0:
                result['co2_impact_percentage'] = (result['co2_impact_if_wasted'] / total_co2_impact) * 100
        
        # Sort by risk level (high → medium → low) and then by CO2 impact
        results.sort(key=lambda x: (
            0 if x['risk_level'] == 'high' else 1 if x['risk_level'] == 'medium' else 2,
            -x['co2_impact_if_wasted']
        ))
        
        # Summary statistics
        summary = {
            'total_items': len(leftover_items),
            'total_co2_impact_kg': total_co2_impact,
            'high_risk_co2_impact_kg': high_risk_co2,
            'high_risk_percentage': (high_risk_co2 / total_co2_impact * 100) if total_co2_impact > 0 else 0,
            'equivalent_car_km': total_co2_impact * 6,  # Rough conversion: 1kg CO2e ≈ 6km in avg car
            'equivalent_tree_days': total_co2_impact * 50,  # ~50 days of a tree absorbing CO2
            'risk_distribution': {
                'high': sum(1 for r in results if r['risk_level'] == 'high'),
                'medium': sum(1 for r in results if r['risk_level'] == 'medium'),
                'low': sum(1 for r in results if r['risk_level'] == 'low')
            }
        }
        
        return {
            'items': results,
            'summary': summary
        }
    
    def generate_recommendations(self, analysis_results):
        """Generate recommendations based on food waste risk analysis"""
        return self.recommendation_engine.generate_recommendations(analysis_results)
    
    def demonstrate_analysis(self, leftover_items, quantities_kg, ages_days):
        """Demonstrate the food waste analysis system with the provided data"""
        # Analyze the leftovers
        print("\n===== FOOD WASTE RISK & CO2 IMPACT ANALYSIS =====")
        analysis = self.analyze_leftovers(leftover_items, quantities_kg, ages_days)
        
        # Generate recommendations
        recommendations = self.generate_recommendations(analysis)
        
        # Print results
        print(f"Total items analyzed: {len(leftover_items)}")
        print(f"Total potential CO2 impact: {analysis['summary']['total_co2_impact_kg']:.2f} kg CO2e")
        print(f"Equivalent to driving: {analysis['summary']['equivalent_car_km']:.1f} km")
        
        print("\nRisk distribution:")
        for risk, count in analysis['summary']['risk_distribution'].items():
            print(f"  - {risk}: {count} items")
        
        print("\nITEM BREAKDOWN:")
        print("{:<10} {:<10} {:<8} {:<10} {:<10}".format("Item", "Risk", "Age(days)", "Qty(kg)", "CO2 Impact"))
        print("-" * 60)
        for item in analysis['items']:
            # Include remaining life percentage if available
            if 'remaining_life_percent' in item:
                risk_info = f"{item['risk_level']} ({item['remaining_life_percent']:.0%})"
            else:
                risk_info = item['risk_level']
                
            print("{:<10} {:<12} {:<8} {:<10.2f} {:<10.2f}".format(
                item['item'][:10], 
                risk_info,
                item['age_days'],
                item['quantity_kg'],
                item['co2_impact_if_wasted']
            ))
        
        print("\nRECOMMENDATIONS:")
        if recommendations['priority_actions']:
            print("\nPriority Actions:")
            for action in recommendations['priority_actions']:
                print(f"  - {action}")
        
        if recommendations['meal_suggestions']:
            print("\nMeal Suggestions:")
            for suggestion in recommendations['meal_suggestions']:
                print(f"  - {suggestion}")
        
        if recommendations['storage_tips']:
            print("\nStorage Tips:")
            for tip in recommendations['storage_tips'][:2]:  # Limit to 2 tips for brevity
                print(f"  - {tip}")
        
        print("\nIMPACT SUMMARY:")
        impact = recommendations['impact_summary']
        print(f"By using your high-risk items, you could save {impact.get('high_risk_co2_savings', 0):.2f} kg CO2e")
        print(f"That's equivalent to driving {impact.get('equivalent_car_km', 0):.1f} km")
        
        return analysis, recommendations

In [11]:
# Initialize the analyzer
analyzer = FoodWasteAnalyzer('co2_footprint_cleaned.csv')

# Train the model with leakage prevention
analyzer.train_model(use_leakage_prevention=True)

# Sample leftover food in a household
leftover_items = ["chicken", "broccoli", "rice", "milk", "bread", "bananas", "cheese"]
quantities_kg = [0.3, 0.2, 0.5, 0.7, 0.4, 0.3, 0.15]
ages_days = [2, 3, 3, 4, 5, 4, 10]

# Now run the analysis
analyzer.demonstrate_analysis(leftover_items, quantities_kg, ages_days)

Training on 39 food items, testing on 14 food items
Train items: carrots, casserole, mushrooms, oranges, salmon, ...
Test items: avocados, butter, curry, quinoa, stew, ...
Training dataset: 585 examples
Testing dataset: 210 examples

Training accuracy: 0.88
Testing accuracy: 0.80
Difference: 0.08

Classification report on test data:
              precision    recall  f1-score   support

        high       0.78      0.93      0.85        84
         low       0.85      0.88      0.87        86
      medium       0.67      0.35      0.46        40

    accuracy                           0.80       210
   macro avg       0.77      0.72      0.73       210
weighted avg       0.79      0.80      0.78       210


Top 10 feature importances:
current_age_days           0.526085
perishability_score        0.129253
spoilage_days              0.127483
quantity_kg                0.078765
storage_quality            0.071256
category_seafood           0.012045
category_meat              0.010322
cat

({'items': [{'item': 'chicken',
    'quantity_kg': 0.3,
    'age_days': 2,
    'risk_level': 'high',
    'risk_probabilities': {'high': 0.5501413320082049,
     'low': 0.2920171130039551,
     'medium': 0.1578415549878398},
    'co2_per_kg': 4.702443553707975,
    'co2_impact_if_wasted': 1.4737330661123924,
    'co2_impact_percentage': 22.98247239566534,
    'remaining_life_percent': 0,
    'spoilage_days': 2,
    'heuristic_risk': 'high'},
   {'item': 'milk',
    'quantity_kg': 0.7,
    'age_days': 4,
    'risk_level': 'high',
    'risk_probabilities': {'high': 0.5914333917660332,
     'low': 0.16734715265277983,
     'medium': 0.24121945558118693},
    'co2_per_kg': 1.764553187302642,
    'co2_impact_if_wasted': 1.3821872311118495,
    'co2_impact_percentage': 21.554839621306694,
    'remaining_life_percent': 0.19999999999999996,
    'spoilage_days': 5,
    'heuristic_risk': 'medium'},
   {'item': 'rice',
    'quantity_kg': 0.5,
    'age_days': 3,
    'risk_level': 'high',
    'risk_