# AI-Powered Trash Bin Level Prediction System

## Project Overview
This project aims to develop machine learning models to predict when garbage bins will need to be emptied, optimizing waste collection operations and improving urban sanitation management.

## Dataset Features
- **BIN ID**: Unique identifier for each bin
- **Date & Time**: Date and time of day at which each reading was recorded
- **Fill Level**: Current fill level in liters
- **Fill Percentage**: Percentage of bin capacity filled
- **Location**: Geographic location
- **Temperature**: Ambient temperature in °C
- **Battery Level**: Sensor battery status
- **Target**: Fill Level Indicator (Above 550L) - binary label for classification (1 = full, 0 = not full)

## Objectives
1. Develop binary classification models to predict bin fill status
2. Evaluate model performance using multiple metrics
3. Implement route optimization for collection vehicles
4. Generate actionable insights for waste management

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Machine Learning libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Evaluation metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve
)

# Route optimization
from scipy.spatial.distance import pdist, squareform
from scipy.optimize import minimize

# Utilities
from datetime import datetime, timedelta
import joblib
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas / scikit-learn deprecation notices — consider narrowing it.
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)  # None = never truncate columns when printing
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("All libraries imported successfully!")

## 1. Data Loading and Exploration

In [None]:
# Read the raw export from the Excel workbook into a DataFrame
df = pd.read_excel('trash_data.xlsx')

# Report the size and the raw column headers before any cleaning
column_names = df.columns.tolist()
print(f"Dataset shape: {df.shape}")
print(f"\nColumn names: {column_names}")

# Per-column dtypes and non-null counts
df.info()

In [None]:
# Display first few rows
print("First 5 rows of the dataset:")
df.head()  # bare expression: only rendered when it is the last statement of a notebook cell

In [None]:
# Check for missing values: per-column null counts and their share of all rows
missing_values = df.isnull().sum()
missing_percentage = missing_values / len(df) * 100

# Keep only the columns that actually contain nulls, worst offenders first
missing_df = (
    pd.DataFrame({
        'Missing Count': missing_values,
        'Missing Percentage': missing_percentage,
    })
    .loc[lambda table: table['Missing Count'] > 0]
    .sort_values('Missing Count', ascending=False)
)

print("Missing Values Analysis:")
print(missing_df)

# Visualize missing values (skipped entirely when no column has nulls)
if len(missing_df) > 0:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(missing_df.index, missing_df['Missing Count'])
    ax.set_title('Missing Values by Column')
    ax.set_xlabel('Columns')
    ax.set_ylabel('Missing Count')
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()

## 2. Data Preprocessing and Feature Engineering

In [None]:
# Clean column names: trim whitespace, turn spaces into underscores and drop
# parentheses and the '⁰' glyph. regex=False makes every pattern a literal, so
# '(' and ')' can never be interpreted as regex metacharacters, whatever the
# installed pandas version uses as its default.
df.columns = (
    df.columns.str.strip()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.replace('⁰', '', regex=False)
)
print("Cleaned column names:")
print(df.columns.tolist())

# Handle missing values
df = df.dropna(subset=['FILL_LEVEL_INDICATORABOVE_550'])  # Remove rows with missing target

# Impute the two fill measurements with their column medians. The result is
# assigned back to `df` instead of calling `df[col].fillna(..., inplace=True)`:
# that chained in-place form operates on a temporary column object and does
# not update `df` when pandas Copy-on-Write is active.
df = df.fillna({
    'FILL_LEVELIN_LITRES': df['FILL_LEVELIN_LITRES'].median(),
    'FILL_PERCENTAGE': df['FILL_PERCENTAGE'].median(),
})

print(f"Dataset shape after cleaning: {df.shape}")

In [None]:
# Convert data types and extract features
df['Date'] = pd.to_datetime(df['Date'])

# Parse the time-of-day column exactly once and keep the parsed timestamps.
# The hour is derived from this parsed series further down: re-parsing
# df['TIME'] after it has been replaced with datetime.time objects raises,
# because pd.to_datetime cannot convert datetime.time values.
# NOTE(review): assumes TIME arrives as 'HH:MM:SS' strings — confirm against the workbook.
parsed_time = pd.to_datetime(df['TIME'], format='%H:%M:%S')
df['TIME'] = parsed_time.dt.time

# Extract temporal features
df['day_of_week'] = df['Date'].dt.dayofweek  # 0 = Monday ... 6 = Sunday
df['month'] = df['Date'].dt.month
df['day'] = df['Date'].dt.day
df['hour'] = parsed_time.dt.hour

# Convert coordinates and temperature to numeric; unparseable values become NaN
df['LATITUDE'] = pd.to_numeric(df['LATITUDE'], errors='coerce')
df['LONGITUDE'] = pd.to_numeric(df['LONGITUDE'], errors='coerce')
df['TEMPERATURE_IN_C'] = pd.to_numeric(df['TEMPERATURE_IN_C'], errors='coerce')

# Encode categorical variables as integer codes; the fitted encoders are kept
# so they can be stored alongside the model later
le_bin = LabelEncoder()
le_location = LabelEncoder()

df['BIN_ID_encoded'] = le_bin.fit_transform(df['BIN_ID'])
df['LOCATION_encoded'] = le_location.fit_transform(df['LOCATION_'])

print("Feature engineering completed!")
print("New features added: day_of_week, month, day, hour, BIN_ID_encoded, LOCATION_encoded")

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Target variable distribution.
# value_counts() orders its result by frequency, not by label, so the counts
# are re-indexed to the fixed label order [0, 1]. That keeps the positional
# colours and the pie labels ('Not Full', 'Full') attached to the right class
# even when full bins are the majority, and yields a zero count instead of a
# KeyError if one class is absent from the data.
class_labels = [0, 1]
target_counts = (
    df['FILL_LEVEL_INDICATORABOVE_550'].value_counts().reindex(class_labels, fill_value=0)
)
target_percentage = (
    df['FILL_LEVEL_INDICATORABOVE_550']
    .value_counts(normalize=True)
    .reindex(class_labels, fill_value=0.0)
    * 100
)

print("Target Variable Distribution:")
print(f"Not Full (0): {target_counts[0]} ({target_percentage[0]:.1f}%)")
print(f"Full (1): {target_counts[1]} ({target_percentage[1]:.1f}%)")

# Visualize target distribution
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Bar plot (bars follow the [0, 1] label order established above)
target_counts.plot(kind='bar', ax=ax1, color=['skyblue', 'orange'])
ax1.set_title('Bin Fill Level Distribution')
ax1.set_xlabel('Fill Status (0: Not Full, 1: Full)')
ax1.set_ylabel('Count')
ax1.tick_params(axis='x', rotation=0)

# Pie chart (wedges follow the same [0, 1] order, matching the labels)
ax2.pie(target_counts.values, labels=['Not Full', 'Full'], autopct='%1.1f%%', colors=['skyblue', 'orange'])
ax2.set_title('Bin Fill Level Percentage')

plt.tight_layout()
plt.show()

In [None]:
# Fill level distribution by location: share of readings flagged as full,
# ordered from the highest fill rate to the lowest
location_fill = (
    df.groupby('LOCATION_')['FILL_LEVEL_INDICATORABOVE_550']
    .mean()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(12, 6))
location_fill.plot(kind='bar', ax=ax, rot=45)
ax.set_title('Average Fill Rate by Location')
ax.set_xlabel('Location')
ax.set_ylabel('Average Fill Rate')
fig.tight_layout()
plt.show()

print("Top 5 locations with highest fill rates:")
print(location_fill.head())

In [None]:
# Temporal analysis: mean of the binary target (= share of "full" readings)
# grouped by weekday, hour, month and temperature band
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Fill rate by day of week
day_fill = df.groupby('day_of_week')['FILL_LEVEL_INDICATORABOVE_550'].mean()
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
# Align to all seven weekdays before plotting: if the data lacks a weekday,
# day_fill has fewer than 7 entries and bar(range(7), ...) would fail with a
# shape mismatch. Missing weekdays become NaN and simply draw no bar.
axes[0,0].bar(range(7), day_fill.reindex(range(7)).values)
axes[0,0].set_title('Fill Rate by Day of Week')
axes[0,0].set_xlabel('Day of Week')
axes[0,0].set_ylabel('Average Fill Rate')
axes[0,0].set_xticks(range(7))
axes[0,0].set_xticklabels(day_names)

# Fill rate by hour
hour_fill = df.groupby('hour')['FILL_LEVEL_INDICATORABOVE_550'].mean()
axes[0,1].plot(hour_fill.index, hour_fill.values, marker='o')
axes[0,1].set_title('Fill Rate by Hour of Day')
axes[0,1].set_xlabel('Hour')
axes[0,1].set_ylabel('Average Fill Rate')

# Fill rate by month
month_fill = df.groupby('month')['FILL_LEVEL_INDICATORABOVE_550'].mean()
axes[1,0].bar(month_fill.index, month_fill.values)
axes[1,0].set_title('Fill Rate by Month')
axes[1,0].set_xlabel('Month')
axes[1,0].set_ylabel('Average Fill Rate')

# Temperature vs Fill Rate: ten equal-width temperature bands.
# observed=False is spelled out so empty bands stay in the result (as NaN)
# and the x positions keep representing all ten bands in order, independent
# of the pandas default for categorical groupers.
temp_bins = pd.cut(df['TEMPERATURE_IN_C'], bins=10)
temp_fill = df.groupby(temp_bins, observed=False)['FILL_LEVEL_INDICATORABOVE_550'].mean()
axes[1,1].plot(range(len(temp_fill)), temp_fill.values, marker='o')
axes[1,1].set_title('Fill Rate by Temperature')
axes[1,1].set_xlabel('Temperature Bins')
axes[1,1].set_ylabel('Average Fill Rate')

plt.tight_layout()
plt.show()

In [None]:
# Correlation analysis over the numeric columns, target included
numeric_columns = ['FILL_LEVELIN_LITRES', 'TOTALLITRES', 'FILL_PERCENTAGE', 
                   'LATITUDE', 'LONGITUDE', 'TEMPERATURE_IN_C', 'BATTERY_LEVEL_',
                   'day_of_week', 'month', 'hour', 'FILL_LEVEL_INDICATORABOVE_550']

correlation_matrix = df[numeric_columns].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, 
            square=True, linewidths=0.5)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Print correlations with target variable, strongest first
target_correlations = correlation_matrix['FILL_LEVEL_INDICATORABOVE_550'].abs().sort_values(ascending=False)
print("\nCorrelations with Target Variable (absolute values):")
# Exclude the self-correlation by label rather than by position: slicing off
# the first entry would drop the wrong row if a feature ties with the target
# at |r| = 1 and happens to sort ahead of it.
print(target_correlations.drop('FILL_LEVEL_INDICATORABOVE_550'))

## 4. Feature Selection and Model Preparation

In [None]:
# Prepare features for modeling.
# NOTE(review): the target is described as "fill level above 550L" while
# FILL_LEVELIN_LITRES / FILL_PERCENTAGE are used as inputs — this looks like
# target leakage; confirm it is intended before trusting the metrics.
feature_columns = [
    # fill measurements
    'FILL_LEVELIN_LITRES', 'TOTALLITRES', 'FILL_PERCENTAGE',
    # sensor / environment
    'TEMPERATURE_IN_C', 'BATTERY_LEVEL_',
    # calendar features
    'day_of_week', 'month', 'hour',
    # label-encoded identifiers
    'BIN_ID_encoded', 'LOCATION_encoded',
]

# Independent copies so later edits never write back into df
X = df.loc[:, feature_columns].copy()
y = df.loc[:, 'FILL_LEVEL_INDICATORABOVE_550'].copy()

# Handle any remaining missing values with each column's median
column_medians = X.median()
X = X.fillna(column_medians)

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
print(f"\nFeatures used: {feature_columns}")

In [None]:
# Split the data: 80/20 hold-out, stratified on the target so both parts keep
# the same class balance; the fixed seed makes the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

train_balance = y_train.value_counts(normalize=True)
print("Training set class distribution:")
print(train_balance)

test_balance = y_test.value_counts(normalize=True)
print("\nTest set class distribution:")
print(test_balance)

In [None]:
# Scale features: statistics are learned from the training split only and
# then applied unchanged to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully!")
print(f"Scaled training set shape: {X_train_scaled.shape}")

## 5. Machine Learning Model Development

In [None]:
# Initialize models: display name -> unfitted estimator. Every estimator that
# takes a seed gets the same one so repeated runs are comparable.
seed = 42
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=seed),
    'Logistic Regression': LogisticRegression(random_state=seed, max_iter=1000),
    # probability=True so the SVM exposes predict_proba like the other models
    'SVM': SVC(random_state=seed, probability=True),
    'Decision Tree': DecisionTreeClassifier(random_state=seed),
    'Gradient Boosting': GradientBoostingClassifier(random_state=seed),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Naive Bayes': GaussianNB(),
}

print("Models initialized:")
for model_label in models:
    print(f"- {model_label}")

In [None]:
# Train and evaluate models. Three dictionaries keyed by model name collect
# the metrics, the test-set outputs and the fitted estimators.
results, predictions, trained_models = {}, {}, {}

# Models trained on standardized inputs; all others see the raw features
scaled_model_names = ['Logistic Regression', 'SVM', 'KNN', 'Naive Bayes']

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Pick the matching train/test representation once, then run one code path
    if name in scaled_model_names:
        fit_inputs, eval_inputs = X_train_scaled, X_test_scaled
    else:
        fit_inputs, eval_inputs = X_train, X_test

    model.fit(fit_inputs, y_train)
    y_pred = model.predict(eval_inputs)
    # Second column of predict_proba is used as the positive-class score
    y_pred_proba = model.predict_proba(eval_inputs)[:, 1]

    # Calculate metrics on the held-out test split
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)

    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'AUC': auc,
    }
    predictions[name] = {
        'predictions': y_pred,
        'probabilities': y_pred_proba,
    }
    trained_models[name] = model

    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}")

print("\nAll models trained successfully!")

In [None]:
# Create results comparison table: one row per model, one column per metric,
# rounded to four decimals and ordered by F1-Score (best first)
results_df = (
    pd.DataFrame.from_dict(results, orient='index')
    .round(4)
    .sort_values('F1-Score', ascending=False)
)

print("Model Performance Comparison:")
print(results_df)

# Visualize model comparison: one bar chart per metric on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC']

for ax, metric in zip(axes.flat, metrics):
    results_df[metric].plot(kind='bar', ax=ax, color='skyblue')
    ax.set_title(f'{metric} Comparison')
    ax.set_ylabel(metric)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y', alpha=0.3)

# Five metrics on six axes: remove the unused bottom-right subplot
axes[1, 2].remove()

plt.tight_layout()
plt.show()

# Identify best model: first row of the F1-sorted table
best_model_name = results_df.index[0]
print(f"\nBest performing model: {best_model_name}")
print(f"F1-Score: {results_df.loc[best_model_name, 'F1-Score']:.4f}")

## 6. Detailed Model Evaluation

In [None]:
# Confusion matrices for all models, laid out three per row.
# The grid is sized from the number of evaluated models instead of assuming
# exactly seven, so adding or removing entries in `models` needs no change
# here. With seven models this is the same 3x3 grid at figsize (15, 12).
n_models = len(predictions)
n_cols = 3
n_rows = max(1, -(-n_models // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
axes = axes.ravel()

for ax, (name, pred_data) in zip(axes, predictions.items()):
    cm = confusion_matrix(y_test, pred_data['predictions'])
    sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap='Blues')
    ax.set_title(f'{name} - Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

# Remove the unused trailing subplots of the last row
for ax in axes[n_models:]:
    ax.remove()

plt.tight_layout()
plt.show()

In [None]:
# ROC curves for all models, drawn on one shared axes for direct comparison
fig, ax = plt.subplots(figsize=(12, 8))

for name, pred_data in predictions.items():
    positive_scores = pred_data['probabilities']
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    auc_score = roc_auc_score(y_test, positive_scores)
    ax.plot(fpr, tpr, label=f'{name} (AUC = {auc_score:.3f})')

# Diagonal reference line for a classifier that guesses at random
ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves Comparison')
ax.legend()
ax.grid(alpha=0.3)
plt.show()

In [None]:
# Feature importance for tree-based models
tree_models = ['Random Forest', 'Decision Tree', 'Gradient Boosting']


def _importance_table(fitted_model):
    """Return a feature/importance table for `fitted_model`, most important first."""
    table = pd.DataFrame({
        'feature': feature_columns,
        'importance': fitted_model.feature_importances_
    })
    return table.sort_values('importance', ascending=False)


fig, axes = plt.subplots(1, 3, figsize=(18, 6))

for i, model_name in enumerate(tree_models):
    # Models that were never trained leave their panel empty
    if model_name not in trained_models:
        continue

    model = trained_models[model_name]
    feature_importance = _importance_table(model)
    positions = range(len(feature_importance))

    panel = axes[i]
    panel.barh(positions, feature_importance['importance'])
    panel.set_yticks(positions)
    panel.set_yticklabels(feature_importance['feature'])
    panel.set_title(f'{model_name} - Feature Importance')
    panel.set_xlabel('Importance')

plt.tight_layout()
plt.show()

# Print feature importance for best model (only tree-based models expose it here)
if best_model_name in tree_models:
    best_model = trained_models[best_model_name]
    feature_importance = _importance_table(best_model)

    print(f"\nFeature Importance - {best_model_name}:")
    print(feature_importance)

## 7. Route Optimization for Collection Vehicles

In [None]:
# Get bins that need collection (predicted as full) by joining the best
# model's test-set predictions back onto the original rows
test_indices = X_test.index
best_predictions = predictions[best_model_name]['predictions']

# Copy first so adding the prediction column never touches df itself
test_results = df.loc[test_indices].copy()
test_results['predicted_full'] = best_predictions

# Filter bins that need collection
needs_collection = test_results['predicted_full'] == 1
bins_to_collect = test_results.loc[needs_collection].copy()

n_tested = len(test_results)
n_flagged = len(bins_to_collect)
print(f"Total bins in test set: {len(test_results)}")
print(f"Bins predicted to need collection: {len(bins_to_collect)}")
print(f"Collection rate: {len(bins_to_collect)/len(test_results)*100:.1f}%")

# Show sample of bins to collect
if n_flagged > 0:
    print("\nSample bins requiring collection:")
    sample_cols = ['BIN_ID', 'LOCATION_', 'FILL_PERCENTAGE', 'LATITUDE', 'LONGITUDE']
    print(bins_to_collect[sample_cols].head(10))

In [None]:
# Route optimization using nearest neighbor heuristic
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the planar Euclidean distance between two coordinate pairs.

    Simplified for demo purposes: the inputs are treated as plain x/y values,
    so the result is in the same units as the inputs (degrees when given
    latitude/longitude), not a geographic distance.

    Args:
        lat1, lon1: Coordinates of the first point.
        lat2, lon2: Coordinates of the second point. Scalars or NumPy arrays;
            arrays are broadcast, giving one distance per element.

    Returns:
        The distance as a NumPy float, or an array of distances.
    """
    return np.sqrt((lat2 - lat1)**2 + (lon2 - lon1)**2)


def optimize_route(bins_df, start_location=None):
    """Order bins with a greedy nearest-neighbour heuristic.

    Starting from one bin, the route repeatedly moves to the closest bin that
    has not been visited yet. This is a heuristic, not an optimal tour.

    Coordinates are copied into NumPy arrays once and bins are tracked by row
    position, which avoids a pandas `.loc` lookup per candidate per step,
    makes tie-breaking deterministic (the earliest row wins) and keeps the
    function working when index labels are duplicated.

    Args:
        bins_df: DataFrame with numeric 'LATITUDE' and 'LONGITUDE' columns.
            Rows are expected to have valid (non-NaN) coordinates.
        start_location: Optional (latitude, longitude) pair. When given, the
            route starts at the bin closest to it; otherwise at the first row.

    Returns:
        Tuple (route, total_distance): `route` is the list of index labels of
        `bins_df` in visiting order, `total_distance` the summed distance
        between consecutive bins. The leg from `start_location` to the first
        bin is not included. An empty frame yields ([], 0).
    """
    if len(bins_df) == 0:
        return [], 0

    lats = bins_df['LATITUDE'].to_numpy(dtype=float)
    lons = bins_df['LONGITUDE'].to_numpy(dtype=float)

    # Choose the starting row
    if start_location is None:
        current = 0
    else:
        from_start = calculate_distance(start_location[0], start_location[1], lats, lons)
        current = int(np.argmin(from_start))

    unvisited = np.ones(len(lats), dtype=bool)
    order = []
    total_distance = 0

    while True:
        order.append(current)
        unvisited[current] = False

        remaining = np.flatnonzero(unvisited)
        if remaining.size == 0:
            break

        # Distances from the current bin to every bin still to be visited
        step = calculate_distance(lats[current], lons[current],
                                  lats[remaining], lons[remaining])
        nearest = int(np.argmin(step))
        total_distance += float(step[nearest])
        current = int(remaining[nearest])

    # Translate row positions back into the caller's index labels
    route = bins_df.index[order].tolist()
    return route, total_distance

# Optimize route for bins that need collection
if len(bins_to_collect) == 0:
    print("No bins predicted to need collection")
else:
    # Rows without a usable position cannot be routed
    valid_bins = bins_to_collect.dropna(subset=['LATITUDE', 'LONGITUDE'])

    if len(valid_bins) == 0:
        print("No bins with valid coordinates for route optimization")
    else:
        print(f"\nOptimizing route for {len(valid_bins)} bins with valid coordinates...")

        # Visiting order (index labels) and the summed hop distance
        route_indices, total_distance = optimize_route(valid_bins)

        print(f"Total route distance: {total_distance:.4f} units")
        print(f"Average distance per bin: {total_distance/len(route_indices):.4f} units")

        # Rows re-ordered along the route, numbered from 1
        route_df = valid_bins.loc[route_indices].copy()
        route_df['route_order'] = range(1, len(route_df) + 1)

        print("\nOptimized collection route (first 10 stops):")
        route_cols = ['route_order', 'BIN_ID', 'LOCATION_', 'FILL_PERCENTAGE', 'LATITUDE', 'LONGITUDE']
        print(route_df[route_cols].head(10))

In [None]:
# Visualize bins and route on map
def _bin_marker_trace(frame, color, size, label):
    """Build a marker-only scatter trace (longitude on x, latitude on y) for `frame`."""
    return go.Scatter(
        x=frame['LONGITUDE'],
        y=frame['LATITUDE'],
        mode='markers',
        marker=dict(color=color, size=size),
        name=label,
        text=frame['BIN_ID'],
        hovertemplate='Bin: %{text}<br>Fill: %{customdata:.1f}%<extra></extra>',
        customdata=frame['FILL_PERCENTAGE']
    )


if len(bins_to_collect) == 0 or len(valid_bins) == 0:
    print("No data available for route visualization")
else:
    fig = go.Figure()

    # Bins predicted as not full, limited to those with usable coordinates
    normal_bins = test_results[test_results['predicted_full'] == 0]
    normal_bins_valid = normal_bins.dropna(subset=['LATITUDE', 'LONGITUDE'])
    if not normal_bins_valid.empty:
        fig.add_trace(_bin_marker_trace(normal_bins_valid, 'green', 6, 'Normal Bins'))

    # Bins flagged for collection
    fig.add_trace(_bin_marker_trace(valid_bins, 'red', 10, 'Bins to Collect'))

    # Connect the flagged bins in visiting order (needs at least two stops)
    if len(route_indices) > 1:
        route_points = valid_bins.loc[route_indices]
        route_lats = route_points['LATITUDE'].tolist()
        route_lons = route_points['LONGITUDE'].tolist()

        route_line = go.Scatter(
            x=route_lons,
            y=route_lats,
            mode='lines',
            line=dict(color='blue', width=2),
            name='Optimized Route',
            hoverinfo='skip'
        )
        fig.add_trace(route_line)

    fig.update_layout(
        title='Trash Bin Collection Route Optimization',
        xaxis_title='Longitude',
        yaxis_title='Latitude',
        hovermode='closest',
        width=800,
        height=600
    )

    fig.show()

## 8. Model Deployment and Predictions

In [None]:
# Save the best model and preprocessing objects
import joblib

# Bundle everything a later prediction needs into a single object: the fitted
# estimator, the scaler, the feature order, both label encoders and the
# model's name and test metrics.
fitted_encoders = {
    'bin_id': le_bin,
    'location': le_location
}
best_model_metrics = results[best_model_name]

model_artifacts = {
    'model': trained_models[best_model_name],
    'scaler': scaler,
    'feature_columns': feature_columns,
    'label_encoders': fitted_encoders,
    'model_name': best_model_name,
    'performance_metrics': best_model_metrics
}

# Save model artifacts to disk
joblib.dump(model_artifacts, 'trash_bin_model.pkl')

print("Model artifacts saved successfully!")
print(f"Best model: {best_model_name}")
print(f"Model performance: {results[best_model_name]}")

In [None]:
# Create prediction function
def predict_bin_status(bin_data, model_artifacts):
    """
    Predict if a trash bin needs collection

    The features are assembled into a one-row DataFrame whose columns are the
    training feature names, in training order. Passing a bare NumPy array to a
    scaler/model that was fitted on a named DataFrame triggers scikit-learn's
    feature-name mismatch warning and bypasses its column-name validation.

    Args:
        bin_data: Dictionary with bin information, keyed by feature name.
            Features missing from the dictionary default to 0.
        model_artifacts: Loaded model artifacts; the keys 'model', 'scaler',
            'feature_columns' and 'model_name' are read.

    Returns:
        Dictionary with 'needs_collection' (bool), 'probability' (float, the
        second column of predict_proba) and 'model_used' (model name).
    """
    model = model_artifacts['model']
    scaler = model_artifacts['scaler']
    feature_columns = model_artifacts['feature_columns']
    model_name = model_artifacts['model_name']

    # Create feature vector (0 is the default value for missing features)
    row = [bin_data.get(col, 0) for col in feature_columns]
    features_frame = pd.DataFrame([row], columns=feature_columns)

    # Scale features if needed: these models were trained on scaled inputs,
    # all others on the raw feature frame
    if model_name in ['Logistic Regression', 'SVM', 'KNN', 'Naive Bayes']:
        model_input = scaler.transform(features_frame)
    else:
        model_input = features_frame

    prediction = model.predict(model_input)[0]
    probability = model.predict_proba(model_input)[0][1]

    return {
        'needs_collection': bool(prediction),
        'probability': float(probability),
        'model_used': model_name
    }

# Test the prediction function with one hand-written reading.
# The keys must match the names in `feature_columns`; predict_bin_status
# substitutes 0 for any feature that is left out.
sample_bin = {
    'FILL_LEVELIN_LITRES': 600,
    'TOTALLITRES': 1000,
    'FILL_PERCENTAGE': 60,
    'TEMPERATURE_IN_C': 25,
    'BATTERY_LEVEL_': 0.8,
    'day_of_week': 1,  # 0 = Monday, so 1 = Tuesday (pandas dayofweek convention)
    'month': 11,
    'hour': 14,
    'BIN_ID_encoded': 1,  # integer codes as produced by the label encoders
    'LOCATION_encoded': 0
}

prediction_result = predict_bin_status(sample_bin, model_artifacts)
print("\nSample prediction:")
print(f"Bin data: {sample_bin}")
print(f"Prediction: {prediction_result}")

## 9. Business Insights and Recommendations

In [None]:
# Generate business insights: a printed report assembled from the model
# metrics and simple aggregations over the cleaned dataset
print("=" * 60)
print("BUSINESS INSIGHTS AND RECOMMENDATIONS")
print("=" * 60)

# Model performance insights
print(f"\n1. MODEL PERFORMANCE")
print(f"   - Best performing model: {best_model_name}")
print(f"   - Accuracy: {results[best_model_name]['Accuracy']:.1%}")
print(f"   - Precision: {results[best_model_name]['Precision']:.1%}")
print(f"   - Recall: {results[best_model_name]['Recall']:.1%}")
print(f"   - F1-Score: {results[best_model_name]['F1-Score']:.1%}")

# Data insights
print(f"\n2. DATA INSIGHTS")
print(f"   - Dataset size: {len(df):,} records")
print(f"   - Bin fill rate: {df['FILL_LEVEL_INDICATORABOVE_550'].mean():.1%}")
print(f"   - Number of unique bins: {df['BIN_ID'].nunique()}")
print(f"   - Number of locations: {df['LOCATION_'].nunique()}")
print(f"   - Date range: {df['Date'].min().strftime('%Y-%m-%d')} to {df['Date'].max().strftime('%Y-%m-%d')}")

# Top locations by fill rate
top_locations = df.groupby('LOCATION_')['FILL_LEVEL_INDICATORABOVE_550'].agg(['mean', 'count']).sort_values('mean', ascending=False)
print(f"\n3. HIGH-PRIORITY LOCATIONS (Top 5)")
# iterrows() returns each row as a single-dtype Series, so the integer count
# is upcast to float next to the float mean; cast it back so the report shows
# "12 records" rather than "12.0 records".
for rank, (location, stats) in enumerate(top_locations.head().iterrows(), start=1):
    print(f"   {rank}. {location}: {stats['mean']:.1%} fill rate ({int(stats['count'])} records)")

# Temporal patterns
day_fill = df.groupby('day_of_week')['FILL_LEVEL_INDICATORABOVE_550'].mean()
peak_day = day_fill.idxmax()
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

hour_fill = df.groupby('hour')['FILL_LEVEL_INDICATORABOVE_550'].mean()
peak_hour = hour_fill.idxmax()

print(f"\n4. TEMPORAL PATTERNS")
print(f"   - Peak fill day: {day_names[peak_day]} ({day_fill[peak_day]:.1%} fill rate)")
print(f"   - Peak fill hour: {peak_hour}:00 ({hour_fill[peak_hour]:.1%} fill rate)")

# Environmental factors: off-diagonal entry of each 2x2 correlation matrix
temp_corr = df[['TEMPERATURE_IN_C', 'FILL_LEVEL_INDICATORABOVE_550']].corr().iloc[0,1]
battery_corr = df[['BATTERY_LEVEL_', 'FILL_LEVEL_INDICATORABOVE_550']].corr().iloc[0,1]

print(f"\n5. ENVIRONMENTAL FACTORS")
print(f"   - Temperature correlation with fill level: {temp_corr:.3f}")
print(f"   - Battery level correlation with fill level: {battery_corr:.3f}")

# Collection efficiency (only when the route cell produced a route; at the
# top level of a cell locals() is the notebook's global namespace)
if 'route_df' in locals() and len(route_df) > 0:
    avg_fill = route_df['FILL_PERCENTAGE'].mean()
    print(f"\n6. COLLECTION EFFICIENCY")
    print(f"   - Bins requiring collection: {len(bins_to_collect)}")
    print(f"   - Average fill percentage of flagged bins: {avg_fill:.1f}%")
    print(f"   - Estimated route distance: {total_distance:.4f} units")

print(f"\n7. RECOMMENDATIONS")
print(f"   - Deploy {best_model_name} model for real-time predictions")
print(f"   - Focus collection efforts on high-priority locations")
print(f"   - Schedule collections during peak hours: {peak_hour}:00")
print(f"   - Implement route optimization to reduce travel distance")
print(f"   - Monitor sensor battery levels for maintenance")
print(f"   - Consider temperature effects on waste generation")

print("\n" + "=" * 60)

## 10. Model Summary and Export

In [None]:
# Create comprehensive model summary, assembled section by section.
# Values are converted to plain Python numbers before being written out.
first_date = df['Date'].min().strftime('%Y-%m-%d')
last_date = df['Date'].max().strftime('%Y-%m-%d')

project_info = {
    'title': 'AI-Powered Trash Bin Level Prediction System',
    'dataset_size': len(df),
    'features_used': len(feature_columns),
    'date_range': f"{first_date} to {last_date}"
}

best_model_info = {
    'name': best_model_name,
    'performance': results[best_model_name],
    'features': feature_columns
}

data_insights = {
    'bin_fill_rate': float(df['FILL_LEVEL_INDICATORABOVE_550'].mean()),
    'unique_bins': int(df['BIN_ID'].nunique()),
    'unique_locations': int(df['LOCATION_'].nunique()),
    'peak_day': int(peak_day),
    'peak_hour': int(peak_hour)
}

# The route cells may not have defined these names; fall back to 0 then
route_optimization = {
    'bins_to_collect': len(bins_to_collect) if 'bins_to_collect' in locals() else 0,
    'total_distance': float(total_distance) if 'total_distance' in locals() else 0
}

model_summary = {
    'project_info': project_info,
    'best_model': best_model_info,
    'all_models_performance': results,
    'data_insights': data_insights,
    'route_optimization': route_optimization
}

# Save summary to JSON
import json
with open('model_summary.json', 'w') as f:
    json.dump(model_summary, f, indent=2)

print("Model summary saved to 'model_summary.json'")
print("\nProject completed successfully!")

print("\nDeliverables created:")
print("1. trash_bin_ml_analysis.ipynb - This Jupyter notebook with complete analysis")
print("2. trash_data.xlsx - Dataset used for the project")
print("3. trash_bin_model.pkl - Trained model artifacts")
print("4. model_summary.json - Comprehensive model summary")

print("\nNext steps:")
print("- Create README.md file with project aims and results")
print("- Generate detailed PDF report")
print("- Deploy model for real-time predictions")