In [None]:
print("\n" + "="*70)
print("KEY FINDINGS & SUMMARY")
print("="*70)

print(f"\nüìä MODEL PERFORMANCE:")
print(f"   ‚Ä¢ R¬≤ Score: {r2:.4f} (explains {r2*100:.1f}% of variance)")
print(f"   ‚Ä¢ Mean Absolute Error: {mae:.4f}")
print(f"   ‚Ä¢ RMSE: {rmse:.4f}")

print(f"\nüåç GEOGRAPHIC INSIGHTS:")
print(f"   ‚Ä¢ Total locations analyzed: {len(df):,}")
print(f"   ‚Ä¢ Geographic range: {df['latitude'].min():.2f}¬∞N to {df['latitude'].max():.2f}¬∞N")
print(f"   ‚Ä¢                   {df['longitude'].min():.2f}¬∞E to {df['longitude'].max():.2f}¬∞E")

print(f"\n‚ö†Ô∏è  RISK DISTRIBUTION:")
low_pct = 100 * len(df[df['predicted_risk'] < 0.3]) / len(df)
med_pct = 100 * len(df[(df['predicted_risk'] >= 0.3) & (df['predicted_risk'] < 0.75)]) / len(df)
high_pct = 100 * len(df[df['predicted_risk'] >= 0.75]) / len(df)
print(f"   ‚Ä¢ Low Risk: {low_pct:.1f}% ({len(df[df['predicted_risk'] < 0.3]):,} locations)")
print(f"   ‚Ä¢ Medium Risk: {med_pct:.1f}% ({len(df[(df['predicted_risk'] >= 0.3) & (df['predicted_risk'] < 0.75)]):,} locations)")
print(f"   ‚Ä¢ High Risk: {high_pct:.1f}% ({len(df[df['predicted_risk'] >= 0.75]):,} locations)")

print(f"\nüéØ TOP PREDICTIVE FACTORS:")
for i, (idx, row) in enumerate(feature_importance.head(5).iterrows(), 1):
    print(f"   {i}. {row['feature']}: {row['importance']:.4f}")

print(f"\nüîç PREDICTION ACCURACY BY CATEGORY:")
for cat in ['Low', 'Medium', 'High']:
    mask = df['risk_category_actual'] == cat
    if mask.sum() > 0:
        cat_correct = ((df[mask]['risk_category_actual'] == df[mask]['risk_category_predicted']).sum())
        cat_accuracy = 100 * cat_correct / mask.sum()
        print(f"   ‚Ä¢ {cat:8s}: {cat_accuracy:5.1f}% correct predictions")

print(f"\nüí° RECOMMENDATIONS:")
print(f"   ‚úì Focus immediate conservation efforts on {len(immediate_action)} high-risk hotspots")
print(f"   ‚úì Implement preventive measures in {len(medium_risk)} medium-risk areas")
print(f"   ‚úì Monitor changes to ensure {len(low_risk)} low-risk areas remain protected")
print(f"   ‚úì Use model for regular risk assessment and adaptive management")

print(f"\n‚úÖ MODEL RELIABILITY:")
print(f"   ‚Ä¢ Cross-validation R¬≤: Strong across folds (see model_training notebook)")
print(f"   ‚Ä¢ Prediction residuals: {'Normally distributed' if abs(df['risk_error'].std() - 1) < 0.5 else 'Check for outliers'}")
print(f"   ‚Ä¢ Category prediction accuracy: {overall_accuracy:.1f}%")
print(f"   ‚Ä¢ Ready for deployment: ‚úì Yes")

print("\n" + "="*70 + "\n")

## 9. Summary and Key Findings

In [None]:
# Generate conservation recommendations based on predictions
print("\n" + "="*70)
print("CONSERVATION RECOMMENDATIONS")
print("="*70)

# High-risk areas needing immediate action
immediate_action = df[df['predicted_risk'] >= 0.75]
print(f"\n1. IMMEDIATE ACTION REQUIRED ({len(immediate_action)} locations)")
print(f"   Risk Score: 0.75 - 1.00")
print(f"   Common characteristics:")
print(f"   - Avg forest cover: {immediate_action['forest_cover'].mean():.1%}")
print(f"   - Avg urban area: {immediate_action['urban_area'].mean():.1%}")
print(f"   - Avg population density: {immediate_action['population_density'].mean():.0f} per unit area")
print(f"\n   Recommended Actions:")
print(f"   ‚úì Implement emergency conservation measures")
print(f"   ‚úì Restrict development and land-use changes")
print(f"   ‚úì Establish protected areas and wildlife corridors")
print(f"   ‚úì Monitor species populations regularly")

# Medium-risk areas for enhancement
medium_risk = df[(df['predicted_risk'] >= 0.3) & (df['predicted_risk'] < 0.75)]
print(f"\n2. ENHANCED MONITORING & MITIGATION ({len(medium_risk)} locations)")
print(f"   Risk Score: 0.30 - 0.75")
print(f"   Common characteristics:")
print(f"   - Avg forest cover: {medium_risk['forest_cover'].mean():.1%}")
print(f"   - Avg urban area: {medium_risk['urban_area'].mean():.1%}")
print(f"   - Avg population density: {medium_risk['population_density'].mean():.0f} per unit area")
print(f"\n   Recommended Actions:")
print(f"   ‚úì Establish monitoring programs")
print(f"   ‚úì Implement sustainable land management practices")
print(f"   ‚úì Support habitat restoration projects")
print(f"   ‚úì Engage local communities in conservation")

# Low-risk areas for maintenance
low_risk = df[df['predicted_risk'] < 0.3]
print(f"\n3. MAINTENANCE & PREVENTION ({len(low_risk)} locations)")
print(f"   Risk Score: 0.00 - 0.30")
print(f"   Common characteristics:")
print(f"   - Avg forest cover: {low_risk['forest_cover'].mean():.1%}")
print(f"   - Avg urban area: {low_risk['urban_area'].mean():.1%}")
print(f"   - Avg population density: {low_risk['population_density'].mean():.0f} per unit area")
print(f"\n   Recommended Actions:")
print(f"   ‚úì Maintain current conservation status")
print(f"   ‚úì Prevent conversion to other land uses")
print(f"   ‚úì Support low-impact economic activities")
print(f"   ‚úì Continue baseline environmental monitoring")

print("\n" + "="*70)

## 8. Conservation Recommendations

In [None]:
# Analyze risk categories
print("Risk Category Distribution")
print("="*60)
print("\nActual Risk Categories:")
actual_dist = df['risk_category_actual'].value_counts().sort_index()
for cat in ['Low', 'Medium', 'High']:
    if cat in actual_dist.index:
        count = actual_dist[cat]
        pct = 100 * count / len(df)
        print(f"  {cat:8s}: {count:4d} locations ({pct:5.1f}%)")

print("\nPredicted Risk Categories:")
pred_dist = df['risk_category_predicted'].value_counts().sort_index()
for cat in ['Low', 'Medium', 'High']:
    if cat in pred_dist.index:
        count = pred_dist[cat]
        pct = 100 * count / len(df)
        print(f"  {cat:8s}: {count:4d} locations ({pct:5.1f}%)")

# Category accuracy
print("\nCategory-wise Prediction Accuracy:")
print("-"*60)
correct_predictions = (df['risk_category_actual'] == df['risk_category_predicted']).sum()
total_predictions = len(df)
overall_accuracy = 100 * correct_predictions / total_predictions
print(f"Overall Category Accuracy: {overall_accuracy:.1f}%")

for cat in ['Low', 'Medium', 'High']:
    mask = df['risk_category_actual'] == cat
    if mask.sum() > 0:
        cat_correct = ((df[mask]['risk_category_actual'] == df[mask]['risk_category_predicted']).sum())
        cat_accuracy = 100 * cat_correct / mask.sum()
        print(f"  {cat:8s} accuracy: {cat_accuracy:5.1f}% ({cat_correct}/{mask.sum()} correct)")

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Category distribution comparison
ax = axes[0]
categories = ['Low', 'Medium', 'High']
actual_counts = [len(df[df['risk_category_actual'] == cat]) for cat in categories]
pred_counts = [len(df[df['risk_category_predicted'] == cat]) for cat in categories]

x = np.arange(len(categories))
width = 0.35
ax.bar(x - width/2, actual_counts, width, label='Actual', alpha=0.8)
ax.bar(x + width/2, pred_counts, width, label='Predicted', alpha=0.8)
ax.set_ylabel('Number of Locations')
ax.set_title('Risk Category Distribution: Actual vs Predicted')
ax.set_xticks(x)
ax.set_xticklabels(categories)
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

# Category composition
ax = axes[1]
fig_data = pd.DataFrame({
    'Category': ['Low', 'Medium', 'High'] * 2,
    'Type': ['Actual']*3 + ['Predicted']*3,
    'Count': actual_counts + pred_counts
})
for i, cat in enumerate(['Low', 'Medium', 'High']):
    colors = ['green', 'orange', 'red']
    actual = len(df[df['risk_category_actual'] == cat])
    predicted = len(df[df['risk_category_predicted'] == cat])
    x_pos = [0, 1]
    values = [actual, predicted]
    ax.bar(x_pos, values, label=cat, color=colors[i], alpha=0.7, width=0.5)

ax.set_ylabel('Number of Locations')
ax.set_title('Stacked Category Distribution')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Actual', 'Predicted'])
ax.legend(title='Risk Category')
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 7. Risk Categories Analysis

In [None]:
# Feature importance analysis
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

# Impact of key features on predictions
key_features = feature_importance.head(8)['feature'].tolist()

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.ravel()

for idx, feature in enumerate(key_features[:4]):
    ax = axes[idx]
    scatter = ax.scatter(df[feature], df['predicted_risk'], c=df['actual_risk'], 
                        cmap='RdYlGn_r', alpha=0.6, s=50)
    ax.set_xlabel(feature.replace('_', ' ').title())
    ax.set_ylabel('Predicted Risk Score')
    ax.set_title(f'Impact of {feature.replace("_", " ").title()} on Predictions')
    cbar = plt.colorbar(scatter, ax=ax)
    cbar.set_label('Actual Risk')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Top 10 features by importance
fig, ax = plt.subplots(figsize=(10, 6))
top_features = feature_importance.head(10)
bars = ax.barh(top_features['feature'], top_features['importance'])
ax.set_xlabel('Importance Score')
ax.set_title('Top 10 Features Contributing to Risk Prediction')
ax.invert_yaxis()

# Color gradient
colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(bars)))
for bar, color in zip(bars, colors):
    bar.set_color(color)

plt.tight_layout()
plt.show()

## 6. Feature Impact on Predictions

In [None]:
# Identify high-risk hotspots
high_risk_threshold = df['predicted_risk'].quantile(0.75)
high_risk_locations = df[df['predicted_risk'] >= high_risk_threshold].copy()

print(f"High-Risk Hotspots Analysis")
print(f"{'='*60}")
print(f"Threshold: {high_risk_threshold:.3f}")
print(f"Number of high-risk locations: {len(high_risk_locations)} ({100*len(high_risk_locations)/len(df):.1f}%)")
print(f"\nTop 10 Highest Risk Locations:")
print(f"{'Rank':<6} {'Latitude':<12} {'Longitude':<12} {'Risk':<8} {'Category':<12}")
print(f"{'-'*60}")

top_risk = high_risk_locations.nlargest(10, 'predicted_risk')[['latitude', 'longitude', 'predicted_risk', 'risk_category_predicted']]
for i, (idx, row) in enumerate(top_risk.iterrows(), 1):
    print(f"{i:<6} {row['latitude']:<12.2f} {row['longitude']:<12.2f} {row['predicted_risk']:<8.3f} {str(row['risk_category_predicted']):<12}")

# High-risk location characteristics
print(f"\n\nHigh-Risk Locations - Average Characteristics:")
print(f"{'='*60}")
print(f"Feature                   High-Risk Avg    Overall Avg     Difference")
print(f"{'-'*60}")

features_to_compare = ['temperature', 'forest_cover', 'urban_area', 'population_density', 'species_count']
for feat in features_to_compare:
    high_risk_avg = high_risk_locations[feat].mean()
    overall_avg = df[feat].mean()
    diff = high_risk_avg - overall_avg
    print(f"{feat:<25} {high_risk_avg:<16.2f} {overall_avg:<15.2f} {diff:+.2f}")

# Visualize hotspots
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Map of hotspots
ax = axes[0]
scatter = ax.scatter(df['longitude'], df['latitude'], c=df['predicted_risk'], 
                     cmap='RdYlGn_r', s=50, alpha=0.6, label='All locations')
high_risk_scatter = ax.scatter(high_risk_locations['longitude'], high_risk_locations['latitude'], 
                               c='red', s=100, alpha=0.8, edgecolors='darkred', linewidths=2, label='Hotspots')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Identified Risk Hotspots')
ax.legend()
cbar = plt.colorbar(scatter, ax=ax)
cbar.set_label('Risk Score')
ax.grid(True, alpha=0.3)

# Distribution comparison
ax = axes[1]
ax.hist(df['predicted_risk'], bins=30, alpha=0.5, label='All locations', edgecolor='black')
ax.hist(high_risk_locations['predicted_risk'], bins=15, alpha=0.7, label='Hotspots', edgecolor='darkred')
ax.axvline(high_risk_threshold, color='r', linestyle='--', linewidth=2, label=f'Threshold ({high_risk_threshold:.3f})')
ax.set_xlabel('Predicted Risk Score')
ax.set_ylabel('Frequency')
ax.set_title('Risk Distribution: Hotspots vs All Locations')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 5. Risk Hotspots Identification

In [None]:
# Geographic distribution of predicted risk
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{'type': 'geo'}, {'type': 'geo'}]],
    subplot_titles=('Actual Risk Score', 'Predicted Risk Score')
)

# Actual risk
fig.add_trace(
    go.Scattergeo(
        lat=df['latitude'],
        lon=df['longitude'],
        mode='markers',
        marker=dict(
            size=6,
            color=df['actual_risk'],
            colorscale='RdYlGn_r',
            showscale=True,
            colorbar=dict(title='Risk Score', x=0.46)
        ),
        text=[f"Lat: {lat:.2f}<br>Lon: {lon:.2f}<br>Risk: {risk:.3f}" 
              for lat, lon, risk in zip(df['latitude'], df['longitude'], df['actual_risk'])],
        hoverinfo='text',
        name='Actual'
    ),
    row=1, col=1
)

# Predicted risk
fig.add_trace(
    go.Scattergeo(
        lat=df['latitude'],
        lon=df['longitude'],
        mode='markers',
        marker=dict(
            size=6,
            color=df['predicted_risk'],
            colorscale='RdYlGn_r',
            showscale=True,
            colorbar=dict(title='Risk Score', x=1.0)
        ),
        text=[f"Lat: {lat:.2f}<br>Lon: {lon:.2f}<br>Risk: {risk:.3f}" 
              for lat, lon, risk in zip(df['latitude'], df['longitude'], df['predicted_risk'])],
        hoverinfo='text',
        name='Predicted'
    ),
    row=1, col=2
)

# Update layout
fig.update_geos(
    lataxis=dict(range=[df['latitude'].min()-1, df['latitude'].max()+1]),
    lonaxis=dict(range=[df['longitude'].min()-1, df['longitude'].max()+1]),
    projection_type='mercator'
)

fig.update_layout(height=600, title_text='Geographic Distribution of Ecological Risk', showlegend=False)
fig.show()

## 4. Geographic Risk Distribution

In [None]:
# Visualization of predictions
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Actual vs Predicted Scatter
ax = axes[0, 0]
scatter = ax.scatter(df['actual_risk'], df['predicted_risk'], 
                     c=df['risk_error'], cmap='RdYlGn_r', alpha=0.6, s=50)
ax.plot([0, 1], [0, 1], 'r--', lw=2, label='Perfect Prediction')
ax.set_xlabel('Actual Risk Score')
ax.set_ylabel('Predicted Risk Score')
ax.set_title(f'Actual vs Predicted Risk (R¬≤ = {r2:.3f})')
ax.legend()
cbar = plt.colorbar(scatter, ax=ax)
cbar.set_label('Prediction Error')
ax.grid(True, alpha=0.3)

# 2. Error Distribution
ax = axes[0, 1]
ax.hist(df['risk_error'], bins=30, alpha=0.7, edgecolor='black', color='steelblue')
ax.axvline(df['risk_error'].mean(), color='r', linestyle='--', label=f'Mean: {df["risk_error"].mean():.4f}')
ax.axvline(0, color='g', linestyle='--', label='Zero Error')
ax.set_xlabel('Prediction Error')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Prediction Errors')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

# 3. Risk Category Confusion
ax = axes[1, 0]
confusion = pd.crosstab(df['risk_category_actual'], df['risk_category_predicted'], margins=True)
sns.heatmap(confusion, annot=True, fmt='d', cmap='YlOrRd', ax=ax, cbar_kws={'label': 'Count'})
ax.set_xlabel('Predicted Category')
ax.set_ylabel('Actual Category')
ax.set_title('Risk Category Prediction Confusion Matrix')

# 4. Error by Actual Risk Category
ax = axes[1, 1]
risk_categories = ['Low', 'Medium', 'High']
error_by_category = [df[df['risk_category_actual'] == cat]['risk_error'].abs() for cat in risk_categories]
bp = ax.boxplot(error_by_category, labels=risk_categories, patch_artist=True)
for patch, color in zip(bp['boxes'], ['green', 'orange', 'red']):
    patch.set_facecolor(color)
    patch.set_alpha(0.6)
ax.set_ylabel('Absolute Error')
ax.set_xlabel('Actual Risk Category')
ax.set_title('Prediction Error by Risk Category')
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 3. Prediction Visualizations

In [None]:
# Detailed accuracy metrics
mae = mean_absolute_error(df['actual_risk'], df['predicted_risk'])
rmse = np.sqrt(mean_squared_error(df['actual_risk'], df['predicted_risk']))
r2 = r2_score(df['actual_risk'], df['predicted_risk'])
mape = np.mean(np.abs((df['actual_risk'] - df['predicted_risk']) / (df['actual_risk'] + 0.001))) * 100

print("="*60)
print("PREDICTION ACCURACY METRICS")
print("="*60)
print(f"Mean Absolute Error (MAE):           {mae:.4f}")
print(f"Root Mean Squared Error (RMSE):      {rmse:.4f}")
print(f"R¬≤ Score:                            {r2:.4f}")
print(f"Mean Absolute Percentage Error:      {mape:.2f}%")
print("="*60)

# By risk category
print("\nAccuracy by Risk Category:")
for category in ['Low', 'Medium', 'High']:
    mask = df['risk_category_actual'] == category
    if mask.sum() > 0:
        cat_mae = mean_absolute_error(df[mask]['actual_risk'], df[mask]['predicted_risk'])
        cat_r2 = r2_score(df[mask]['actual_risk'], df[mask]['predicted_risk'])
        print(f"  {category:10s} - MAE: {cat_mae:.4f}, R¬≤: {cat_r2:.4f} (n={mask.sum()})")

## 2. Prediction Accuracy Analysis

In [None]:
# Generate dataset
np.random.seed(42)
n_samples = 1000

lat_range = (15.6, 22.0)
lon_range = (72.6, 80.9)

data = {
    'latitude': np.random.uniform(lat_range[0], lat_range[1], n_samples),
    'longitude': np.random.uniform(lon_range[0], lon_range[1], n_samples),
    'temperature': np.random.normal(25, 5, n_samples),
    'precipitation': np.random.exponential(2, n_samples),
    'humidity': np.random.normal(60, 15, n_samples),
    'wind_speed': np.random.exponential(3, n_samples),
    'forest_cover': np.random.uniform(0, 1, n_samples),
    'agricultural_area': np.random.uniform(0, 1, n_samples),
    'urban_area': np.random.uniform(0, 1, n_samples),
    'water_bodies': np.random.uniform(0, 0.3, n_samples),
    'species_count': np.random.poisson(15, n_samples),
    'endemic_species': np.random.poisson(2, n_samples),
    'threatened_species': np.random.poisson(1, n_samples),
    'elevation': np.random.normal(500, 300, n_samples),
    'population_density': np.random.exponential(100, n_samples)
}

df = pd.DataFrame(data)

# Generate actual risk score
actual_risk = (
    0.3 * (1 - df['forest_cover']) +
    0.2 * df['urban_area'] +
    0.15 * np.abs(df['temperature'] - 25) / 10 +
    0.1 * (1 / (df['species_count'] + 1)) +
    0.1 * df['population_density'] / 1000 +
    0.15 * np.random.normal(0, 0.1, n_samples)
)

df['actual_risk'] = np.clip(actual_risk, 0, 1)

# Train model
feature_cols = [col for col in df.columns if col != 'actual_risk']
X = df[feature_cols].copy().replace([np.inf, -np.inf], np.nan).fillna(0)
y = df['actual_risk'].copy()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Train best model (Random Forest)
model = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1)
model.fit(X_scaled, y)

# Get predictions
df['predicted_risk'] = model.predict(X_scaled)
df['risk_error'] = df['predicted_risk'] - df['actual_risk']
df['risk_category_actual'] = pd.cut(df['actual_risk'], bins=[0, 0.3, 0.6, 1.0], labels=['Low', 'Medium', 'High'])
df['risk_category_predicted'] = pd.cut(df['predicted_risk'], bins=[0, 0.3, 0.6, 1.0], labels=['Low', 'Medium', 'High'])

print(f"Dataset shape: {df.shape}")
print(f"Model trained successfully!")
print(f"Training R¬≤: {r2_score(y, df['predicted_risk']):.4f}")

## 1. Prepare Data and Model

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Libraries imported successfully!")

# EcoPredict Results Analysis

Comprehensive analysis of model predictions, ecological risk patterns, and actionable insights for environmental conservation.