In [1]:
# Final summary of the feature-engineering run.
# Relies on `df`, `X_scaled` and `n_components_95` produced by the data-loading,
# scaling and PCA cells; on a fresh kernel those cells must be executed first,
# otherwise this cell fails with a NameError.

# `df` also carries the two target columns, which are excluded from `X_scaled`.
# Drop them from the "original" count so both numbers describe predictors only.
target_cols = ['risk_score', 'risk_category']
n_original = len([col for col in df.columns if col not in target_cols])
n_total = X_scaled.shape[1]

print("=== FEATURE ENGINEERING SUMMARY ===\n")
print(f"Original features: {n_original}")
print(f"Engineered features: {n_total}")
print(f"Feature increase: {n_total - n_original} new features\n")

# Category counts match the columns created by the respective cells:
# spatial = 1 distance + 3 coordinate terms + 2 bins + 4 boundary distances,
# interaction = 7 pairwise products + 3 land-use composites,
# ecological = 6 index / stress columns.
print("Feature Categories:")
print("  - Spatial features: 10 (distance, coordinates, bins, boundaries)")
print("  - Interaction features: 10 (climate-vegetation, urban, biodiversity, land use)")
print("  - Ecological domain features: 6 (biodiversity, threat, stress, conservation indices)")
print("  - Statistical features: 13+ (polynomials, ratios, percentiles)")
print(f"  - Original features: {n_original}\n")

print("Feature Engineering Techniques Applied:")
print("  ✓ Spatial transformations (distance, bins, boundary distance)")
print("  ✓ Interaction terms (climate×vegetation, urban×climate)")
print("  ✓ Domain-specific indices (biodiversity, threat, conservation)")
print("  ✓ Polynomial features (2nd order)")
print("  ✓ Ratio and composite features")
print("  ✓ Standardization (zero mean, unit variance)")
print("  ✓ Feature importance ranking (correlation, MI, RF)")
print("  ✓ Dimensionality reduction (PCA)\n")

print("Next steps for modeling:")
print(f"  1. Use all {n_total} features for baseline models")
print(f"  2. Try top {n_components_95} PCA components for comparison")
print("  3. Use top 15-20 features (by RF importance) for interpretability")
print("  4. Ensemble different feature sets for robustness")

=== FEATURE ENGINEERING SUMMARY ===



NameError: name 'df' is not defined

## 9. Final Feature Set Summary

In [None]:
# Fit an unrestricted PCA (all components kept) on the scaled feature matrix
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Running total of the per-component explained-variance ratios
cumsum_var = np.cumsum(pca.explained_variance_ratio_)

# Two side-by-side panels: per-component share (left), cumulative share (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
ax_single, ax_cumulative = axes

# Left panel - log-scaled so the small trailing components stay visible
ax_single.bar(
    range(1, len(pca.explained_variance_ratio_) + 1),
    pca.explained_variance_ratio_,
)
ax_single.set_xlabel('Principal Component')
ax_single.set_ylabel('Variance Explained Ratio')
ax_single.set_title('Variance Explained by Each Component')
ax_single.set_yscale('log')

# Right panel - cumulative curve with the 95% / 90% reference lines
ax_cumulative.plot(range(1, len(cumsum_var) + 1), cumsum_var, 'bo-')
for level, line_color, line_label in (
    (0.95, 'r', '95% variance'),
    (0.90, 'orange', '90% variance'),
):
    ax_cumulative.axhline(y=level, color=line_color, linestyle='--', label=line_label)
ax_cumulative.set_xlabel('Number of Components')
ax_cumulative.set_ylabel('Cumulative Variance Explained')
ax_cumulative.set_title('Cumulative Variance Explained')
ax_cumulative.legend()
ax_cumulative.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# First component count whose cumulative variance reaches each threshold
# (argmax returns the first True position; +1 converts index to count)
n_components_90, n_components_95 = (
    np.argmax(cumsum_var >= level) + 1 for level in (0.90, 0.95)
)

print(f"Components for 90% variance: {n_components_90}")
print(f"Components for 95% variance: {n_components_95}")
print(f"Original features: {X_scaled.shape[1]}")
print(f"Dimensionality reduction: {n_components_95}/{X_scaled.shape[1]} = {n_components_95/X_scaled.shape[1]:.1%}")

## 8. Dimensionality Reduction with PCA

In [None]:
# Visualize feature importance: one horizontal bar panel per ranking method
top_corr = correlations.head(10)
top_mi = mi_scores.head(10)
top_rf = rf_importance.head(10)

# (ranking frame, score column, x-axis label, panel title)
panels = [
    (top_corr, 'abs_correlation', 'Absolute Correlation', 'Top 10 Features by Correlation'),
    (top_mi, 'mutual_info', 'Mutual Information', 'Top 10 Features by Mutual Information'),
    (top_rf, 'importance', 'Importance Score', 'Top 10 Features by Random Forest'),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

for ax, (ranking, score_col, x_label, panel_title) in zip(axes, panels):
    ax.barh(ranking['feature'], ranking[score_col])
    ax.set_xlabel(x_label)
    ax.set_title(panel_title)
    ax.invert_yaxis()  # first (highest-ranked) row is drawn at the top

plt.tight_layout()
plt.show()

In [None]:
# Calculate feature importance using three independent methods:
# linear correlation, mutual information and random-forest importances.
# Each result is a DataFrame sorted from most to least important feature.
from sklearn.ensemble import RandomForestRegressor

# Shared seed so that every stochastic estimator in this cell is repeatable
RANDOM_STATE = 42

# 1. Correlation with target
correlations = pd.DataFrame({
    'feature': X_scaled.columns,
    'correlation': [X_scaled[col].corr(y) for col in X_scaled.columns]
})
correlations['abs_correlation'] = correlations['correlation'].abs()
correlations = correlations.sort_values('abs_correlation', ascending=False)

# 2. Mutual information
# mutual_info_regression adds small random noise to continuous features, so
# without a fixed random_state the scores (and the ranking) change on every
# run. Wrap it so SelectKBest calls it with the shared seed.
def _seeded_mutual_info(features, target):
    """Score features with mutual_info_regression using a fixed seed."""
    return mutual_info_regression(features, target, random_state=RANDOM_STATE)

selector = SelectKBest(_seeded_mutual_info, k=min(20, X_scaled.shape[1]))
selector.fit(X_scaled, y)
mi_scores = pd.DataFrame({
    'feature': X_scaled.columns,
    'mutual_info': selector.scores_
}).sort_values('mutual_info', ascending=False)

# 3. Random Forest feature importance
rf = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)
rf.fit(X_scaled, y)
rf_importance = pd.DataFrame({
    'feature': X_scaled.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

# Top features
print("=== TOP 15 FEATURES BY CORRELATION ===")
print(correlations.head(15).to_string(index=False))

print("\n=== TOP 15 FEATURES BY MUTUAL INFORMATION ===")
print(mi_scores.head(15).to_string(index=False))

print("\n=== TOP 15 FEATURES BY RANDOM FOREST ===")
print(rf_importance.head(15).to_string(index=False))

## 7. Feature Selection

In [None]:
# Separate predictors from the target columns.
# `risk_score` is the regression target; `risk_category` is derived from it,
# so both are excluded from the feature matrix.
feature_cols = [col for col in df_feat.columns if col not in ['risk_score', 'risk_category']]
X = df_feat[feature_cols].copy()
y = df_feat['risk_score'].copy()

# Handle any NaN or infinite values (infinities become NaN, NaN becomes 0)
X = X.replace([np.inf, -np.inf], np.nan).fillna(0)

# Scale features using StandardScaler.
# The scaler returns a bare ndarray; rebuild the frame with X's columns AND
# index so that index-aligned operations against `y` (e.g. Series.corr) stay
# correct even if the input frame does not have a default RangeIndex.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Display scaling statistics
print("Feature Scaling Summary:")
print(f"  Original feature range: [{X.values.min():.2f}, {X.values.max():.2f}]")
print(f"  Scaled feature range: [{X_scaled.values.min():.2f}, {X_scaled.values.max():.2f}]")
print(f"  Features mean: {X_scaled.values.mean():.6f}")
print(f"  Features std: {X_scaled.values.std():.6f}")

## 6. Feature Scaling

In [None]:
# Polynomial features for key variables (2nd order)
poly = PolynomialFeatures(degree=2, include_bias=False)
climate_cols = ['temperature', 'precipitation', 'humidity']
poly_features = poly.fit_transform(df_feat[climate_cols])
poly_feature_names = poly.get_feature_names_out(climate_cols)

# Skip the leading degree-1 terms (the input columns themselves) and add only
# the squared / cross terms. `poly_features` is (n_samples, n_terms), and
# iterating a 2-D array yields rows, so transpose to walk it column by column;
# otherwise each name would be paired with a single sample's row and the
# assignment below would fail with a length mismatch.
n_linear = len(climate_cols)
for name, feature in zip(poly_feature_names[n_linear:], poly_features[:, n_linear:].T):
    if name not in df_feat.columns:  # keeps the cell safe to re-run
        df_feat[name] = feature

# Ratio features (+1 in the denominator avoids division by zero)
df_feat['endemic_to_total_species'] = df_feat['endemic_species'] / (df_feat['species_count'] + 1)
df_feat['threatened_to_total_species'] = df_feat['threatened_species'] / (df_feat['species_count'] + 1)
df_feat['temp_to_precip_ratio'] = df_feat['temperature'] / (df_feat['precipitation'] + 1)
df_feat['humidity_to_precip_ratio'] = df_feat['humidity'] / (df_feat['precipitation'] + 1)

# Normalization-based features (fractional rank of each value within its column)
df_feat['temp_percentile'] = df_feat['temperature'].rank(pct=True)
df_feat['precip_percentile'] = df_feat['precipitation'].rank(pct=True)
df_feat['elevation_percentile'] = df_feat['elevation'].rank(pct=True)

print("✓ Statistical features created")
print(f"  New features: polynomial terms, ratios, percentiles")
print(f"\nTotal features after engineering: {df_feat.shape[1]}")

## 5. Statistical Features

In [None]:
# Short handles for the input columns used by several indices below
species = df_feat['species_count']
endemic = df_feat['endemic_species']
threatened = df_feat['threatened_species']
forest = df_feat['forest_cover']
urban = df_feat['urban_area']
density = df_feat['population_density']
temperature = df_feat['temperature']
precipitation = df_feat['precipitation']

# Biodiversity index: weighted sum of the three species counts
df_feat['biodiversity_index'] = species * 0.5 + endemic * 0.3 + threatened * 0.2

# Threat index (higher = more threatened): four weighted pressure terms
forest_loss_term = (1 - forest) * 0.3
urban_term = urban * 0.25
density_term = density / density.max() * 0.25
threatened_share_term = (threatened / (species + 1)) * 0.2
df_feat['threat_index'] = forest_loss_term + urban_term + density_term + threatened_share_term

# Environmental stress: deviation from 25 (in units of 10) and the absolute
# precipitation deviation from its mean, in standard deviations
df_feat['temperature_stress'] = (temperature - 25).abs() / 10
df_feat['precipitation_anomaly'] = (precipitation - precipitation.mean()).abs() / precipitation.std()

# Conservation priority (based on biodiversity and threat)
df_feat['conservation_priority'] = df_feat['biodiversity_index'] * (1 - df_feat['threat_index'])

# Habitat quality indicator, limited to the [0, 1] interval
habitat_raw = (
    forest * 0.5 +
    (1 - urban) * 0.3 +
    (df_feat['water_bodies'] / 0.3) * 0.2  # normalize by typical max
)
df_feat['habitat_quality'] = habitat_raw.clip(0, 1)

print("✓ Ecological domain features created")
print(f"  New features: biodiversity_index, threat_index, temperature_stress, conservation_priority, habitat_quality")

## 4. Ecological Domain Features

In [None]:
# Pairwise product features, listed as (new column, left factor, right factor)
product_specs = [
    # climate x vegetation
    ('temp_forest_interaction', 'temperature', 'forest_cover'),
    ('precip_forest_interaction', 'precipitation', 'forest_cover'),
    ('humidity_forest_interaction', 'humidity', 'forest_cover'),
    # urban x climate / population
    ('urban_temp_interaction', 'urban_area', 'temperature'),
    ('urban_pop_interaction', 'urban_area', 'population_density'),
    # biodiversity x climate
    ('species_temp_interaction', 'species_count', 'temperature'),
    ('species_precip_interaction', 'species_count', 'precipitation'),
]
for new_col, left, right in product_specs:
    df_feat[new_col] = df_feat[left] * df_feat[right]

# Land use composition: complement of forest cover plus two summed shares
df_feat['non_forest_cover'] = 1 - df_feat['forest_cover']
df_feat['total_developed'] = df_feat['urban_area'] + df_feat['agricultural_area']
df_feat['natural_area'] = df_feat['forest_cover'] + df_feat['water_bodies']

print("✓ Interaction features created")
print(f"  New features: climate-vegetation, urban-climate, biodiversity-climate interactions")

## 3. Interaction Features

In [None]:
def calculate_distance(lat1, lon1, lat2, lon2):
    """Calculate Haversine (great-circle) distance in km.

    All four arguments are in decimal degrees. Only NumPy ufuncs are used, so
    scalars, NumPy arrays and pandas Series are accepted and broadcast against
    each other.

    Parameters
    ----------
    lat1, lon1 : latitude / longitude of the first point(s), degrees.
    lat2, lon2 : latitude / longitude of the second point(s), degrees.

    Returns
    -------
    Distance in kilometres on a sphere of radius 6371 km, with the broadcast
    shape of the inputs.
    """
    R = 6371  # Earth's radius in km
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    a = np.sin(dlat/2)**2 + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon/2)**2
    # Rounding can push `a` marginally above 1 for (near-)antipodal points,
    # which would make sqrt/arcsin return NaN; clamp it to the valid range.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

# Create spatial features on a copy so the raw frame `df` stays untouched
df_feat = df.copy()

# Distance from center point (mean latitude / longitude of the samples)
center_lat = df_feat['latitude'].mean()
center_lon = df_feat['longitude'].mean()

# calculate_distance is built from NumPy ufuncs, so it can be applied to the
# whole columns at once instead of row by row through DataFrame.apply.
df_feat['distance_from_center'] = calculate_distance(
    df_feat['latitude'], df_feat['longitude'], center_lat, center_lon
)

# Coordinate transformations
df_feat['lat_squared'] = df_feat['latitude'] ** 2
df_feat['lon_squared'] = df_feat['longitude'] ** 2
df_feat['lat_lon_product'] = df_feat['latitude'] * df_feat['longitude']

# Spatial bins: 10 equal-width bins per axis, stored as integer bin codes
df_feat['lat_bin'] = pd.cut(df_feat['latitude'], bins=10, labels=False)
df_feat['lon_bin'] = pd.cut(df_feat['longitude'], bins=10, labels=False)

# Distance to boundaries (in degrees) of the samples' bounding box
lat_min, lat_max = df_feat['latitude'].min(), df_feat['latitude'].max()
lon_min, lon_max = df_feat['longitude'].min(), df_feat['longitude'].max()

df_feat['distance_to_north'] = lat_max - df_feat['latitude']
df_feat['distance_to_south'] = df_feat['latitude'] - lat_min
df_feat['distance_to_east'] = lon_max - df_feat['longitude']
df_feat['distance_to_west'] = df_feat['longitude'] - lon_min

print("✓ Spatial features created")
print(f"  New features: distance_from_center, lat_squared, lon_squared, lat_lon_product, lat_bin, lon_bin, distance_to_*")

## 2. Spatial Feature Engineering

In [None]:
# Generate the same dataset as in data exploration.
# The global NumPy seed plus the fixed order of the draws below make the data
# reproducible; do not reorder the dictionary entries.
np.random.seed(42)
n_samples = 1000

# Bounding box (degrees) the sample coordinates are drawn from
lat_range = (15.6, 22.0)
lon_range = (72.6, 80.9)

data = {
    'latitude': np.random.uniform(lat_range[0], lat_range[1], n_samples),
    'longitude': np.random.uniform(lon_range[0], lon_range[1], n_samples),
    'temperature': np.random.normal(25, 5, n_samples),
    'precipitation': np.random.exponential(2, n_samples),
    'humidity': np.random.normal(60, 15, n_samples),
    'wind_speed': np.random.exponential(3, n_samples),
    'forest_cover': np.random.uniform(0, 1, n_samples),
    'agricultural_area': np.random.uniform(0, 1, n_samples),
    'urban_area': np.random.uniform(0, 1, n_samples),
    'water_bodies': np.random.uniform(0, 0.3, n_samples),
    'species_count': np.random.poisson(15, n_samples),
    'endemic_species': np.random.poisson(2, n_samples),
    'threatened_species': np.random.poisson(1, n_samples),
    'elevation': np.random.normal(500, 300, n_samples),
    'population_density': np.random.exponential(100, n_samples)
}

df = pd.DataFrame(data)

# Generate risk score: weighted sum of pressure terms plus Gaussian noise
risk_score = (
    0.3 * (1 - df['forest_cover']) +
    0.2 * df['urban_area'] +
    0.15 * np.abs(df['temperature'] - 25) / 10 +
    0.1 * (1 / (df['species_count'] + 1)) +
    0.1 * df['population_density'] / 1000 +
    0.15 * np.random.normal(0, 0.1, n_samples)
)

df['risk_score'] = np.clip(risk_score, 0, 1)
# The score is clipped to [0, 1], so exactly 0 is a legal value. pd.cut
# intervals are right-closed and exclude the left-most edge by default, which
# would turn such rows into NaN; include_lowest=True assigns them to 'Low'.
df['risk_category'] = pd.cut(
    df['risk_score'],
    bins=[0, 0.3, 0.6, 1.0],
    labels=['Low', 'Medium', 'High'],
    include_lowest=True,
)

print(f"Original dataset shape: {df.shape}")
print(f"Original features: {list(df.columns)}")

## 1. Load and Prepare Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and numerical
# warnings from pandas / scikit-learn are hidden as well - consider narrowing.
warnings.filterwarnings('ignore')

# Set visualization style
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# matplotlib release - confirm against the pinned environment.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Confirmation that the imports above and this setup ran without error
print("Libraries imported successfully!")

# EcoPredict Feature Engineering

This notebook demonstrates feature engineering techniques to create meaningful predictive features from raw ecological data.