# EcoPredict Data Exploration

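This notebook explores the ecological data used in the EcoPredict system, including climate, land use, and species occurrence data. The dataset analysed here is synthetic sample data, generated in Section 1 with a fixed random seed inside a bounding box around Maharashtra, so the figures illustrate the workflow rather than real observations.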

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings

# Silence only deprecation-style chatter from the libraries. A blanket
# filterwarnings('ignore') would also hide every other warning category,
# including numerical ones that can point at real problems in the analysis.
for noisy_category in (FutureWarning, DeprecationWarning):
    warnings.filterwarnings('ignore', category=noisy_category)

# Set style
# Matplotlib 3.6+ names the seaborn look 'seaborn-v0_8'; older releases only
# know it as 'seaborn'. Use whichever name this installation provides and
# fall back to the default style if neither exists.
for style_name in ('seaborn-v0_8', 'seaborn'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
sns.set_palette("husl")

print("Libraries imported successfully!")

## 1. Generate and Examine Sample Data

In [None]:
# Generate sample data for exploration.
# The global NumPy seed makes every draw below reproducible; the order of the
# np.random calls therefore determines the data and must not be rearranged.
np.random.seed(42)
n_samples = 1000

# Maharashtra bounds
lat_range = (15.6, 22.0)
lon_range = (72.6, 80.9)

data = {
    # Location
    'latitude': np.random.uniform(lat_range[0], lat_range[1], n_samples),
    'longitude': np.random.uniform(lon_range[0], lon_range[1], n_samples),

    # Climate variables
    'temperature': np.random.normal(25, 5, n_samples),
    'precipitation': np.random.exponential(2, n_samples),
    'humidity': np.random.normal(60, 15, n_samples),
    'wind_speed': np.random.exponential(3, n_samples),

    # Land use (each fraction is drawn independently, so they need not sum to 1)
    'forest_cover': np.random.uniform(0, 1, n_samples),
    'agricultural_area': np.random.uniform(0, 1, n_samples),
    'urban_area': np.random.uniform(0, 1, n_samples),
    'water_bodies': np.random.uniform(0, 0.3, n_samples),

    # Biodiversity
    'species_count': np.random.poisson(15, n_samples),
    'endemic_species': np.random.poisson(2, n_samples),
    'threatened_species': np.random.poisson(1, n_samples),

    # Other factors
    'elevation': np.random.normal(500, 300, n_samples),
    'population_density': np.random.exponential(100, n_samples)
}

df = pd.DataFrame(data)

# Generate risk score based on features: a weighted sum of forest loss,
# urbanisation, temperature deviation from 25, species scarcity and
# population density, plus a small Gaussian noise term.
risk_score = (
    0.3 * (1 - df['forest_cover']) +
    0.2 * df['urban_area'] +
    0.15 * np.abs(df['temperature'] - 25) / 10 +
    0.1 * (1 / (df['species_count'] + 1)) +
    0.1 * df['population_density'] / 1000 +
    0.15 * np.random.normal(0, 0.1, n_samples)
)

df['risk_score'] = np.clip(risk_score, 0, 1)

# include_lowest=True closes the first bin on the left. Without it a score
# clipped to exactly 0 falls outside (0, 0.3] and silently becomes NaN.
risk_bins = [0, 0.3, 0.6, 1.0]
risk_labels = ['Low', 'Medium', 'High']
df['risk_category'] = pd.cut(
    df['risk_score'],
    bins=risk_bins,
    labels=risk_labels,
    include_lowest=True
)

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")

In [None]:
# Basic statistics: keep the describe() table in a name and end the cell
# with it so the notebook renders it as rich output.
descriptive_stats = df.describe()
descriptive_stats

In [None]:
# Check for missing values
missing_values = df.isnull().sum()
columns_with_gaps = missing_values[missing_values > 0]

# Report one outcome or the other. Printing the per-column header followed by
# an empty Series repr when nothing is missing only adds noise.
if columns_with_gaps.empty:
    print("No missing values found!")
else:
    print("Missing values per column:")
    print(columns_with_gaps)

## 2. Geographic Distribution

In [None]:
# Geographic scatter plot: one marker per location, coloured by risk score
# and sized by species count, drawn on an OpenStreetMap base layer.
map_center = dict(lat=df['latitude'].mean(), lon=df['longitude'].mean())

fig = px.scatter_mapbox(
    df,
    lat='latitude',
    lon='longitude',
    color='risk_score',
    size='species_count',
    hover_data=['temperature', 'forest_cover', 'risk_category'],
    color_continuous_scale='RdYlGn_r',
    mapbox_style='open-street-map',
    title='Geographic Distribution of Ecological Risk',
    height=600
)

# Centre the view on the mean coordinate of the data at a fixed zoom level.
fig.update_layout(mapbox_center=map_center, mapbox_zoom=6)

fig.show()

In [None]:
# Risk category distribution by location
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Use the categorical's declared levels rather than .unique(): the order is
# then always Low/Medium/High (not order of first appearance) and a NaN
# category can never end up as a legend entry.
risk_levels = list(df['risk_category'].cat.categories)

# One panel per coordinate; both panels are built by the same loop.
for ax, coord in zip(axes, ['latitude', 'longitude']):
    # Shared bin edges make the overlaid histograms directly comparable.
    shared_bins = np.histogram_bin_edges(df[coord], bins=20)

    for category in risk_levels:
        subset = df[df['risk_category'] == category]
        if subset.empty:
            # Skip levels with no rows so the legend lists only drawn data.
            continue
        ax.hist(subset[coord], alpha=0.7, label=category, bins=shared_bins)

    ax.set_xlabel(coord.title())
    ax.set_ylabel('Frequency')
    ax.set_title(f'Risk Distribution by {coord.title()}')
    ax.legend()

plt.tight_layout()
plt.show()

## 3. Climate Variables Analysis

In [None]:
# Climate variables distribution
climate_vars = ['temperature', 'precipitation', 'humidity', 'wind_speed']

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

# One histogram per climate variable, each in its own palette colour.
for ax, var, color in zip(axes, climate_vars, sns.color_palette()):
    # Replace underscores before title-casing so 'wind_speed' is shown as
    # 'Wind Speed' (not 'Wind_Speed'), matching the labels in later sections.
    label = var.replace("_", " ").title()
    ax.hist(df[var], bins=30, alpha=0.7, color=color)
    ax.set_title(f'{label} Distribution')
    ax.set_xlabel(label)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Climate vs Risk relationship: a 2x2 grid of scatter plots, one per climate
# variable, each plotted against the risk score.
climate_vars = ['temperature', 'precipitation', 'humidity', 'wind_speed']
panel_titles = ['Temperature vs Risk', 'Precipitation vs Risk',
                'Humidity vs Risk', 'Wind Speed vs Risk']

fig = make_subplots(rows=2, cols=2, subplot_titles=panel_titles)

for panel_index, var in enumerate(climate_vars):
    # Fill the grid row by row; plotly rows and columns are 1-based.
    grid_row, grid_col = divmod(panel_index, 2)
    markers = go.Scatter(
        x=df[var],
        y=df['risk_score'],
        mode='markers',
        name=var,
        opacity=0.6
    )
    fig.add_trace(markers, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(height=600, title_text="Climate Variables vs Ecological Risk")
fig.show()

## 4. Land Use Analysis

In [None]:
# Land use composition
land_use_vars = ['forest_cover', 'agricultural_area', 'urban_area', 'water_bodies']

# Average land use composition (mean of each land-use column over all rows)
avg_composition = df[land_use_vars].mean()

# Donut chart of the four averages with a label in the hole.
donut = go.Pie(labels=land_use_vars, values=avg_composition.values, hole=0.3)
center_label = dict(text='Land Use', x=0.5, y=0.5, font_size=20, showarrow=False)

fig = go.Figure(data=[donut])
fig.update_layout(title="Average Land Use Composition", annotations=[center_label])

fig.show()

In [None]:
# Land use vs risk correlation
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

# One scatter panel per land-use variable with a least-squares trend line.
for ax, var, color in zip(axes, land_use_vars, sns.color_palette()):
    label = var.replace("_", " ").title()
    ax.scatter(df[var], df['risk_score'], alpha=0.6, color=color)
    ax.set_xlabel(label)
    ax.set_ylabel('Risk Score')
    ax.set_title(f'{label} vs Risk Score')

    # Add trend line (degree-1 polynomial fit of risk score on the variable)
    trend = np.poly1d(np.polyfit(df[var], df['risk_score'], 1))

    # A straight line needs only its two end points. Plotting it through
    # every unsorted sample makes the dashed stroke run back and forth over
    # itself, so the dash pattern smears.
    x_ends = np.array([df[var].min(), df[var].max()])
    ax.plot(x_ends, trend(x_ends), "r--", alpha=0.8)

plt.tight_layout()
plt.show()

## 5. Biodiversity Analysis

In [None]:
# Species distribution
biodiversity_vars = ['species_count', 'endemic_species', 'threatened_species']

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, var, color in zip(axes, biodiversity_vars, sns.color_palette()):
    counts = df[var]
    label = var.replace("_", " ").title()

    # These columns hold integer counts. A fixed bins=20 gives bins that
    # cover uneven numbers of integers, which shows up as artificial gaps
    # and spikes. Centre one unit-wide bin on every integer instead.
    integer_bins = np.arange(counts.min(), counts.max() + 2) - 0.5

    ax.hist(counts, bins=integer_bins, alpha=0.7, color=color)
    ax.set_title(f'{label} Distribution')
    ax.set_xlabel(label)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Biodiversity hotspots
# The index is a weighted sum of the three species columns; the terms are
# added in the order listed here.
index_weights = {
    'species_count': 0.5,
    'endemic_species': 0.3,
    'threatened_species': 0.2,
}
df['biodiversity_index'] = sum(
    df[column] * weight for column, weight in index_weights.items()
)

# Top 10% biodiversity hotspots: rows at or above the 90th percentile
hotspot_threshold = df['biodiversity_index'].quantile(0.9)
is_hotspot = df['biodiversity_index'].ge(hotspot_threshold)
hotspots = df.loc[is_hotspot]

hotspot_mean_risk = hotspots['risk_score'].mean()
overall_mean_risk = df['risk_score'].mean()

print(f"Identified {len(hotspots)} biodiversity hotspots (top 10%)")
print(f"Average risk score in hotspots: {hotspot_mean_risk:.3f}")
print(f"Average risk score overall: {overall_mean_risk:.3f}")

In [None]:
# Biodiversity vs Risk: species count against risk score, coloured by risk
# category and sized by the biodiversity index from the hotspot cell.
risk_palette = {'Low': 'green', 'Medium': 'orange', 'High': 'red'}
hover_columns = ['endemic_species', 'threatened_species']

fig = px.scatter(
    df,
    x='species_count',
    y='risk_score',
    size='biodiversity_index',
    color='risk_category',
    color_discrete_map=risk_palette,
    hover_data=hover_columns,
    title='Species Count vs Ecological Risk'
)

fig.show()

## 6. Correlation Analysis

In [None]:
# Correlation matrix over every numeric column of the frame
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df[numeric_cols].corr()

# Hide the diagonal and upper triangle so each variable pair appears once.
upper_triangle = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

# Plot correlation heatmap
heatmap_fig, heatmap_ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    correlation_matrix,
    mask=upper_triangle,
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
    square=True,
    ax=heatmap_ax
)
heatmap_ax.set_title('Variable Correlation Matrix')
heatmap_fig.tight_layout()
plt.show()

In [None]:
# Risk score correlations: rank every variable by the absolute value of its
# correlation with risk_score, then drop risk_score's own entry.
abs_risk_corr = correlation_matrix['risk_score'].abs()
risk_correlations = abs_risk_corr.sort_values(ascending=False).drop(labels='risk_score')

print("Variables most correlated with risk score:")
print(risk_correlations.head(10))

In [None]:
# Plot top correlations with risk
top_correlations = risk_correlations.head(8)
bar_positions = range(len(top_correlations))

plt.figure(figsize=(10, 6))
bars = plt.bar(bar_positions, top_correlations.values)
plt.xticks(bar_positions, top_correlations.index, rotation=45, ha='right')
plt.ylabel('Absolute Correlation with Risk Score')
plt.title('Top Variables Correlated with Ecological Risk')

# Color bars based on correlation strength:
# above 0.5 red, above 0.3 orange, everything else green.
for bar, strength in zip(bars, top_correlations.values):
    if strength > 0.5:
        shade = 'red'
    elif strength > 0.3:
        shade = 'orange'
    else:
        shade = 'green'
    bar.set_color(shade)

plt.tight_layout()
plt.show()

## 7. Risk Distribution Analysis

In [None]:
# Risk category distribution
# Count the categories in a fixed Low/Medium/High order. A bare value_counts()
# sorts by frequency, so a positional colour list (green, orange, red) would
# attach to whichever category happens to be most common instead of to
# Low/Medium/High respectively.
risk_levels = ['Low', 'Medium', 'High']
risk_colors = {'Low': 'green', 'Medium': 'orange', 'High': 'red'}

risk_counts = pd.Series(
    {level: int((df['risk_category'] == level).sum()) for level in risk_levels}
)
level_colors = [risk_colors[level] for level in risk_counts.index]

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Pie chart
axes[0].pie(risk_counts.values, labels=risk_counts.index, autopct='%1.1f%%',
            colors=level_colors)
axes[0].set_title('Risk Category Distribution')

# Bar chart
bars = axes[1].bar(list(risk_counts.index), risk_counts.values,
                   color=level_colors)
axes[1].set_title('Risk Category Counts')
axes[1].set_ylabel('Number of Locations')

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    axes[1].text(bar.get_x() + bar.get_width() / 2., height,
                 f'{int(height)}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
# Risk score distribution: histogram with mean/median markers on the left,
# box plot of the score per risk category on the right.
score_mean = df['risk_score'].mean()
score_median = df['risk_score'].median()

dist_fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 6))

hist_ax.hist(df['risk_score'], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
hist_ax.axvline(score_mean, color='red', linestyle='--',
                label=f'Mean: {score_mean:.3f}')
hist_ax.axvline(score_median, color='green', linestyle='--',
                label=f'Median: {score_median:.3f}')
hist_ax.set_xlabel('Risk Score')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Risk Score Distribution')
hist_ax.legend()

df.boxplot(column='risk_score', by='risk_category', ax=box_ax)
box_ax.set_title('Risk Score by Category')
dist_fig.suptitle('')  # Remove automatic title

dist_fig.tight_layout()
plt.show()

## 8. Summary Statistics by Risk Category

In [None]:
# Summary statistics by risk category: mean and standard deviation of the
# selected columns within each risk category, rounded to three decimals.
summary_columns = [
    'temperature',
    'precipitation',
    'forest_cover',
    'species_count',
    'urban_area',
    'population_density',
]

# risk_category is categorical. Passing observed=False explicitly keeps a row
# for every declared category, including empty ones, instead of relying on
# the implicit default that pandas has deprecated for categorical groupers.
summary_stats = (
    df.groupby('risk_category', observed=False)
    .agg({column: ['mean', 'std'] for column in summary_columns})
    .round(3)
)

print("Summary Statistics by Risk Category:")
print(summary_stats)

In [None]:
# Key insights: a text summary built from the frame plus the hotspot and
# correlation results computed in earlier cells.
total_locations = len(df)
lat_min, lat_max = df['latitude'].min(), df['latitude'].max()
lon_min, lon_max = df['longitude'].min(), df['longitude'].max()

print("=== KEY INSIGHTS ===")
print("\n1. Dataset Overview:")
print(f"   - Total locations analyzed: {total_locations:,}")
print(f"   - Geographic coverage: {lat_min:.2f}°N to {lat_max:.2f}°N")
print(f"   - Longitude range: {lon_min:.2f}°E to {lon_max:.2f}°E")

print("\n2. Risk Distribution:")
for category in ['Low', 'Medium', 'High']:
    count = int((df['risk_category'] == category).sum())
    percentage = (count / total_locations) * 100
    print(f"   - {category} risk: {count:,} locations ({percentage:.1f}%)")

temperature = df['temperature']
precipitation = df['precipitation']
forest_cover = df['forest_cover']

print("\n3. Environmental Factors:")
print(f"   - Average temperature: {temperature.mean():.1f}°C (±{temperature.std():.1f})")
print(f"   - Average precipitation: {precipitation.mean():.1f}mm (±{precipitation.std():.1f})")
print(f"   - Average forest cover: {forest_cover.mean():.1%} (±{forest_cover.std():.1%})")

species_count = df['species_count']
hotspot_risk = hotspots['risk_score'].mean()

print("\n4. Biodiversity:")
print(f"   - Average species count: {species_count.mean():.1f} (±{species_count.std():.1f})")
print(f"   - Total biodiversity hotspots identified: {len(hotspots)}")
print(f"   - Average risk in hotspots: {hotspot_risk:.3f}")

print("\n5. Strongest Risk Correlations:")
for var, corr in risk_correlations.head(3).items():
    pretty_name = var.replace('_', ' ').title()
    print(f"   - {pretty_name}: {corr:.3f}")

## 9. Data Quality Assessment

In [None]:
# Data quality checks: completeness, coordinate validity, value ranges and
# duplicates, followed by a verdict based on those checks.
print("=== DATA QUALITY ASSESSMENT ===")

total_records = len(df)
total_missing = df.isnull().sum().sum()

print(f"\n1. Completeness:")
print(f"   - Total records: {total_records:,}")
print(f"   - Missing values: {total_missing}")
print(f"   - Completeness rate: {(1 - total_missing / (total_records * len(df.columns))) * 100:.1f}%")

print(f"\n2. Coordinate Validity:")
valid_lat = ((df['latitude'] >= -90) & (df['latitude'] <= 90)).sum()
valid_lon = ((df['longitude'] >= -180) & (df['longitude'] <= 180)).sum()
print(f"   - Valid latitudes: {valid_lat}/{total_records} ({valid_lat/total_records*100:.1f}%)")
print(f"   - Valid longitudes: {valid_lon}/{total_records} ({valid_lon/total_records*100:.1f}%)")

# The ranges below are printed for information only and are not checked.
print(f"\n3. Value Ranges:")
print(f"   - Temperature range: {df['temperature'].min():.1f}°C to {df['temperature'].max():.1f}°C")
print(f"   - Humidity range: {df['humidity'].min():.1f}% to {df['humidity'].max():.1f}%")
print(f"   - Forest cover range: {df['forest_cover'].min():.1%} to {df['forest_cover'].max():.1%}")

print(f"\n4. Duplicates:")
duplicates = df.duplicated().sum()
print(f"   - Duplicate records: {duplicates} ({duplicates/total_records*100:.1f}%)")

# Base the verdict on the checks computed above. An unconditional success
# message would be printed even when a check had failed.
quality_issues = []
if total_missing > 0:
    quality_issues.append(f"{total_missing} missing value(s)")
if valid_lat < total_records:
    quality_issues.append(f"{total_records - valid_lat} invalid latitude(s)")
if valid_lon < total_records:
    quality_issues.append(f"{total_records - valid_lon} invalid longitude(s)")
if duplicates > 0:
    quality_issues.append(f"{duplicates} duplicate record(s)")

if quality_issues:
    print("\n⚠️ Data quality issues found: " + "; ".join(quality_issues))
else:
    print(f"\n✅ Data quality appears good for analysis and modeling!")