In [None]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import opendatasets as od

# 2. Setup Directories
# exist_ok=True makes this a no-op when the folder is already there and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs('../data', exist_ok=True)

print("="*70)
print("ROCKFALL RISK PREDICTION - DATA GENERATION & INTEGRATION")
print("="*70)

# ========================================================================
# PART A: Generate Synthetic Rockfall Sensor Data
# ========================================================================
# Builds `df_synthetic`: five uniformly distributed sensor readings per row,
# with 10% of rows shifted into a "high risk" regime and a further 20% into a
# "medium risk" regime. The label is the quartile of a weighted score.
print("\n[PART A] Generating Synthetic Rockfall Sensor Data...")
print("-"*70)

num_samples = 10000  # Increased from 5000 to 10000
np.random.seed(42)  # fixed seed so the generated dataset is reproducible

# Base values for stable conditions
synthetic_data = {
    'seismic_activity': np.random.uniform(0.01, 0.2, num_samples),
    'vibration_level': np.random.uniform(0.1, 1.5, num_samples),
    'joint_water_pressure': np.random.uniform(50, 200, num_samples),
    'displacement_mm': np.random.uniform(0.0, 2.0, num_samples),
    'rainfall_mm': np.random.uniform(0, 10, num_samples)
}

df_synthetic = pd.DataFrame(synthetic_data)

# Introduce conditions that lead to higher risk.
# Each perturbation is drawn once PER ROW (size=...). Calling
# np.random.uniform(a, b) without a size returns a single scalar, which would
# apply the exact same factor/offset to every selected row.
n_high = int(num_samples * 0.1)
high_risk_indices = np.random.choice(df_synthetic.index, size=n_high, replace=False)
df_synthetic.loc[high_risk_indices, 'seismic_activity'] *= np.random.uniform(3, 8, size=n_high)
df_synthetic.loc[high_risk_indices, 'vibration_level'] *= np.random.uniform(2, 5, size=n_high)
df_synthetic.loc[high_risk_indices, 'joint_water_pressure'] += np.random.uniform(100, 250, size=n_high)
df_synthetic.loc[high_risk_indices, 'displacement_mm'] += np.random.uniform(3, 10, size=n_high)
df_synthetic.loc[high_risk_indices, 'rainfall_mm'] += np.random.uniform(20, 50, size=n_high)

# Medium-risk rows are sampled from the rows that were NOT picked as high risk,
# so the two groups never overlap.
n_medium = int(num_samples * 0.2)
medium_risk_indices = np.random.choice(
    df_synthetic.drop(high_risk_indices).index, size=n_medium, replace=False
)
df_synthetic.loc[medium_risk_indices, 'seismic_activity'] *= np.random.uniform(1.5, 3, size=n_medium)
df_synthetic.loc[medium_risk_indices, 'vibration_level'] *= np.random.uniform(1.5, 3, size=n_medium)
df_synthetic.loc[medium_risk_indices, 'joint_water_pressure'] += np.random.uniform(50, 150, size=n_medium)
df_synthetic.loc[medium_risk_indices, 'displacement_mm'] += np.random.uniform(1, 4, size=n_medium)
df_synthetic.loc[medium_risk_indices, 'rainfall_mm'] += np.random.uniform(10, 30, size=n_medium)

# Define risk based on a scoring system (weighted sum of the five readings)
score = (
    df_synthetic['seismic_activity'] * 2.0 +
    df_synthetic['vibration_level'] * 1.5 +
    df_synthetic['joint_water_pressure'] * 0.05 +
    df_synthetic['displacement_mm'] * 1.0 +
    df_synthetic['rainfall_mm'] * 0.2
)

# Quartiles of the score become the four risk classes, lowest score first.
labels = ['Low', 'Medium', 'High', 'Critical']
df_synthetic['rockfall_risk'] = pd.qcut(score, q=4, labels=labels, duplicates='drop')
df_synthetic.dropna(subset=['rockfall_risk'], inplace=True)
df_synthetic['data_source'] = 'Synthetic'

print(f"✓ Synthetic data generated: {len(df_synthetic)} samples")
# The last two columns are the label and the source tag, not sensor features.
print(f"  Features: {list(df_synthetic.columns[:-2])}")

# ========================================================================
# PART B: Download Kaggle Landslide Dataset
# ========================================================================
# Downloads the dataset with opendatasets and loads the first CSV found below
# `download_dir` into `df_kaggle`.
print("\n[PART B] Downloading Kaggle Landslide Dataset...")
print("-"*70)

dataset_url = 'https://www.kaggle.com/datasets/snehilmathur/landslide-dataset-for-classification'
download_dir = '../data/kaggle_landslide'

print(f"Downloading from Kaggle: {dataset_url}")
print("NOTE: You will be prompted to enter your Kaggle credentials.")
print("If you haven't set up Kaggle API, see README for setup instructions.\n")

# Download dataset from Kaggle
od.download(dataset_url, download_dir)

# Find and load the CSV file.
# opendatasets extracts the files into a sub-folder of `download_dir` named
# after the dataset, so a flat os.listdir(download_dir) would not see them.
# Walking the tree works for both the nested and the flat layout; sorting makes
# the chosen file deterministic when several CSVs are present.
csv_files = sorted(
    os.path.join(root, name)
    for root, _subdirs, names in os.walk(download_dir)
    for name in names
    if name.lower().endswith('.csv')
)
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in {download_dir}. Please check the download.")

kaggle_path = csv_files[0]
df_kaggle = pd.read_csv(kaggle_path)

print(f"\n✓ Kaggle dataset loaded: {len(df_kaggle)} samples")
print(f"  Features: {list(df_kaggle.columns)}")

# ========================================================================
# PART C: Process Kaggle Dataset
# ========================================================================
# Converts the Kaggle frame to the schema used by the synthetic data:
# a `rockfall_risk` label column, one-hot encoded soil type and a
# `data_source` tag.
print("\n[PART C] Processing Kaggle Dataset...")
print("-"*70)

# Standardize risk categories - convert to string first to avoid categorical issues
if 'Landslide_Risk' in df_kaggle.columns:
    df_kaggle['Landslide_Risk'] = df_kaggle['Landslide_Risk'].astype(str)
    risk_mapping = {'Low': 'Low', 'Moderate': 'Medium', 'High': 'High'}
    df_kaggle['rockfall_risk'] = df_kaggle['Landslide_Risk'].map(risk_mapping)

    # .map() turns every label that is not a key of risk_mapping into NaN;
    # report that instead of letting rows lose their label silently.
    unmapped_count = int(df_kaggle['rockfall_risk'].isna().sum())
    if unmapped_count:
        print(f"⚠ {unmapped_count} rows have a Landslide_Risk value outside "
              f"{sorted(risk_mapping)}; their rockfall_risk is left as NaN")

    # Add some Critical samples (20% of High becomes Critical)
    high_indices = df_kaggle[df_kaggle['rockfall_risk'] == 'High'].index
    if len(high_indices) > 0:
        critical_indices = np.random.choice(high_indices, size=int(len(high_indices) * 0.2), replace=False)
        df_kaggle.loc[critical_indices, 'rockfall_risk'] = 'Critical'

    df_kaggle = df_kaggle.drop('Landslide_Risk', axis=1)
elif 'rockfall_risk' not in df_kaggle.columns:
    # Everything below (and the merge step) needs a label column; fail here
    # with an explicit message rather than with an incidental KeyError later.
    raise KeyError(
        "Kaggle dataset has neither a 'Landslide_Risk' nor a 'rockfall_risk' column; "
        f"available columns: {list(df_kaggle.columns)}"
    )

# Handle Soil_Type encoding (one-hot encoding if present).
# dtype=int keeps the indicator columns numeric on every pandas version
# (pandas 2.x defaults to bool, which becomes `object` once NaN rows from the
# other data source are appended and is then ignored by numeric-only analysis).
if 'Soil_Type' in df_kaggle.columns:
    soil_dummies = pd.get_dummies(df_kaggle['Soil_Type'], prefix='soil', dtype=int)
    df_kaggle = pd.concat([df_kaggle, soil_dummies], axis=1)
    df_kaggle = df_kaggle.drop('Soil_Type', axis=1)

df_kaggle['data_source'] = 'Kaggle'

print(f"✓ Risk categories standardized: {df_kaggle['rockfall_risk'].value_counts().to_dict()}")

# ========================================================================
# PART D: Merge Datasets
# ========================================================================
# Stacks the synthetic and Kaggle rows into a single frame `df`. Features that
# exist in only one source are filled with NaN for the rows of the other.
print("\n[PART D] Merging Synthetic + Kaggle Datasets...")
print("-"*70)

# Align columns - add missing features as NaN (we'll handle in preprocessing)
all_features = set(df_synthetic.columns) | set(df_kaggle.columns)
all_features.discard('rockfall_risk')
all_features.discard('data_source')

# Iterate in sorted order: a set of strings has no stable iteration order
# between interpreter sessions (hash randomisation), which would otherwise
# make the column order of the merged frame and the saved CSV vary per run.
for col in sorted(all_features):
    if col not in df_synthetic.columns:
        df_synthetic[col] = np.nan
    if col not in df_kaggle.columns:
        df_kaggle[col] = np.nan

# Combine datasets
df = pd.concat([df_synthetic, df_kaggle], ignore_index=True)

feature_columns = [c for c in df.columns if c not in ('rockfall_risk', 'data_source')]

print(f"✓ Combined dataset created: {len(df)} total samples")
print(f"  - Synthetic samples: {len(df_synthetic)}")
print(f"  - Kaggle samples: {len(df_kaggle)}")
print(f"\n✓ Total features: {len(feature_columns)}")

# ========================================================================
# PART E: Data Overview
# ========================================================================
# Prints a summary of the combined frame `df`, plots the class balance overall
# and per data source, and writes the frame to ../data/rockfall_data.csv.
print("\n" + "="*70)
print("COMBINED DATASET OVERVIEW")
print("="*70)

# Severity order used for every table and plot in this section.
risk_levels = ['Low', 'Medium', 'High', 'Critical']

print("\nFirst 5 rows:")
print(df.head())

print("\nDataset Information:")
df.info()

print("\nDescriptive Statistics:")
print(df.describe())

print("\nClass Distribution:")
# Show classes by severity rather than alphabetically.
print(df['rockfall_risk'].value_counts().reindex(risk_levels, fill_value=0))

print("\nData Source Distribution:")
print(df['data_source'].value_counts())

# Visualization: Class Distribution
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
sns.countplot(x='rockfall_risk', data=df, order=risk_levels, palette='viridis')
plt.title('Combined Dataset - Risk Distribution', fontweight='bold')
plt.xlabel('Risk Category')
plt.ylabel('Count')

plt.subplot(1, 2, 2)
# unstack() orders the risk columns alphabetically (Critical, High, Low,
# Medium); reindex to severity order so that the colour list below lines up
# as green=Low, orange=Medium, red=High, dark red=Critical.
risk_by_source = (
    df.groupby(['data_source', 'rockfall_risk'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=risk_levels, fill_value=0)
)
risk_by_source.plot(kind='bar', stacked=True, ax=plt.gca(), color=['#2ecc71', '#f39c12', '#e74c3c', '#c0392b'])
plt.title('Risk Distribution by Data Source', fontweight='bold')
plt.xlabel('Data Source')
plt.ylabel('Count')
plt.legend(title='Risk Level', bbox_to_anchor=(1.05, 1))
plt.xticks(rotation=0)

plt.tight_layout()
plt.show()

# Save the combined dataset
output_path = '../data/rockfall_data.csv'
df.to_csv(output_path, index=False)

print(f"\n✓ Combined dataset saved to {output_path}")
print("="*70)

ROCKFALL RISK PREDICTION - DATA GENERATION & INTEGRATION

[PART A] Generating Synthetic Rockfall Sensor Data...
----------------------------------------------------------------------
✓ Synthetic data generated: 10000 samples
  Features: ['seismic_activity', 'vibration_level', 'joint_water_pressure', 'displacement_mm', 'rainfall_mm']

[PART B] Loading Kaggle Landslide Dataset...
----------------------------------------------------------------------
⚠ Could not download from Kaggle: No module named 'opendatasets'
⚠ Creating sample Kaggle-like data instead...
✓ Sample Kaggle-like data created: 5000 samples
  Features: ['Temperature', 'Humidity', 'Rain', 'Moisture', 'Slope_Angle', 'Soil_Type', 'Landslide_Risk']


TypeError: Cannot setitem on a Categorical with a new category (Critical), set the categories first

# Enhanced Exploratory Data Analysis (EDA)

Now let's perform a deeper analysis, including correlation analysis, distribution analysis, and outlier detection.

In [None]:
# 1. Correlation Analysis - Heatmap
# Selects the numeric predictor columns of `df` into `features` and draws the
# pairwise correlation matrix of those columns as an annotated heatmap.
print("=== Correlation Analysis ===\n")

# Numeric columns only; the target columns are filtered out by name.
non_feature_cols = ('rockfall_risk', 'risk_encoded')
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
features = [name for name in numeric_cols if name not in non_feature_cols]

print(f"Analyzing {len(features)} numeric features:")
print(features)

# Correlation matrix restricted to the feature columns
correlation_matrix = df[features].corr()

# The figure grows with the number of features but never drops below 10 inches.
fig_size = max(10, len(features) * 0.8)
plt.figure(figsize=(fig_size, fig_size))
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    annot_kws={'size': 8},
)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Heatmap of All Features (Synthetic + Kaggle)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

for line in (
    "\nInterpretation:",
    "- Strong positive correlations indicate features that tend to increase together",
    "- This helps identify redundant features or multicollinearity issues",
    "- Note: NaN values from missing features are excluded from correlation calculation",
):
    print(line)

In [None]:
# 2. Correlation with Target Variable
# Adds an ordinal `risk_encoded` column to `df` and plots the correlation of
# every column in `features` with it.

# Encode risk levels by severity. An alphabetical label encoding would give
# Critical=0, High=1, Low=2, Medium=3, which scrambles the ordering and makes a
# linear correlation with the code meaningless.
risk_rank = {'Low': 0, 'Medium': 1, 'High': 2, 'Critical': 3}
# Rows without a recognised label become NaN and are skipped by .corr().
df['risk_encoded'] = df['rockfall_risk'].map(risk_rank).astype(float)

# Calculate correlation with encoded target
feature_target_corr = (
    df[features + ['risk_encoded']]
    .corr()['risk_encoded']
    .drop('risk_encoded')
    .sort_values(ascending=False)
)

plt.figure(figsize=(10, 6))
feature_target_corr.plot(kind='barh', color='steelblue')
plt.title('Correlation of Features with Rockfall Risk', fontsize=14, fontweight='bold')
plt.xlabel('Correlation Coefficient')
plt.ylabel('Sensor Features')
plt.axvline(x=0, color='black', linestyle='--', linewidth=0.8)
plt.tight_layout()
plt.show()

print("\nFeature-Target Correlations:")
print(feature_target_corr)
print("\nInterpretation:")
print("- Features with higher correlation have stronger linear relationships with risk level")
print("- These are likely to be important predictors in our models")

In [None]:
# 3. Distribution Analysis - Histograms and KDE Plots
# Draws one histogram (with a KDE overlay on a secondary y-axis) per column in
# `features`, laid out on a grid three plots wide.
print("\n=== Distribution Analysis ===\n")

# Plot for features that have data
n_features = len(features)
n_cols = 3
# At least one row, so that an empty feature list does not ask for a 0-row grid.
n_rows = max(1, (n_features + n_cols - 1) // n_cols)

# squeeze=False always returns a 2-D array of axes, so flattening works for
# any number of features (including exactly one).
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
axes = axes.ravel()

for idx, feature in enumerate(features):
    ax = axes[idx]
    # Only plot if feature has non-null values
    feature_data = df[feature].dropna()
    if feature_data.empty:
        continue
    ax.hist(feature_data, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
    # A KDE cannot be estimated for a constant column (zero variance), so the
    # overlay is skipped in that case instead of raising.
    if feature_data.nunique() > 1:
        feature_data.plot(kind='kde', ax=ax, secondary_y=True, color='red', linewidth=2)
    ax.set_title(f'Distribution of {feature}', fontweight='bold', fontsize=10)
    ax.set_xlabel(feature, fontsize=9)
    ax.set_ylabel('Frequency', fontsize=9)
    ax.grid(alpha=0.3)

# Remove extra subplots
for unused_ax in axes[n_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

print("Interpretation:")
print("- Histograms show the frequency distribution of each feature")
print("- KDE (red line) shows the probability density")
print("- Some features may show bimodal distributions (two peaks) due to data coming from two sources")

In [None]:
# 4. Box Plots by Risk Category
# Draws one box plot per column in `features`, split by `rockfall_risk`, on a
# grid three plots wide.
print("\n=== Distribution by Risk Category ===\n")

n_features = len(features)
n_cols = 3
# At least one row, so that an empty feature list does not ask for a 0-row grid.
n_rows = max(1, (n_features + n_cols - 1) // n_cols)

# squeeze=False always returns a 2-D array of axes, so flattening works for
# any number of features (including exactly one).
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4 * n_rows), squeeze=False)
axes = axes.ravel()

risk_order = ['Low', 'Medium', 'High', 'Critical']

for idx, feature in enumerate(features):
    ax = axes[idx]
    # Only plot if feature has sufficient non-null values
    feature_data = df[[feature, 'rockfall_risk']].dropna()
    if feature_data.empty:
        continue
    sns.boxplot(data=feature_data, x='rockfall_risk', y=feature, order=risk_order,
                palette='Set2', ax=ax)
    ax.set_title(f'{feature} by Risk Category', fontweight='bold', fontsize=10)
    ax.set_xlabel('Risk Level', fontsize=9)
    ax.set_ylabel(feature, fontsize=9)
    ax.grid(alpha=0.3, axis='y')
    ax.tick_params(axis='x', rotation=45)

# Remove extra subplots
for unused_ax in axes[n_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

print("Interpretation:")
print("- Box plots show median (line), quartiles (box), and outliers (dots) for each risk category")
print("- Clear separation between risk categories indicates the feature is a good predictor")
print("- Note: Some features only have data from one source (synthetic or Kaggle)")

In [None]:
# 5. Pair Plot - Visualizing Feature Relationships
# Draws a seaborn pair plot of the five most complete columns in `features`,
# coloured by risk level, on a random sample of at most 1000 complete rows.
print("\n=== Pair Plot Analysis ===\n")
print("Generating pair plot for key features (this may take a moment)...")

# Defined here as well so the cell does not depend on another cell having run.
risk_order = ['Low', 'Medium', 'High', 'Critical']

# Select top features with most complete data for pair plot (to avoid clutter)
feature_completeness = df[features].notna().sum().sort_values(ascending=False)
top_features = feature_completeness.head(5).index.tolist()

print(f"Plotting top 5 features with most complete data: {top_features}")

# Sample data for faster plotting.
# The sample size is capped by the rows that are complete in the PLOTTED
# columns. Capping it by len(df.dropna()) would look at every column, and since
# the two data sources have disjoint features no row is complete across all of
# them - the cap would be 0 and the plot would never be drawn.
pair_data = df[top_features + ['rockfall_risk']].dropna()
sample_size = min(1000, len(pair_data))
df_sample = pair_data.sample(n=sample_size, random_state=42)

if len(df_sample) > 50:  # Only plot if we have enough data
    pairplot = sns.pairplot(df_sample,
                            hue='rockfall_risk',
                            palette='Set1',
                            hue_order=risk_order,
                            diag_kind='kde',
                            plot_kws={'alpha': 0.6, 's': 30},
                            height=2.5)
    pairplot.fig.suptitle('Pair Plot: Top Features Colored by Risk Level',
                          y=1.01, fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

    print("\nInterpretation:")
    print("- Diagonal: Distribution of each feature by risk category")
    print("- Off-diagonal: Scatter plots showing relationships between feature pairs")
    print("- Good class separation indicates features work well together for prediction")
else:
    print("⚠ Not enough complete data for pair plot after removing NaN values")

In [None]:
# 6. Outlier Detection using IQR Method
# For every column in `features`, counts the values lying more than 1.5 * IQR
# outside the quartiles and collects the result in `outlier_df`.
print("\n=== Outlier Detection ===\n")

outlier_summary = {}

for feature in features:
    # quantile() ignores NaN, so the bounds describe the observed values only.
    Q1 = df[feature].quantile(0.25)
    Q3 = df[feature].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = df[(df[feature] < lower_bound) | (df[feature] > upper_bound)]
    outlier_count = len(outliers)
    # Rate relative to the rows that actually have this feature. Dividing by
    # len(df) would also count rows where the feature is NaN (rows from the
    # other data source) and understate the percentage.
    observed_count = int(df[feature].notna().sum())
    outlier_percentage = (outlier_count / observed_count) * 100 if observed_count else 0.0

    outlier_summary[feature] = {
        'Count': outlier_count,
        'Percentage': f'{outlier_percentage:.2f}%',
        'Lower_Bound': f'{lower_bound:.2f}',
        'Upper_Bound': f'{upper_bound:.2f}'
    }

outlier_df = pd.DataFrame(outlier_summary).T
print("Outlier Summary (using IQR method):")
print(outlier_df)

print("\n" + "="*60)
print("Decision on Outliers:")
print("="*60)
print("Since our data is synthetically generated with intentional high-risk scenarios,")
print("these 'outliers' represent critical conditions (high seismic activity, displacement, etc.)")
print("that are ESSENTIAL for predicting high/critical risk categories.")
print("\nAction: We will RETAIN all outliers as they contain important information")
print("about extreme conditions that lead to rockfall events.")
print("="*60)

# Handling Missing Values from Data Integration

In [None]:
# 7. Missing Value Analysis
# Tabulates the number and share of missing values per column of `df`, plots
# the columns that have any, and prints the handling strategy.
print("\n=== Missing Value Analysis ===\n")

# Check missing values
missing_counts = df.isnull().sum()
missing_percentages = (missing_counts / len(df)) * 100

missing_df = pd.DataFrame({
    'Feature': missing_counts.index,
    'Missing_Count': missing_counts.values,
    'Missing_Percentage': missing_percentages.values
}).sort_values('Missing_Count', ascending=False)

# Only the columns that actually have gaps are reported and plotted.
missing_data = missing_df[missing_df['Missing_Count'] > 0]

print("Missing Values Summary:")
print(missing_data)

# Visualize missing data pattern
if len(missing_data) > 0:
    # The figure is created only when there is something to draw, so no empty
    # figure is left open when the data has no missing values.
    plt.figure(figsize=(12, 6))
    plt.barh(missing_data['Feature'], missing_data['Missing_Percentage'], color='coral')
    plt.xlabel('Missing Percentage (%)', fontweight='bold')
    plt.ylabel('Feature', fontweight='bold')
    plt.title('Missing Data by Feature', fontweight='bold', fontsize=14)
    plt.axvline(x=50, color='red', linestyle='--', label='50% threshold')
    plt.legend()
    plt.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.show()
else:
    print("✓ No missing values detected!")

print("\n" + "="*70)
print("STRATEGY FOR MISSING VALUES:")
print("="*70)
print("Why we have missing values:")
print("  - Synthetic data has: seismic, vibration, water pressure, displacement, rainfall")
print("  - Kaggle data has: temperature, humidity, rain, moisture, slope angle, soil type")
print("  - Each source lacks features from the other")

print("\nHandling Strategy (in preprocessing notebook):")
print("  Option 1: Use only complete cases (rows with no NaN)")
print("  Option 2: Impute missing values with mean/median/mode")
print("  Option 3: Train separate models for each data source")
print("  Option 4: Use models that handle missing data (e.g., XGBoost)")

print("\nFor this project, we'll use OPTION 1 or 2 in the preprocessing phase.")
print("="*70)