In [5]:
# rockfall-prediction-system/notebooks/02_data_preprocessing.py

import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the raw rockfall dataset from the project's data folder
df = pd.read_csv('../data/rockfall_data.csv')

# The 'rockfall_risk' column is the target (y); every other column is a feature (X)
y = df['rockfall_risk']
X = df.drop(columns=['rockfall_risk'])

# Convert the string risk labels into integer class codes.
# The fitted encoder is kept in `label_encoder` so codes can be mapped back
# to labels through `label_encoder.classes_`.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print("Original Labels:", label_encoder.classes_)
print("Encoded Labels:", np.unique(y_encoded))


# Hold out 20% of the rows as a test set. The split is seeded for
# reproducibility and stratified on the encoded target.
split_options = dict(test_size=0.2, random_state=42, stratify=y_encoded)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

# Report the resulting feature-matrix shapes
train_shape = X_train.shape
test_shape = X_test.shape
print("\nShape of training features:", train_shape)
print("Shape of testing features:", test_shape)

# Save Processed Data
processed_data_dir = '../data/processed'
# exist_ok=True creates the directory (and any missing parents) when needed and
# is a no-op when it already exists, so no separate existence check is required
# and there is no window between "check" and "create".
os.makedirs(processed_data_dir, exist_ok=True)

# Feature splits are DataFrames and are written without the index column.
X_train.to_csv(os.path.join(processed_data_dir, 'X_train.csv'), index=False)
X_test.to_csv(os.path.join(processed_data_dir, 'X_test.csv'), index=False)
# Target splits are wrapped in a Series so they can be written as a
# single-column CSV under the original target column name.
pd.Series(y_train).to_csv(os.path.join(processed_data_dir, 'y_train.csv'), index=False, header=['rockfall_risk'])
pd.Series(y_test).to_csv(os.path.join(processed_data_dir, 'y_test.csv'), index=False, header=['rockfall_risk'])

print("\nProcessed data saved successfully.")

Original Labels: ['Critical' 'High' 'Low' 'Medium']
Encoded Labels: [0 1 2 3]

Shape of training features: (4000, 5)
Shape of testing features: (1000, 5)

Processed data saved successfully.


# Feature Scaling Discussion

Before we proceed with model training, let's discuss why feature scaling is important and compare different scaling methods.

In [None]:
# Why Feature Scaling is Important
#
# Prints a short explanation of why scaling matters, compares StandardScaler
# with MinMaxScaler, and states which scaler the later pipelines use.

# Horizontal rule used to frame each section of the printed report
section_rule = "=" * 70

print(section_rule)
print("IMPORTANCE OF FEATURE SCALING")
print(section_rule)
print("\n1. DISTANCE-BASED ALGORITHMS (SVM, KNN):")
print("   - These algorithms use distances between data points")
print("   - Features with larger scales dominate the distance calculation")
print("   - Example: joint_water_pressure (50-450) vs seismic_activity (0.01-1.6)")
print("   - Without scaling, water pressure would dominate predictions")

print("\n2. GRADIENT DESCENT OPTIMIZATION (Logistic Regression, Neural Networks):")
print("   - Features on different scales cause slow/unstable convergence")
print("   - Scaling helps the optimization algorithm converge faster")

print("\n3. ALGORITHMS THAT DON'T REQUIRE SCALING:")
print("   - Tree-based models (Random Forest, Decision Tree)")
print("   - These use splits, not distances, so scale doesn't matter")

print("\n" + section_rule)
print("SCALING METHODS COMPARISON")
print(section_rule)

print("\nStandardScaler (Z-score normalization):")
print("  Formula: z = (x - mean) / std_dev")
print("  Result: Mean=0, Std=1, range typically [-3, 3]")
# StandardScaler is NOT robust to outliers (they shift the mean and inflate the
# std); it is only less distorted by them than MinMaxScaler, whose output range
# is set directly by the extreme values.
print("  Best for: Roughly Gaussian data; outliers distort it less than MinMaxScaler")
print("  Sensitive to: Outliers still shift the mean/std (RobustScaler is more robust)")

print("\nMinMaxScaler:")
print("  Formula: x_scaled = (x - min) / (max - min)")
print("  Result: Range [0, 1] (or custom range)")
print("  Best for: Bounded features, when you need a specific range")
print("  Sensitive to: Outliers (they can compress the majority of data)")

print("\n" + section_rule)
print("DECISION: We will use StandardScaler in our pipelines because:")
print("  1. Our data contains outliers (high-risk scenarios)")
print("  2. MinMaxScaler would compress most values due to extreme outliers")
print("  3. Works well with SVM and Logistic Regression")
print(section_rule)

In [None]:
# Optional: Quick Comparison - StandardScaler vs MinMaxScaler
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


def _fit_and_score_svm(scaler, X_fit, X_eval, y_fit, y_eval):
    """Scale features with `scaler`, train an SVC on them, and score it.

    The scaler is fitted on `X_fit` only and then applied unchanged to
    `X_eval`, so the evaluation rows never influence the scaling statistics.

    Args:
        scaler: An unfitted scaler exposing `fit_transform` and `transform`.
        X_fit: Features used to fit both the scaler and the classifier.
        X_eval: Features used for evaluation.
        y_fit: Targets matching `X_fit`.
        y_eval: Targets matching `X_eval`.

    Returns:
        Tuple `(X_fit_scaled, X_eval_scaled, model, y_pred, accuracy)`.
    """
    X_fit_scaled = scaler.fit_transform(X_fit)
    X_eval_scaled = scaler.transform(X_eval)

    model = SVC(random_state=42)
    model.fit(X_fit_scaled, y_fit)

    y_pred = model.predict(X_eval_scaled)
    return X_fit_scaled, X_eval_scaled, model, y_pred, accuracy_score(y_eval, y_pred)


print("\n" + "="*70)
print("SCALING COMPARISON EXPERIMENT")
print("="*70)

# Test with SVM (sensitive to scaling)
# NOTE(review): both variants are scored on the held-out test set. That is fine
# as a quick illustration; if this result is meant to drive the final scaler
# choice, compare with cross-validation on the training data instead.

# StandardScaler
scaler_standard = StandardScaler()
(X_train_standard, X_test_standard, svm_standard,
 y_pred_standard, acc_standard) = _fit_and_score_svm(
    scaler_standard, X_train, X_test, y_train, y_test
)

# MinMaxScaler
scaler_minmax = MinMaxScaler()
(X_train_minmax, X_test_minmax, svm_minmax,
 y_pred_minmax, acc_minmax) = _fit_and_score_svm(
    scaler_minmax, X_train, X_test, y_train, y_test
)

print(f"\nSVM with StandardScaler: {acc_standard:.4f}")
print(f"SVM with MinMaxScaler:    {acc_minmax:.4f}")
print(f"\nDifference: {abs(acc_standard - acc_minmax):.4f}")

if acc_standard > acc_minmax:
    print("\n✓ StandardScaler performs better (as expected with outliers)")
elif acc_minmax > acc_standard:
    print("\n✓ MinMaxScaler performs better (surprising given outliers!)")
else:
    print("\n✓ Both scalers perform equally well")

print("="*70)