# Flood Probability Prediction (Regression)
### Ensemble: Random Forest + XGBoost
Target: `FloodProbability` (continuous 0–1)
- No SMOTE (removed – not applicable to regression)
- 70% train | 10% validation | 20% test
- Full metrics + plots + feature importance

In [None]:
!pip install -q xgboost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

%matplotlib inline
# Use seaborn's 'whitegrid' style for the plots drawn later in the notebook.
sns.set(style='whitegrid')

In [None]:
# Mount Google Drive so the dataset stored there is reachable from the
# Colab filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Load the flood dataset from the mounted Drive folder and take a first look.
DATA_PATH = "/content/drive/MyDrive/Research/Flood prediction database/flood.csv"

df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head()

In [None]:
# Separate the regression target from the feature columns.
TARGET = 'FloodProbability'
y = df[TARGET]
X = df.drop(columns=TARGET)

# Stage 1: hold out 30% of the rows, leaving 70% for training.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Stage 2: two thirds of the hold-out become the test set (20% overall);
# the remaining third is the validation set (10% overall).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=2/3, random_state=42
)

split_sizes = (("Train", X_train), ("Validation", X_val), ("Test", X_test))
print(" | ".join(f"{name}: {len(part)}" for name, part in split_sizes))

In [None]:
# Configure the two base learners. They are NOT fitted individually here:
# VotingRegressor.fit() clones every base estimator and trains the clones,
# so a separate rf.fit()/xgb.fit() would train each model twice for no gain.
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

xgb = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

# Ensemble (average of the two base learners' predictions).
ensemble = VotingRegressor([('rf', rf), ('xgb', xgb)])
ensemble.fit(X_train, y_train)

# Rebind the names to the fitted clones held by the ensemble so that later
# cells (e.g. feature importances) can keep using `rf` and `xgb` directly.
# Same hyper-parameters, seed and training data as before, so the fitted
# models are equivalent to the ones a standalone fit would have produced.
rf = ensemble.named_estimators_['rf']
xgb = ensemble.named_estimators_['xgb']

print("Training completed!")

### Performance on Validation Set

In [None]:
# Score the ensemble on the validation split.
y_val_pred = ensemble.predict(X_val)

val_mse = mean_squared_error(y_val, y_val_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
val_r2 = r2_score(y_val, y_val_pred)

# RMSE is reported instead of raw MSE so the error is in the target's units.
val_report = (
    ("RMSE", np.sqrt(val_mse)),
    ("MAE", val_mae),
    ("R²", val_r2),
)
for metric_name, metric_value in val_report:
    # Left-pad the metric name to 4 characters so the colons line up.
    print(f"Validation {metric_name:<4} : {metric_value:.6f}")

### Performance on Test Set

In [None]:
# Score the ensemble on the held-out test split.
y_test_pred = ensemble.predict(X_test)

test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

# RMSE is reported instead of raw MSE so the error is in the target's units.
test_report = (
    ("RMSE", np.sqrt(test_mse)),
    ("MAE", test_mae),
    ("R²", test_r2),
)
for metric_name, metric_value in test_report:
    # Left-pad the metric name to 4 characters so the colons line up.
    print(f"Test {metric_name:<4} : {metric_value:.6f}")

### Plots

In [None]:
# Three diagnostic panels: actual-vs-predicted for the validation and test
# splits, plus a residual plot for the test split.
fig, ax = plt.subplots(1, 3, figsize=(18, 5))

# The y = x reference line spans the full target range so both scatter
# panels are drawn against the same diagonal.
diagonal = [y.min(), y.max()]

scatter_panels = (
    (ax[0], y_val, y_val_pred, 'steelblue', f'Validation R² = {val_r2:.4f}'),
    (ax[1], y_test, y_test_pred, 'darkorange', f'Test R² = {test_r2:.4f}'),
)
for panel, actual, predicted, colour, title in scatter_panels:
    panel.scatter(actual, predicted, alpha=0.5, color=colour)
    panel.plot(diagonal, diagonal, 'r--', lw=2)
    panel.set_xlabel('Actual')
    # Both panels get a y-label; the test panel previously had none.
    panel.set_ylabel('Predicted')
    panel.set_title(title)

# Residuals (actual minus predicted) on the test split.
residuals = y_test - y_test_pred
ax[2].scatter(y_test_pred, residuals, alpha=0.5, color='green')
ax[2].axhline(0, color='red', linestyle='--')
ax[2].set_xlabel('Predicted')
ax[2].set_ylabel('Residuals')
ax[2].set_title('Residual Plot (Test)')

plt.tight_layout()
plt.show()

### Feature Importance (Random Forest)

In [None]:
# Rank the features by the Random Forest's importance scores, highest first.
feat_names = X.columns
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

ranked_scores = importances[indices]
ranked_names = feat_names[indices]

# Horizontal bar chart with one bar per feature, in ranked order.
imp_fig, imp_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=ranked_scores, y=ranked_names, palette='magma', ax=imp_ax)
imp_ax.set_title('Feature Importance – Random Forest')
imp_ax.set_xlabel('Importance')
imp_fig.tight_layout()
plt.show()