In [1]:
# Cell 1: Import all required libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import warnings
warnings.filterwarnings('ignore')

# Set style for better plots
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("viridis")

print("‚úÖ All libraries imported successfully!")

‚úÖ All libraries imported successfully!


In [2]:
# Cell 2: Set paths and configuration
csv_path = "datasets/latest_flood_dataset.csv"
model_dir = "models"

# Create model directory if it doesn't exist
os.makedirs(model_dir, exist_ok=True)

print(f"üìÅ Dataset path: {csv_path}")
print(f"üìÅ Model directory: {model_dir}")

üìÅ Dataset path: datasets/latest_flood_dataset.csv
üìÅ Model directory: models


In [3]:
# Cell 3: Load the dataset
print("üìÇ Loading dataset...")

if not os.path.exists(csv_path):
    print(f"‚ùå Dataset not found at {csv_path}")
    print("Please run create_validated_data.py first")
else:
    df = pd.read_csv(csv_path, parse_dates=['date'] if 'date' in pd.read_csv(csv_path, nrows=1).columns else None)
    print(f"‚úÖ Dataset loaded: {len(df):,} records")
    print(f"üìä Shape: {df.shape}")
    
    # Display dataset info
    print(f"\nüìä Dataset Overview:")
    print(f"   Cities: {df['city'].nunique()}")
    if 'date' in df.columns:
        print(f"   Date range: {df['date'].min()} to {df['date'].max()}")
    print(f"   Flood events: {df['flood_label'].sum():,} ({df['flood_label'].mean()*100:.2f}%)")
    
    # Show first few rows
    print("\nüëÄ First 5 rows:")
    display(df.head())

üìÇ Loading dataset...
‚úÖ Dataset loaded: 167,637 records
üìä Shape: (167637, 47)

üìä Dataset Overview:
   Cities: 51
   Date range: 2015-01-01 00:00:00 to 2023-12-31 00:00:00
   Flood events: 4,795 (2.86%)

üëÄ First 5 rows:


Unnamed: 0,date,city,region,country,year,month,season,day_of_year,rain,rain_3day,...,week_of_year,is_pre_monsoon,is_post_monsoon,rain_7day_vs_30day,geographic_region,monsoon_rain_7day,coastal_storm_risk,mountain_rain_risk,monthly_avg_pressure,pressure_anomaly
0,2015-01-01,Abbottabad,mountain,Pakistan,2015,1,winter,1,0.0,0.0,...,1,0,0,0.0,mountain,0.0,0,0,90.137312,0.192688
1,2015-01-02,Abbottabad,mountain,Pakistan,2015,1,winter,2,0.0,0.0,...,1,0,0,0.0,mountain,0.0,0,0,90.137312,0.062688
2,2015-01-03,Abbottabad,mountain,Pakistan,2015,1,winter,3,0.0,0.0,...,1,0,0,0.0,mountain,0.0,0,0,90.137312,-0.017312
3,2015-01-04,Abbottabad,mountain,Pakistan,2015,1,winter,4,0.0,0.0,...,1,0,0,0.0,mountain,0.0,0,0,90.137312,0.282688
4,2015-01-05,Abbottabad,mountain,Pakistan,2015,1,winter,5,0.0,0.0,...,2,0,0,0.0,mountain,0.0,0,0,90.137312,0.142688


In [4]:
# Cell 4: Select features for training

# Base features (always include)
base_features = [
    'rain', 'rain_3day', 'rain_7day', 'rain_15day', 'rain_30day',
    'pressure', 'pressure_change', 'pressure_3day_trend',
    'temp', 'temp_change',
    'humidity', 'humidity_change',
    'wind_speed',
    'month', 'day_of_year'
]

# Advanced features from feature_engineering.py
advanced_features = [
    'rain_intensity', 'consecutive_rain_days', 'heavy_rain_day',
    'rapid_pressure_drop', 'pressure_anomaly',
    'temp_3day_trend', 'extreme_heat', 'extreme_cold',
    'high_humidity',
    'is_monsoon_season', 'is_winter_rain_season',
    'monsoon_rain_7day'
]

# Define feature sets
feature_sets = {
    'base': base_features,
    'advanced': base_features + [f for f in advanced_features if f in df.columns],
    'all': [col for col in df.columns if col not in 
            ['date', 'city', 'region', 'country', 'flood_label', 'flood_severity', 
             'flood_type', 'data_source', 'year', 'season', 'geographic_region']]
}

# Choose which feature set to use
use_feature_set = 'advanced'  # Change to 'base' or 'all' if needed

# Get available features
available_features = [col for col in feature_sets[use_feature_set] if col in df.columns]

print(f"üîß Using {len(available_features)} features for training ({use_feature_set} set)")
print("\nüìã Features list:")
for i, feature in enumerate(available_features, 1):
    print(f"   {i:2d}. {feature}")

üîß Using 27 features for training (advanced set)

üìã Features list:
    1. rain
    2. rain_3day
    3. rain_7day
    4. rain_15day
    5. rain_30day
    6. pressure
    7. pressure_change
    8. pressure_3day_trend
    9. temp
   10. temp_change
   11. humidity
   12. humidity_change
   13. wind_speed
   14. month
   15. day_of_year
   16. rain_intensity
   17. consecutive_rain_days
   18. heavy_rain_day
   19. rapid_pressure_drop
   20. pressure_anomaly
   21. temp_3day_trend
   22. extreme_heat
   23. extreme_cold
   24. high_humidity
   25. is_monsoon_season
   26. is_winter_rain_season
   27. monsoon_rain_7day


In [5]:
# Cell 5: Prepare X and y
X = df[available_features]
y = df['flood_label']

# Check for class imbalance
class_ratio = y.mean()
print(f"\n‚öñÔ∏è Class balance: {class_ratio*100:.2f}% positive (flood) samples")
print(f"   Class 0 (No Flood): {(1-class_ratio)*100:.2f}%")
print(f"   Class 1 (Flood): {class_ratio*100:.2f}%")

# Handle any missing values
if X.isnull().sum().sum() > 0:
    print("\n‚ö†Ô∏è Missing values detected. Filling with column means...")
    X = X.fillna(X.mean())
    print("‚úÖ Missing values handled")
else:
    print("\n‚úÖ No missing values found")

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=available_features)

print(f"\nüìä Feature matrix shape: {X_scaled.shape}")


‚öñÔ∏è Class balance: 2.86% positive (flood) samples
   Class 0 (No Flood): 97.14%
   Class 1 (Flood): 2.86%

‚ö†Ô∏è Missing values detected. Filling with column means...
‚úÖ Missing values handled

üìä Feature matrix shape: (167637, 27)


In [6]:
# Cell 6: Split data with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nüìä Data Split:")
print(f"   Training set: {len(X_train):,} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"   Test set: {len(X_test):,} samples ({len(X_test)/len(X)*100:.1f}%)")
print(f"   Training flood events: {y_train.sum():,} ({y_train.mean()*100:.2f}%)")
print(f"   Test flood events: {y_test.sum():,} ({y_test.mean()*100:.2f}%)")

# Compute class weights for imbalanced data
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))
print(f"\n‚öñÔ∏è Class weights: {class_weight_dict}")



üìä Data Split:
   Training set: 134,109 samples (80.0%)
   Test set: 33,528 samples (20.0%)
   Training flood events: 3,836 (2.86%)
   Test flood events: 959 (2.86%)

‚öñÔ∏è Class weights: {np.int64(0): np.float64(0.5147229280050356), np.int64(1): np.float64(17.48031803962461)}


In [7]:
# Cell 7: Train Random Forest Classifier
print("\n" + "="*70)
print("üå≤ Training Random Forest Model...")
print("="*70)

rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

# Train the model
rf_model.fit(X_train, y_train)
print("‚úÖ Random Forest training complete!")

# Evaluate Random Forest
rf_train_score = rf_model.score(X_train, y_train)
rf_test_score = rf_model.score(X_test, y_test)
rf_pred = rf_model.predict(X_test)
rf_proba = rf_model.predict_proba(X_test)[:, 1]
rf_auc = roc_auc_score(y_test, rf_proba)
rf_f1 = f1_score(y_test, rf_pred)

print(f"\nüìä Random Forest Results:")
print(f"   Training accuracy: {rf_train_score:.4f}")
print(f"   Test accuracy: {rf_test_score:.4f}")
print(f"   ROC-AUC score: {rf_auc:.4f}")
print(f"   F1 Score: {rf_f1:.4f}")


üå≤ Training Random Forest Model...
‚úÖ Random Forest training complete!

üìä Random Forest Results:
   Training accuracy: 0.9902
   Test accuracy: 0.9883
   ROC-AUC score: 0.9981
   F1 Score: 0.8269


In [8]:
# Cell 8: Train Gradient Boosting Classifier
print("\n" + "="*70)
print("üöÄ Training Gradient Boosting Model...")
print("="*70)

gb_model = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    random_state=42
)

# Train the model
gb_model.fit(X_train, y_train)
print("‚úÖ Gradient Boosting training complete!")

# Evaluate Gradient Boosting
gb_train_score = gb_model.score(X_train, y_train)
gb_test_score = gb_model.score(X_test, y_test)
gb_pred = gb_model.predict(X_test)
gb_proba = gb_model.predict_proba(X_test)[:, 1]
gb_auc = roc_auc_score(y_test, gb_proba)
gb_f1 = f1_score(y_test, gb_pred)

print(f"\nüìä Gradient Boosting Results:")
print(f"   Training accuracy: {gb_train_score:.4f}")
print(f"   Test accuracy: {gb_test_score:.4f}")
print(f"   ROC-AUC score: {gb_auc:.4f}")
print(f"   F1 Score: {gb_f1:.4f}")


üöÄ Training Gradient Boosting Model...
‚úÖ Gradient Boosting training complete!

üìä Gradient Boosting Results:
   Training accuracy: 0.9996
   Test accuracy: 0.9978
   ROC-AUC score: 0.9992
   F1 Score: 0.9608


In [None]:
# Cell 9: Train XGBoost (if available)
try:
    
    # Train the model
    
    # Evaluate XGBoost
    
    # Store models for comparison
    models = {
        'RandomForest': rf_model,
        'GradientBoosting': gb_model,
        
    }
    
    results = {
        'RandomForest': {'train_acc': rf_train_score, 'test_acc': rf_test_score, 'auc': rf_auc, 'f1': rf_f1},
        'GradientBoosting': {'train_acc': gb_train_score, 'test_acc': gb_test_score, 'auc': gb_auc, 'f1': gb_f1},
        
    }
    
except ImportError:
    print("\n‚ö†Ô∏è XGBoost not installed. Using only Random Forest and Gradient Boosting.")
    models = {
        'RandomForest': rf_model,
        'GradientBoosting': gb_model
    }
    
    results = {
        'RandomForest': {'train_acc': rf_train_score, 'test_acc': rf_test_score, 'auc': rf_auc, 'f1': rf_f1},
        'GradientBoosting': {'train_acc': gb_train_score, 'test_acc': gb_test_score, 'auc': gb_auc, 'f1': gb_f1}
    }

SyntaxError: invalid syntax (1786203847.py, line 13)