In [21]:
%pip install pandas
%pip install -U scikit-learn
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.
Collecting scikit-learn
  Using cached scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting scipy>=1.5.0 (from scikit-learn)
  Using cached scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
Collecting joblib>=1.1.1 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.1 MB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier

# ----------------------------
# Config
# ----------------------------
WINDOW_SIZE = 5  # use past 5 days
RANDOM_STATE = 8

# ----------------------------
# 1. Load Data
# ----------------------------
df = pd.read_csv("dataset/flood_features.csv", parse_dates=["date"],
                 date_parser=lambda x: pd.to_datetime(x, format="%m/%d/%Y"))
df.drop(columns=[".geo", "system:index"], inplace=True)  # drop index column if exists
flood_dates = pd.read_csv("flood_ground_truth.csv", parse_dates=["Flood Date"])

# Add flood label (binary)
df["is_flood"] = df["date"].isin(flood_dates["Flood Date"]).astype(int)

# Sort by time to maintain chronological order
df = df.sort_values("date").reset_index(drop=True)

# ----------------------------
# 2. Handle Missing / Fake Zero Values
# ----------------------------

# Define base features (original, non-lagged)
base_features = [
    "rainfall_3day_cumulative_mm",
    "rainfall_max_mm",
    "rainfall_mean_mm",
    "rainfall_std_mm",
    "soil_moisture_top10cm_mm",
    "subsurface_runoff_mm",
    "surface_runoff_mm"
]

# Replace 0 with NaN (only if 0 indicates missing)
for col in base_features:
    df[col] = df[col].replace(0, np.nan)

# Create 'day-of-year' for seasonal imputation
df['doy'] = df['date'].dt.dayofyear

# Impute NaNs using seasonal (day-of-year) mean
for col in base_features:
    seasonal_avg = df.groupby('doy')[col].transform('mean')
    df[col] = df[col].fillna(seasonal_avg)

# Drop the helper column
df.drop(columns=['doy'], inplace=True)

# ----------------------------
# 3. Create Temporal Lag Features
# ----------------------------
for lag in range(1, WINDOW_SIZE + 1):
    for col in base_features:
        df[f"{col}_lag{lag}"] = df[col].shift(lag)

# Drop initial rows that have NaNs due to lagging
df = df.dropna().reset_index(drop=True)

# ----------------------------
# 4. Prepare Train/Test Sets (Chronological Split)
# ----------------------------
split_index = int(len(df) * 0.8)
train_df = df.iloc[:split_index]
test_df = df.iloc[split_index:]

# X: use only lag features
X_train = train_df.drop(columns=["date", "is_flood"] + base_features)
X_test = test_df.drop(columns=["date", "is_flood"] + base_features)
y_train = train_df["is_flood"]
y_test = test_df["is_flood"]

  df = pd.read_csv("dataset/flood_features.csv", parse_dates=["date"],


In [None]:
def xgboost_classification_report(X_train, X_test, y_train, y_test):
    # ----------------------------
    # 5. Scale Features
    # ----------------------------
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # ----------------------------
    # 6. Train XGBoost Classifier
    # ----------------------------

    # Calculate class imbalance ratio
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

    xgb = XGBClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        scale_pos_weight=scale_pos_weight,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=RANDOM_STATE
    )

    xgb.fit(X_train_scaled, y_train)

    # ----------------------------
    # 7. Evaluate Model
    # ----------------------------
    y_pred = xgb.predict(X_test_scaled)
    y_prob = xgb.predict_proba(X_test_scaled)[:, 1]

    print(classification_report(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, y_prob))

xgboost_classification_report(X_train, X_test, y_train, y_test)


Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       1.00      0.93      0.96      1815
           1       0.04      0.45      0.07        11

    accuracy                           0.93      1826
   macro avg       0.52      0.69      0.52      1826
weighted avg       0.99      0.93      0.96      1826

ROC AUC Score: 0.9034810919108439


In [7]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch
  Downloading https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp38-cp38-linux_x86_64.whl (798.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.9/798.9 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting torchvision
  Downloading https://download.pytorch.org/whl/cu121/torchvision-0.19.1%2Bcu121-cp38-cp38-linux_x86_64.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio
  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.4.1%2Bcu121-cp38-cp38-linux_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting filelock (from torch)
  Downloading https://download.pytorch.org/whl/filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
Collecting s

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
print(torch.cuda.is_available())        # Should return True
print(torch.cuda.get_device_name(0))    # Should return something like "NVIDIA GeForce RTX 3060"

# ----------------------------
# 8. Build MLP Classifier
# ----------------------------
# Convert to torch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=32, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size=32)