<a href="https://colab.research.google.com/github/Kevan123/AI4SIDS/blob/main/ai4sids_predictive_model_converted.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
"""AI4SIDS Predictive Model


In [None]:
Automatically generated by Colab.


In [None]:
Original file is located at
    https://colab.research.google.com/drive/1xaybAlLzzFVl7L5LSGw9k-DCBa36pcVB
"""


In [None]:
# Re-import libraries and reload datasets
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer


In [None]:
# File paths
gauge_data_path = "/content/Updated_Caroni_River_Monthly_Gauge_Data.csv"
weather_data_path = "/content/Weather_Data_by_Sensor_ID_and_City.csv"
social_media_data_path = "/content/social_media_simulated_dataset_transformed_agg_only.xlsx"


In [None]:
# Load data
gauge_data = pd.read_csv(gauge_data_path)
weather_data = pd.read_csv(weather_data_path)
social_media_data = pd.read_excel(social_media_data_path)


In [None]:
# Rename key columns for merging
gauge_data.rename(columns={"Time_Sensor_ID": "time_sensor_id"}, inplace=True)
weather_data.rename(columns={"Time_Sensor_ID": "time_sensor_id"}, inplace=True)
social_media_data.rename(columns={"datestamp_gauge": "time_sensor_id"}, inplace=True)


In [None]:
# Add source prefixes
gauge_data = gauge_data.add_prefix("iot_")
weather_data = weather_data.add_prefix("wthr_")
social_media_data = social_media_data.add_prefix("sm_")


In [None]:
# Re-align key column names post-prefix
gauge_data.rename(columns={"iot_time_sensor_id": "time_sensor_id"}, inplace=True)
weather_data.rename(columns={"wthr_time_sensor_id": "time_sensor_id"}, inplace=True)
social_media_data.rename(columns={"sm_time_sensor_id": "time_sensor_id"}, inplace=True)


In [None]:
# Merge datasets
merged_data = gauge_data.merge(weather_data, on="time_sensor_id", how="outer")
merged_data = merged_data.merge(social_media_data, on="time_sensor_id", how="outer")


In [None]:
# Begin data cleaning and transformation
data = merged_data.copy()


In [None]:
# Impute numerical features
numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
num_imputer = SimpleImputer(strategy="mean")
data[numerical_cols] = num_imputer.fit_transform(data[numerical_cols])


In [None]:
# Impute categorical features
categorical_cols = data.select_dtypes(include=["object"]).columns.tolist()
cat_imputer = SimpleImputer(strategy="most_frequent")
data[categorical_cols] = cat_imputer.fit_transform(data[categorical_cols])


In [None]:
# Convert target to binary


In [None]:
# Detect and convert the flood event column
flood_event_col = next((col for col in data.columns if "Flood Event" in col), None)
if flood_event_col:
    data[flood_event_col] = data[flood_event_col].map({"Yes": 1, "No": 0})
    data.rename(columns={flood_event_col: "Flood_Event"}, inplace=True)
else:
    raise KeyError("Column containing 'Flood Event' not found.")


In [None]:

# Parse timestamp and derive time-based features
data["iot_Timestamp"] = pd.to_datetime(data["iot_Timestamp"], errors="coerce")
data["date"] = data["iot_Timestamp"].dt.date
data["hour"] = data["iot_Timestamp"].dt.hour
data["dayofweek"] = data["iot_Timestamp"].dt.dayofweek
data["month"] = data["iot_Timestamp"].dt.month


In [None]:
# Ensure target and spatial info is present
data = data.dropna(subset=["iot_Latitude", "iot_Longitude", "Flood_Event"])


In [None]:
# Inject label noise (flip 3% of the flood labels)
def inject_label_noise(df, target_col="Flood_Event", noise_level=0.03, random_state=42):
    np.random.seed(random_state)
    noisy_df = df.copy()
    mask = np.random.rand(len(noisy_df)) < noise_level
    noisy_df.loc[mask, target_col] = 1 - noisy_df.loc[mask, target_col]
    return noisy_df


In [None]:
# Apply label noise injection
data = inject_label_noise(data, target_col="Flood_Event", noise_level=0.03)


In [None]:

# Now use df_noisy to train/test your classifier


In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, roc_auc_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [None]:
# Define updated feature columns
feature_cols = [
    "iot_River Level (m)", "iot_Change in Level (m)",
    "wthr_Actual Temperature (°C)", "wthr_Actual Humidity (%)",
    "wthr_Actual Rainfall (mm)", "wthr_Actual Windspeed (km/h)",
    "sm_Average of sentiment_score", "sm_Sum of distance_to_gauge_km",
    "sm_Count of post_id", "hour", "dayofweek", "month"
]


In [None]:
# Drop rows with missing values in any selected feature columns
data_model = data.dropna(subset=feature_cols)


In [None]:
# Define inputs and targets
X = data_model[feature_cols]
y_class = data_model["Flood_Event"]
y_lat = data_model["iot_Latitude"]
y_lon = data_model["iot_Longitude"]
y_hour = data_model["hour"]


In [None]:
# Split data for classification model
X_train, X_test, y_class_train, y_class_test = train_test_split(X, y_class, test_size=0.3, random_state=42)


In [None]:
# Classifier pipeline
clf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(random_state=42))
])
clf_pipeline.fit(X_train, y_class_train)


In [None]:
# Classification predictions
y_class_pred = clf_pipeline.predict(X_test)
y_class_proba = clf_pipeline.predict_proba(X_test)[:, 1]
class_report = classification_report(y_class_test, y_class_pred, output_dict=True)
roc_auc = roc_auc_score(y_class_test, y_class_proba)


In [None]:
# Subset data for flood-only regression tasks
flood_data = data_model[data_model["Flood_Event"] == 1]
X_flood = flood_data[feature_cols]
y_flood_lat = flood_data["iot_Latitude"]
y_flood_lon = flood_data["iot_Longitude"]
y_flood_hour = flood_data["hour"]


In [None]:
# Train regressors
lat_model = RandomForestRegressor(random_state=42).fit(X_flood, y_flood_lat)
lon_model = RandomForestRegressor(random_state=42).fit(X_flood, y_flood_lon)
hour_model = RandomForestRegressor(random_state=42).fit(X_flood, y_flood_hour)


In [None]:
# Predict and calculate errors
lat_preds = lat_model.predict(X_flood)
lon_preds = lon_model.predict(X_flood)
hour_preds = hour_model.predict(X_flood)


In [None]:
lat_error = mean_absolute_error(y_flood_lat, lat_preds)
lon_error = mean_absolute_error(y_flood_lon, lon_preds)
hour_error = mean_absolute_error(y_flood_hour, hour_preds)


In [None]:
# Return evaluation results
evaluation_results = {
    "Flood Classifier ROC AUC": roc_auc,
    "Flood Classifier Report": class_report,
    "Latitude MAE": lat_error,
    "Longitude MAE": lon_error,
    "Hour MAE": hour_error
}


In [None]:
evaluation_results


{'Flood Classifier ROC AUC': np.float64(0.8186865590299067),
 'Flood Classifier Report': {'0': {'precision': 0.972594752186589,
   'recall': 0.9991015274034142,
   'f1-score': 0.9856699660215689,
   'support': 3339.0},
  '1': {'precision': 0.9788732394366197,
   'recall': 0.5965665236051502,
   'f1-score': 0.7413333333333333,
   'support': 233.0},
  'accuracy': 0.9728443449048152,
  'macro avg': {'precision': 0.9757339958116044,
   'recall': 0.7978340255042822,
   'f1-score': 0.863501649677451,
   'support': 3572.0},
  'weighted avg': {'precision': 0.973004295167904,
   'recall': 0.9728443449048152,
   'f1-score': 0.969731994180483,
   'support': 3572.0}},
 'Latitude MAE': 0.0028925684258075536,
 'Longitude MAE': 0.010751233325810327,
 'Hour MAE': 0.0}

In [None]:
# Extract report and convert to DataFrame
clf_report = evaluation_results["Flood Classifier Report"]
clf_df = pd.DataFrame(clf_report).T  # Transpose to get classes as rows


In [None]:
# Separate non-numeric values if needed
roc_auc = evaluation_results["Flood Classifier ROC AUC"]
lat_mae = evaluation_results["Latitude MAE"]
lon_mae = evaluation_results["Longitude MAE"]
hour_mae = evaluation_results["Hour MAE"]


In [None]:
# Display classifier report
print("=== Classification Report ===")
display(clf_df.round(4))  # If in notebook, else use print(clf_df)


=== Classification Report ===


Unnamed: 0,precision,recall,f1-score,support
0,0.9726,0.9991,0.9857,3339.0
1,0.9789,0.5966,0.7413,233.0
accuracy,0.9728,0.9728,0.9728,0.9728
macro avg,0.9757,0.7978,0.8635,3572.0
weighted avg,0.973,0.9728,0.9697,3572.0


In [None]:
# Display additional metrics
summary_df = pd.DataFrame({
    "Metric": ["ROC AUC", "Latitude MAE", "Longitude MAE", "Hour MAE"],
    "Value": [roc_auc, lat_mae, lon_mae, hour_mae]
})
print("\n=== Other Metrics ===")
display(summary_df)



=== Other Metrics ===


Unnamed: 0,Metric,Value
0,ROC AUC,0.818687
1,Latitude MAE,0.002893
2,Longitude MAE,0.010751
3,Hour MAE,0.0


In [None]:
# Use the previously merged and processed data
merged = merged_data.copy()


In [None]:
# Step 2: Correct sentiment and rainfall field names
merged = merged.rename(columns={
    "sm_Average of sentiment_score": "Average of sentiment_score",
    "wthr_Actual Rainfall (mm)": "Actual Rainfall (mm)",
    "iot_River Level (m)": "River Level (m)",
    "iot_Sensor ID": "Sensor ID",
    "iot_Latitude": "Latitude",
    "iot_Longitude": "Longitude",
    "sm_hourly_timestamp": "Timestamp"
})


In [None]:
# Step 3: Fill missing values with default assumptions
merged["Average of sentiment_score"] = merged["Average of sentiment_score"].fillna(0)
merged["Actual Rainfall (mm)"] = merged["Actual Rainfall (mm)"].fillna(0)
merged["River Level (m)"] = merged["River Level (m)"].fillna(0)


In [None]:
# Step 4: Simulate Flood Risk
def simulate_flood(row):
    flood = 0
    if (row["River Level (m)"] > 3.0) or (row["Actual Rainfall (mm)"] > 10):
        flood = 1
    elif (2.8 <= row["River Level (m)"] <= 3.0) and (row["Average of sentiment_score"] < -0.2):
        flood = np.random.choice([0, 1], p=[0.7, 0.3])
    return flood


In [None]:
merged["Flood Risk (Simulated)"] = merged.apply(simulate_flood, axis=1)


In [None]:
# Step 5: Final dataset formatting
final_merged = merged.rename(columns={
    "Average of sentiment_score": "Avg Sentiment",
    "Actual Rainfall (mm)": "Rainfall (mm)"
})[
    ["Timestamp", "Sensor ID", "Latitude", "Longitude",
     "Rainfall (mm)", "River Level (m)", "Avg Sentiment", "Flood Risk (Simulated)"]
]


In [None]:
# Save the new simulated dataset
simulated_flood_dataset_path = "/content/simulated_flood_risk_dataset.csv"
final_merged.to_csv(simulated_flood_dataset_path, index=False)


In [None]:
# Output path
simulated_flood_dataset_path


'/content/simulated_flood_risk_dataset.csv'

In [None]:
# Generate the future 7-day flood simulation dataset
import pandas as pd
import numpy as np
from datetime import timedelta


In [None]:
# Setup
np.random.seed(42)  # For reproducibility


In [None]:
# Define sensors with static lat/lon (Caroni River gauges)
sensors = {
    "CR-001": (10.5500, -61.3333),
    "CR-002": (10.6000, -61.3500),
    "CR-003": (10.6500, -61.4000),
    "CR-004": (10.6200, -61.3800),
    "CR-005": (10.5800, -61.3400),
    "CR-006": (10.5300, -61.3200),
    "CR-007": (10.6100, -61.3700),
    "CR-008": (10.5700, -61.3300),
}


In [None]:
# Generate timestamps: 1-hour intervals, 7 days starting April 1st, 2025
timestamps = pd.date_range(start="2025-04-01", periods=24*7, freq="H")


  timestamps = pd.date_range(start="2025-04-01", periods=24*7, freq="H")


In [None]:
# Create future simulated data
simulated_records = []
for sensor_id, (lat, lon) in sensors.items():
    for ts in timestamps:
        # Rainfall (higher at night 7PM–7AM)
        if ts.hour >= 19 or ts.hour <= 7:
            rainfall = np.random.normal(loc=8, scale=5)  # Higher rain at night
        else:
            rainfall = np.random.normal(loc=3, scale=2)  # Lower rain during day
        rainfall = max(0, rainfall)  # No negative rain


In [None]:
        # River Level (correlate slightly with rainfall)
        river_level = np.random.normal(loc=2.5 + 0.1*(rainfall/10), scale=0.3)
        river_level = max(0, river_level)


In [None]:
        # Avg Sentiment (worse with more rainfall)
        if rainfall > 10:
            avg_sentiment = np.random.normal(loc=-0.4, scale=0.2)
        else:
            avg_sentiment = np.random.normal(loc=0.0, scale=0.2)
        avg_sentiment = np.clip(avg_sentiment, -1, 1)


In [None]:
        # Simulate Flood Risk
        flood_risk = 0
        if river_level > 3.0 or rainfall > 10:
            flood_risk = 1
        elif (2.8 <= river_level <= 3.0) and avg_sentiment < -0.2:
            flood_risk = np.random.choice([0, 1], p=[0.7, 0.3])


In [None]:
        simulated_records.append({
            "Timestamp": ts,
            "Sensor ID": sensor_id,
            "Latitude": lat,
            "Longitude": lon,
            "Rainfall (mm)": round(rainfall, 2),
            "River Level (m)": round(river_level, 2),
            "Avg Sentiment": round(avg_sentiment, 2),
            "Flood Risk (Simulated)": flood_risk
        })


In [None]:
# Convert to DataFrame
simulated_future_df = pd.DataFrame(simulated_records)


In [None]:
# Save the future dataset
future_simulated_data_path = "/content/simulated_7days_flood_test_data.csv"
simulated_future_df.to_csv(future_simulated_data_path, index=False)


In [None]:
future_simulated_data_path


'/content/simulated_7days_flood_test_data.csv'

In [None]:
# Step 1: Load Training and Testing Datasets
train_path = "/content/simulated_flood_risk_dataset.csv"
test_path = "/content/simulated_7days_flood_test_data.csv"


In [None]:
# Load datasets
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)


In [None]:
# Step 2: Feature Engineering
def engineer_features(df):
    df["Timestamp"] = pd.to_datetime(df["Timestamp"])
    df["hour"] = df["Timestamp"].dt.hour
    df["dayofweek"] = df["Timestamp"].dt.dayofweek
    return df


In [None]:
train_df = engineer_features(train_df)
test_df = engineer_features(test_df)


In [None]:
# Step 3: Define Features and Target
feature_cols = [
    "hour", "dayofweek", "Latitude", "Longitude",
    "Rainfall (mm)", "River Level (m)", "Avg Sentiment"
]
target_col = "Flood Risk (Simulated)"


In [None]:
X_train = train_df[feature_cols]
y_train = train_df[target_col]
X_test = test_df[feature_cols]
y_test = test_df[target_col]


In [None]:
# Step 4: Train a RandomForest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score


In [None]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)


In [None]:
# Step 5: Predict and Evaluate
y_pred = clf.predict(X_test)
y_pred_prob = clf.predict_proba(X_test)[:, 1]


In [None]:
# Evaluation Metrics
clf_report = classification_report(y_test, y_pred, output_dict=True)
roc_auc = roc_auc_score(y_test, y_pred_prob)




In [None]:
# Save report
evaluation_results = {
    "Flood Classifier ROC AUC": roc_auc,
    "Flood Classifier Report": clf_report
}


In [None]:
evaluation_results


{'Flood Classifier ROC AUC': nan,
 'Flood Classifier Report': {'1': {'precision': 1.0,
   'recall': 1.0,
   'f1-score': 1.0,
   'support': 1.0},
  'accuracy': 1.0,
  'macro avg': {'precision': 1.0,
   'recall': 1.0,
   'f1-score': 1.0,
   'support': 1.0},
  'weighted avg': {'precision': 1.0,
   'recall': 1.0,
   'f1-score': 1.0,
   'support': 1.0}}}

In [None]:
merged_data.head(5)


Unnamed: 0,iot_Timestamp,iot_Sensor ID,time_sensor_id,iot_Latitude,iot_Longitude,iot_Location,iot_River Level (m),iot_Change in Level (m),iot_Flood Event,wthr_Timestamp,...,wthr_Actual Storm,wthr_Flood Event,sm_hourly_timestamp,sm_Average of sentiment_score,sm_Sum of distance_to_gauge_km,sm_Count of post_id,sm_Cunupia,sm_Piarco,sm_St. Augustine,sm_St. Helena
0,,,01-03-2025 00-00-00-CITY-Arima,,,,,,,3/1/2025 0:00,...,Cloudy,No,NaT,,,,,,,
1,,,01-03-2025 00-00-00-CITY-Chaguanas,,,,,,,3/1/2025 0:00,...,Clear,No,NaT,,,,,,,
2,,,01-03-2025 00-00-00-CITY-Couva,,,,,,,3/1/2025 0:00,...,Rainy,No,NaT,,,,,,,
3,,,01-03-2025 00-00-00-CITY-PointFortin,,,,,,,3/1/2025 0:00,...,Clear,No,NaT,,,,,,,
4,,,01-03-2025 00-00-00-CITY-PortofSpain,,,,,,,3/1/2025 0:00,...,Cloudy,No,NaT,,,,,,,


In [None]:
# Re-import all required libraries
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [None]:
# ---- Load and Prefix Data (Assumes DataFrames Already Loaded) ----


In [None]:
# Standardize merge keys
gauge_data.rename(columns={"Time_Sensor_ID": "time_sensor_id"}, inplace=True)
weather_data.rename(columns={"Time_Sensor_ID": "time_sensor_id"}, inplace=True)
social_media_data.rename(columns={"datestamp_gauge": "time_sensor_id"}, inplace=True)


In [None]:
# Prefix all columns for provenance
gauge_data = gauge_data.add_prefix("iot_")
weather_data = weather_data.add_prefix("wthr_")
social_media_data = social_media_data.add_prefix("sm_")


In [None]:
# Reassign time_sensor_id for merging
gauge_data.rename(columns={"iot_time_sensor_id": "time_sensor_id"}, inplace=True)
weather_data.rename(columns={"wthr_time_sensor_id": "time_sensor_id"}, inplace=True)
social_media_data.rename(columns={"sm_time_sensor_id": "time_sensor_id"}, inplace=True)


In [None]:
# Merge all sources into a unified dataset
merged_data = gauge_data.merge(weather_data, on="time_sensor_id", how="outer")
merged_data = merged_data.merge(social_media_data, on="time_sensor_id", how="outer")


In [None]:
# ---- Clean, Impute, and Feature Engineer ----
data = merged_data.copy()


In [None]:
# Impute numerics and categoricals
num_cols = data.select_dtypes(include=[np.number]).columns
cat_cols = data.select_dtypes(include=["object"]).columns
data[num_cols] = SimpleImputer(strategy="mean").fit_transform(data[num_cols])
data[cat_cols] = SimpleImputer(strategy="most_frequent").fit_transform(data[cat_cols])


In [None]:
# Encode flood event
# Changed line to correctly reference 'iot_Flood Event' column


In [None]:
# Detect and convert the flood event column
flood_event_col = next((col for col in data.columns if "Flood Event" in col), None)
if flood_event_col:
    data[flood_event_col] = data[flood_event_col].map({"Yes": 1, "No": 0})
    data.rename(columns={flood_event_col: "Flood_Event"}, inplace=True)
else:
    raise KeyError("Column containing 'Flood Event' not found.")


In [None]:

# Parse timestamps and extract date features
data["iot_Timestamp"] = pd.to_datetime(data["iot_iot_Timestamp"], errors="coerce")
data["date"] = data["iot_Timestamp"].dt.date
data["hour"] = data["iot_Timestamp"].dt.hour
data["dayofweek"] = data["iot_Timestamp"].dt.dayofweek
data["month"] = data["iot_Timestamp"].dt.month


In [None]:
# Filter valid rows for modeling
data = data.dropna(subset=["iot_iot_Latitude", "iot_iot_Longitude", "Flood_Event"])


In [None]:
# ---- Define Model Inputs ----
feature_cols = [
    "iot_River Level (m)", "iot_Change in Level (m)",
    "wthr_Actual Temperature (°C)", "wthr_Actual Humidity (%)",
    "wthr_Actual Rainfall (mm)", "wthr_Actual Windspeed (km/h)",
    "sm_Average of sentiment_score", "sm_Sum of distance_to_gauge_km",
    "sm_Count of post_id"
]
extra_cols = ["hour", "dayofweek", "month"]
model_features = feature_cols + extra_cols

# Check if the columns in model_features exist in the DataFrame
for col in model_features:
    if col not in data.columns:
        print(f"Column '{col}' not found in DataFrame")

Column 'iot_River Level (m)' not found in DataFrame
Column 'iot_Change in Level (m)' not found in DataFrame
Column 'wthr_Actual Temperature (°C)' not found in DataFrame
Column 'wthr_Actual Humidity (%)' not found in DataFrame
Column 'wthr_Actual Rainfall (mm)' not found in DataFrame
Column 'wthr_Actual Windspeed (km/h)' not found in DataFrame
Column 'sm_Average of sentiment_score' not found in DataFrame
Column 'sm_Sum of distance_to_gauge_km' not found in DataFrame
Column 'sm_Count of post_id' not found in DataFrame


In [None]:
# Ensure only existing columns are used in subset
existing_model_features = [col for col in model_features if col in data.columns]

# Optional: Warn if any expected columns are missing
missing = list(set(model_features) - set(existing_model_features))
if missing:
    print("⚠️ Missing columns in data:", missing)

# Drop only on existing feature columns
data_model = data.dropna(subset=existing_model_features)

⚠️ Missing columns in data: ['sm_Average of sentiment_score', 'wthr_Actual Rainfall (mm)', 'sm_Sum of distance_to_gauge_km', 'sm_Count of post_id', 'iot_River Level (m)', 'wthr_Actual Windspeed (km/h)', 'wthr_Actual Temperature (°C)', 'wthr_Actual Humidity (%)', 'iot_Change in Level (m)']


In [None]:
data_model.head(5)

Unnamed: 0,iot_iot_Timestamp,iot_iot_Sensor ID,time_sensor_id,iot_iot_Latitude,iot_iot_Longitude,iot_iot_Location,iot_iot_River Level (m),iot_iot_Change in Level (m),Flood_Event,wthr_wthr_Timestamp,...,sm_sm_Count of post_id,sm_sm_Cunupia,sm_sm_Piarco,sm_sm_St. Augustine,sm_sm_St. Helena,iot_Timestamp,date,hour,dayofweek,month
0,3/1/2025 0:00,CR-001,01-03-2025 00-00-00-CITY-Arima,10.601419,-61.372977,Caroni River Mouth,2.751186,-1.3e-05,0,3/1/2025 0:00,...,2.702703,0.68,0.687568,0.678919,0.656216,2025-03-01,2025-03-01,0,5,3
1,3/1/2025 0:00,CR-001,01-03-2025 00-00-00-CITY-Chaguanas,10.601419,-61.372977,Caroni River Mouth,2.751186,-1.3e-05,0,3/1/2025 0:00,...,2.702703,0.68,0.687568,0.678919,0.656216,2025-03-01,2025-03-01,0,5,3
2,3/1/2025 0:00,CR-001,01-03-2025 00-00-00-CITY-Couva,10.601419,-61.372977,Caroni River Mouth,2.751186,-1.3e-05,0,3/1/2025 0:00,...,2.702703,0.68,0.687568,0.678919,0.656216,2025-03-01,2025-03-01,0,5,3
3,3/1/2025 0:00,CR-001,01-03-2025 00-00-00-CITY-PointFortin,10.601419,-61.372977,Caroni River Mouth,2.751186,-1.3e-05,0,3/1/2025 0:00,...,2.702703,0.68,0.687568,0.678919,0.656216,2025-03-01,2025-03-01,0,5,3
4,3/1/2025 0:00,CR-001,01-03-2025 00-00-00-CITY-PortofSpain,10.601419,-61.372977,Caroni River Mouth,2.751186,-1.3e-05,0,3/1/2025 0:00,...,2.702703,0.68,0.687568,0.678919,0.656216,2025-03-01,2025-03-01,0,5,3


In [None]:
# Inputs and outputs
X = data_model[existing_model_features]
y_class = data_model["Flood_Event"]
y_lat = data_model["iot_iot_Latitude"]
y_lon = data_model["iot_iot_Longitude"]
y_hour = data_model["hour"]


In [None]:
# ---- Train Models ----
X_train, X_test, y_class_train, y_class_test = train_test_split(X, y_class, test_size=0.3, random_state=42)


In [None]:
# Pipeline for classification
clf_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(random_state=42))
])
clf_pipeline.fit(X_train, y_class_train)


In [None]:
# Train regressors on flood-only rows
flood_data = data_model[data_model["Flood_Event"] == 1]
X_flood = flood_data[existing_model_features]
lat_model = RandomForestRegressor(random_state=42).fit(X_flood, flood_data["iot_iot_Latitude"])
lon_model = RandomForestRegressor(random_state=42).fit(X_flood, flood_data["iot_iot_Longitude"])
hour_model = RandomForestRegressor(random_state=42).fit(X_flood, flood_data["hour"])


In [None]:
# ---- Simulate Future Data for April 1–7 ----
last_march = data[data["date"] >= pd.to_datetime("2025-03-25").date()]
feature_means = last_march[existing_model_features].mean()
feature_stds = last_march[existing_model_features].std()


In [None]:
future_dates = [pd.Timestamp("2025-04-01") + timedelta(hours=h) for h in range(0, 24 * 7)]
simulated_rows = []


In [None]:
for dt in future_dates:
    row = {
        "hour": dt.hour,
        "dayofweek": dt.dayofweek,
        "month": dt.month,
        "timestamp": dt
    }
    for col in existing_model_features:
        mean = feature_means[col]
        std = feature_stds[col]
        row[col] = np.random.normal(mean, std if not np.isnan(std) else 0.01)
    simulated_rows.append(row)


In [None]:
future_df = pd.DataFrame(simulated_rows)


In [None]:
# ---- Predict Flood Risk & Location ----
future_scaled = clf_pipeline.named_steps['scaler'].transform(future_df[existing_model_features])
flood_probabilities = clf_pipeline.named_steps['clf'].predict_proba(future_scaled)[:, 1]
future_df["Flood_Probability"] = flood_probabilities


In [None]:
# Subset and predict spatial + time attributes
flood_risk_df = future_df[future_df["Flood_Probability"] > 0.2].copy()
if not flood_risk_df.empty:
    flood_risk_df["Pred_Latitude"] = lat_model.predict(flood_risk_df[existing_model_features])
    flood_risk_df["Pred_Longitude"] = lon_model.predict(flood_risk_df[existing_model_features])
    flood_risk_df["Pred_Hour"] = hour_model.predict(flood_risk_df[existing_model_features])


In [None]:
flood_risk_df.head(5)

Unnamed: 0,hour,dayofweek,month,timestamp,Flood_Probability,Pred_Latitude,Pred_Longitude,Pred_Hour
11,24.144685,4.356356,3.0,2025-04-01 11:00:00,0.269144,10.590318,-61.340901,23.0
40,23.063693,3.789637,3.0,2025-04-02 16:00:00,0.269144,10.590318,-61.340901,23.0
87,22.750514,3.646399,3.0,2025-04-04 15:00:00,0.269144,10.590318,-61.340901,23.0
101,29.432115,4.159698,3.0,2025-04-05 05:00:00,0.269144,10.590318,-61.340901,23.0
138,22.856366,4.484807,3.0,2025-04-06 18:00:00,0.269144,10.590318,-61.340901,23.0
