In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import itertools
import random

# Fix the seeds of both random sources used by this script (the stdlib
# `random` module and NumPy's global generator) so reruns repeat the same draws.
random.seed(42)
np.random.seed(42)

# Daily time axis: 10 * 365 = 3650 consecutive days ending on 2024-12-31
# (inclusive). Leap days are not counted, so the window opens on 2015-01-04.
n_days = 10 * 365
end_date = datetime(2024, 12, 31)
start_date = end_date - timedelta(days=n_days - 1)
dates = [start_date + timedelta(days=offset) for offset in range(n_days)]

# Categorical vocabulary for the simulated sites.
location_types = ['lowland', 'middleland', 'upland']

# Boolean environment flags; every True/False assignment of the five flags
# is enumerated once (2**5 = 32 tuples, in the order of `env_features`).
env_features = ['has_river', 'has_lake', 'has_poor_drainage', 'is_urban', 'is_deforested']
all_combinations = list(itertools.product((False, True), repeat=len(env_features)))

# Accumulator for the generated rows (one dict per simulated observation).
records = []

def classify_season(date):
    """Return the season label for the calendar month of ``date``.

    Months 3-5 are "Long Rainy Season", 9-11 are "Short Rainy Season",
    6-8 are "Long Dry Season", and every other month (12, 1, 2) is
    "Short Dry Season".

    Args:
        date: Object exposing a ``month`` attribute (e.g. ``datetime``).

    Returns:
        One of the four season label strings.
    """
    month = date.month
    # (first month, last month, label), checked in order; bounds are inclusive.
    bands = (
        (3, 5, "Long Rainy Season"),
        (9, 11, "Short Rainy Season"),
        (6, 8, "Long Dry Season"),
    )
    for first, last, label in bands:
        if first <= month <= last:
            return label
    return "Short Dry Season"

def sample_lat_lon(loc_type):
    """Draw a random (latitude, longitude) pair for a location type.

    Each location type has its own rectangular bounding box; the latitude
    and then the longitude are drawn uniformly from it using NumPy's global
    random generator. Any value other than 'lowland' or 'middleland' is
    treated as 'upland'.

    Args:
        loc_type: Location type name.

    Returns:
        Tuple ``(lat, lon)`` of floats.
    """
    if loc_type == 'lowland':
        lat_bounds, lon_bounds = (-2.9, -2.0), (28.8, 29.3)
    elif loc_type == 'middleland':
        lat_bounds, lon_bounds = (-2.0, -1.5), (29.3, 30.0)
    else:  # upland
        lat_bounds, lon_bounds = (-1.5, -1.0), (30.0, 30.9)

    # Latitude is drawn before longitude; this order fixes the random stream.
    lat = np.random.uniform(*lat_bounds)
    lon = np.random.uniform(*lon_bounds)
    return lat, lon

def norm(val, min_v, max_v):
    """Min-max scale ``val`` relative to the interval [min_v, max_v].

    Returns 0 at ``min_v`` and 1 at ``max_v``. The result is not clamped,
    so inputs outside the interval give values below 0 or above 1.
    ``min_v == max_v`` results in a division by zero.
    """
    offset = val - min_v
    span = max_v - min_v
    return offset / span

# Main simulation loop: for every day and every location type, generate eight
# synthetic observations and append each one to `records` as a dict.
#
# NOTE: the generated values depend on the exact order of the random draws
# below (both `np.random` and the stdlib `random` module are used), so
# reordering or adding draws changes the resulting dataset.
for date in dates:
    season = classify_season(date)
    for loc_type in location_types:
        # One coordinate pair per (date, location type); all eight records
        # generated below share it.
        lat, lon = sample_lat_lon(loc_type)

        # Draw 8 distinct feature combinations (sampling without replacement)
        # from `all_combinations` for variability (reduce repetition)
        sampled_combos = random.sample(all_combinations, 8)

        for combo in sampled_combos:
            # Tuple order matches `env_features`.
            has_river, has_lake, has_poor_drainage, is_urban, is_deforested = combo

            # Base rainfall: gamma draw scaled by a season multiplier (1.3 in
            # either rainy season, 0.6 otherwise) and a location multiplier.
            # The result has no upper clamp.
            base_rain = np.random.gamma(shape=2.0, scale=10.0)
            season_mult = 1.3 if season in ["Long Rainy Season", "Short Rainy Season"] else 0.6
            loc_mult = {'lowland': 1.1, 'middleland': 1.0, 'upland': 0.7}[loc_type]
            rainfall = round(base_rain * season_mult * loc_mult, 1)

            # Secondary measurements. Water level, soil moisture and humidity
            # are linear in rainfall plus Gaussian noise, then clamped to
            # [0.3, 3.5], [20, 100] and [30, 100] respectively. Wind speed is
            # clamped to [0, 20]; temperature and pressure are unclamped
            # Gaussian draws that do not depend on rainfall.
            water_level = round(min(max(0.5 + 0.02 * rainfall + np.random.normal(0, 0.15), 0.3), 3.5), 2)
            soil_moisture = round(min(max(30 + 0.7 * rainfall + np.random.normal(0, 4), 20), 100), 1)
            temp_c = round(np.random.normal(23, 3), 1)
            humidity = round(min(max(50 + 0.5 * rainfall + np.random.normal(0, 8), 30), 100), 1)
            wind_speed = round(min(max(np.random.normal(10, 3), 0), 20), 1)
            pressure = round(np.random.normal(1012, 5), 1)

            # Normalize features for flood prob calculation.
            # The water/soil/humidity ranges equal their clamp ranges, so
            # those three land in [0, 1]; rainfall is scaled against 0..100
            # but is not clamped, so rain_norm can exceed 1.
            rain_norm = norm(rainfall, 0, 100)
            water_norm = norm(water_level, 0.3, 3.5)
            soil_norm = norm(soil_moisture, 20, 100)
            humidity_norm = norm(humidity, 30, 100)

            # Base flood prob: weighted sum of the normalized features
            # (weights sum to 1.0)
            flood_prob = (0.4 * rain_norm + 0.3 * water_norm + 0.2 * soil_norm + 0.1 * humidity_norm)

            # Location impact: lowland +0.15, middleland +0.08, otherwise +0.03
            flood_prob += 0.15 if loc_type == 'lowland' else 0.08 if loc_type == 'middleland' else 0.03

            # Additive bonus for each water-body / environment flag that is set
            flood_prob += 0.15 if has_river else 0
            flood_prob += 0.10 if has_lake else 0
            flood_prob += 0.03 if has_poor_drainage else 0
            flood_prob += 0.03 if is_urban else 0
            flood_prob += 0.02 if is_deforested else 0

            # Interaction: an urban area with a river gets an extra +0.07 on
            # top of the two individual bonuses
            if is_urban and has_river:
                flood_prob += 0.07

            # Season effect: +0.1 in either rainy season, -0.1 otherwise
            flood_prob += 0.1 if season in ["Long Rainy Season", "Short Rainy Season"] else -0.1

            # Wind and water bodies boost: strong wind (> 15) with a river or lake
            if wind_speed > 15 and (has_river or has_lake):
                flood_prob += 0.05

            # Introduce random anomalies: 1% chance of a +0.3 spike or -0.3 drop
            if np.random.rand() < 0.01:
                flood_prob += np.random.choice([0.3, -0.3])

            # Add small Gaussian noise (standard deviation 0.005)
            flood_prob += np.random.normal(0, 0.005)

            # Clamp between 0.01 and 0.99
            flood_prob = min(max(flood_prob, 0.01), 0.99)

            # Binary flood label: 1 when the clamped (unrounded) probability
            # exceeds 0.45. NOTE(review): the original comment says this
            # threshold was adjusted to balance classes; not verifiable here.
            flood = int(flood_prob > 0.45)

            # Occasionally insert missing data (2% chance) — but we will drop nulls later.
            # This happens after flood_prob/flood were computed, so those
            # values are still based on the real rainfall.
            if np.random.rand() < 0.02:
                rainfall = np.nan

            # One output row; probability is rounded to 3 decimals and the
            # coordinates to 6 decimals.
            records.append({
                "date": date,
                "season": season,
                "location_type": loc_type,
                "has_river": has_river,
                "has_lake": has_lake,
                "has_poor_drainage": has_poor_drainage,
                "is_urban": is_urban,
                "is_deforested": is_deforested,
                "rainfall_mm": rainfall,
                "water_level_m": water_level,
                "soil_moisture": soil_moisture,
                "temp_c": temp_c,
                "humidity": humidity,
                "wind_speed": wind_speed,
                "pressure": pressure,
                "flood_probability": round(flood_prob, 3),
                "flood": flood,
                "latitude": round(lat, 6),
                "longitude": round(lon, 6),
            })

# Build the final table from the simulated records (one row per dict).
df = pd.DataFrame(records)

# Drop every row containing a missing value, renumber the surviving rows
# 0..n-1, and expose that numbering as the leading `record_id` column.
# IDs are assigned after the drop so they are contiguous.
df = (
    df.dropna()
    .reset_index(drop=True)
    .rename_axis("record_id")
    .reset_index()
)

# Ensure no exact 0 or 1 in flood_probability (just in case): vectorized
# clamp to the closed interval [0.01, 0.99].
df['flood_probability'] = df['flood_probability'].clip(lower=0.01, upper=0.99)

# Save dataset (the DataFrame index is not written; `record_id` is a column).
df.to_csv("flood_simulation_10_years_improved.csv", index=False)

# Sanity report on the saved table.
print("✅ Improved dataset saved with shape:", df.shape)
print("Flood probability min:", df['flood_probability'].min())
print("Flood probability max:", df['flood_probability'].max())
print("Any exact 0 or 1 in flood_probability?:", bool(df['flood_probability'].isin([0, 1]).any()))
print("Any nulls left?:", df.isnull().any().any())


✅ Improved dataset saved with shape: (85872, 20)
Flood probability min: 0.01
Flood probability max: 0.99
Any exact 0 or 1 in flood_probability?: False
Any nulls left?: False
