# Feature Engineering
In this notebook, the features to model are created.

## Import Libraries

In [2]:
# Set configuration for notebook
import os

os.chdir("c:\\Users\\Spectra\\flood-prediction")
os.getcwd()

'c:\\Users\\Spectra\\flood-prediction'

In [3]:
import pandas as pd
import numpy as np
from src.utils import load_config
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [63]:
# Load the config
config = load_config()

# Load the data

In [22]:
df = pd.read_csv(config["data"]["processed_data_path"])

In [81]:
df.columns

Index(['Station_Names', 'Year', 'Month', 'Max_Temp', 'Min_Temp', 'Rainfall',
       'Relative_Humidity', 'Wind_Speed', 'Cloud_Coverage', 'Bright_Sunshine',
       'Station_Number', 'LATITUDE', 'LONGITUDE', 'ALT', 'Period', 'Flood',
       'Avg_Temp', 'rain_latitude', 'rain_longitude'],
      dtype='object')

## Feature Engineering

**Drop highly correlated columns**

In [24]:
df.drop(["X_COR", "Y_COR", "Sl"], axis=1, inplace=True)

**Create Average Temperature**


In [25]:
df["Avg_Temp"] = (df["Max_Temp"] + df["Min_Temp"]) / 2

**Create spatial features**


In [28]:
# Create interaction features
df["rain_latitude"] = df["Rainfall"] * df["LATITUDE"]
df["rain_longitude"] = df["Rainfall"] * df["LONGITUDE"]

**Create distance to waterbodies feature**

In [26]:
station_df = df[["Station_Names", "LATITUDE", "LONGITUDE"]].drop_duplicates()

In [29]:
import geopandas as gpd


# Assuming df is your DataFrame with latitude and longitude columns
# Create GeoDataFrame from DataFrame
gdf = gpd.GeoDataFrame(
    station_df, geometry=gpd.points_from_xy(station_df.LONGITUDE, station_df.LATITUDE)
)

# Load water body GeoDataFrame from OpenStreetMap
try:
    water_bodies = gpd.read_file("data/raw/bgd_hyd_rivers_lged.shp")
except Exception as e:
    # If there's an error reading the shapefile, attempt to recreate the index file
    water_bodies = gpd.read_file("data/raw/bgd_hyd_rivers_lged.shp")
    # Create spatial index
    water_bodies.sindex

# Calculate distances to nearest water body
gdf["dist_to_water"] = gdf.geometry.apply(lambda x: water_bodies.distance(x).min())

# Assign the calculated distances back to the original DataFrame
station_df["dist_to_water"] = gdf["dist_to_water"]

In [30]:
station_df

Unnamed: 0,Station_Names,LATITUDE,LONGITUDE,dist_to_water
0,Barisal,22.7,90.36,0.00395
780,Bhola,22.7,90.66,0.021247
1356,Bogra,24.88,89.36,0.00355
2148,Chandpur,23.26,90.67,0.058314
2748,Chittagong (City-Ambagan),22.35,91.8166,0.029867
2796,Chittagong (IAP-Patenga),22.34,91.79,0.034664
3576,Comilla,23.48,91.19,0.001917
4368,Cox's Bazar,21.46,91.98,0.002844
5160,Dhaka,23.78,90.39,0.046219
5892,Dinajpur,25.63,88.66,0.011919


In [103]:
# Merge this to the original df
merged_df = pd.merge(df, station_df, on="Station_Names", how="left")
merged_df.drop(["LATITUDE_x", "LONGITUDE_x"], axis=1, inplace=True)

In [104]:
merged_df

Unnamed: 0,Station_Names,Year,Month,Max_Temp,Min_Temp,Rainfall,Relative_Humidity,Wind_Speed,Cloud_Coverage,Bright_Sunshine,Station_Number,ALT,Period,Flood,Avg_Temp,rain_latitude,rain_longitude,LATITUDE_y,LONGITUDE_y,dist_to_water
0,Barisal,1949,1,29.4,12.3,0.0,68.0,0.453704,0.6,7.831915,41950,4,1949.01,0,20.85,0.00,0.00,22.70,90.36,0.003950
1,Barisal,1949,2,33.9,15.2,9.0,63.0,0.659259,0.9,8.314894,41950,4,1949.02,0,24.55,204.30,813.24,22.70,90.36,0.003950
2,Barisal,1949,3,36.7,20.2,8.0,59.0,1.085185,1.5,8.131915,41950,4,1949.03,0,28.45,181.60,722.88,22.70,90.36,0.003950
3,Barisal,1949,4,33.9,23.9,140.0,71.0,1.772222,3.9,8.219149,41950,4,1949.04,0,28.90,3178.00,12650.40,22.70,90.36,0.003950
4,Barisal,1949,5,35.6,25.0,217.0,76.0,1.703704,4.1,7.046809,41950,4,1949.05,0,30.30,4925.90,19608.12,22.70,90.36,0.003950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20539,Teknaf,2013,8,32.5,25.4,851.0,88.0,2.500000,6.2,3.800000,41998,4,2013.08,1,28.95,17760.37,78513.26,20.87,92.26,0.001473
20540,Teknaf,2013,9,32.6,25.7,329.0,88.0,3.000000,6.1,4.200000,41998,4,2013.09,0,29.15,6866.23,30353.54,20.87,92.26,0.001473
20541,Teknaf,2013,10,33.1,24.8,271.0,85.0,2.500000,4.4,5.600000,41998,4,2013.10,0,28.95,5655.77,25002.46,20.87,92.26,0.001473
20542,Teknaf,2013,11,32.5,20.0,0.0,79.0,2.100000,1.7,8.400000,41998,4,2013.11,0,26.25,0.00,0.00,20.87,92.26,0.001473


In [127]:
station_df = merged_df[
    [
        "Station_Names",
        "LATITUDE_y",
        "LONGITUDE_y",
        "ALT",
        "dist_to_water",
        "Station_Number",
    ]
].drop_duplicates()
station_df

Unnamed: 0,Station_Names,LATITUDE_y,LONGITUDE_y,ALT,dist_to_water,Station_Number
0,Barisal,22.7,90.36,4,0.00395,41950
780,Bhola,22.7,90.66,5,0.021247,41951
1356,Bogra,24.88,89.36,20,0.00355,41883
2148,Chandpur,23.26,90.67,7,0.058314,41941
2748,Chittagong (City-Ambagan),22.35,91.8166,0,0.029867,41977
2796,Chittagong (IAP-Patenga),22.34,91.79,6,0.034664,41978
3576,Comilla,23.48,91.19,10,0.001917,41933
4368,Cox's Bazar,21.46,91.98,4,0.002844,41992
5160,Dhaka,23.78,90.39,9,0.046219,41923
5892,Dinajpur,25.63,88.66,37,0.011919,41863


In [128]:
station_df.to_csv("data/reference_data/reference.csv", index=False)

**Get river discharge data**

In [42]:
river_discharge_df = pd.read_csv("data/raw/river_discharge.csv")

In [43]:
river_discharge_df

Unnamed: 0,date,river_discharge,LATITUDE,LONGITUDE,Station_Names
0,1985-01-31 00:00:00+00:00,0.217383,22.70,90.36,Barisal
1,1985-02-28 00:00:00+00:00,0.186330,22.70,90.36,Barisal
2,1985-03-31 00:00:00+00:00,0.261097,22.70,90.36,Barisal
3,1985-04-30 00:00:00+00:00,0.240555,22.70,90.36,Barisal
4,1985-05-31 00:00:00+00:00,0.846616,22.70,90.36,Barisal
...,...,...,...,...,...
11512,2013-09-30 00:00:00+00:00,3.317700,20.87,92.26,Teknaf
11513,2013-10-31 00:00:00+00:00,2.120073,20.87,92.26,Teknaf
11514,2013-11-30 00:00:00+00:00,0.810497,20.87,92.26,Teknaf
11515,2013-12-31 00:00:00+00:00,0.285147,20.87,92.26,Teknaf


In [46]:
# Extract year and month from date
river_discharge_df["date"] = pd.to_datetime(river_discharge_df["date"])
river_discharge_df["Month"] = river_discharge_df["date"].dt.month
river_discharge_df["Year"] = river_discharge_df["date"].dt.year

In [47]:
river_discharge_df

Unnamed: 0,date,river_discharge,LATITUDE,LONGITUDE,Station_Names,Month,Year
0,1985-01-31 00:00:00+00:00,0.217383,22.70,90.36,Barisal,1,1985
1,1985-02-28 00:00:00+00:00,0.186330,22.70,90.36,Barisal,2,1985
2,1985-03-31 00:00:00+00:00,0.261097,22.70,90.36,Barisal,3,1985
3,1985-04-30 00:00:00+00:00,0.240555,22.70,90.36,Barisal,4,1985
4,1985-05-31 00:00:00+00:00,0.846616,22.70,90.36,Barisal,5,1985
...,...,...,...,...,...,...,...
11512,2013-09-30 00:00:00+00:00,3.317700,20.87,92.26,Teknaf,9,2013
11513,2013-10-31 00:00:00+00:00,2.120073,20.87,92.26,Teknaf,10,2013
11514,2013-11-30 00:00:00+00:00,0.810497,20.87,92.26,Teknaf,11,2013
11515,2013-12-31 00:00:00+00:00,0.285147,20.87,92.26,Teknaf,12,2013


In [52]:
# Get data from 1985
from_1985_df = merged_df.loc[merged_df["Year"] >= 1985]
from_1985_df

Unnamed: 0,Station_Names,Year,Month,Max_Temp,Min_Temp,Rainfall,Relative_Humidity,Wind_Speed,Cloud_Coverage,Bright_Sunshine,Station_Number,ALT,Period,Flood,Avg_Temp,rain_latitude,rain_longitude,dist_to_water
432,Barisal,1985,1,30.2,12.6,1.0,84.0,0.2,1.4,7.5,41950,4,1985.01,0,21.40,22.70,90.36,0.003950
433,Barisal,1985,2,31.4,14.3,2.0,77.0,0.6,1.1,8.6,41950,4,1985.02,0,22.85,45.40,180.72,0.003950
434,Barisal,1985,3,36.7,22.4,45.0,81.0,1.8,3.6,7.5,41950,4,1985.03,0,29.55,1021.50,4066.20,0.003950
435,Barisal,1985,4,36.6,24.6,64.0,81.0,1.9,4.7,8.4,41950,4,1985.04,0,30.60,1452.80,5783.04,0.003950
436,Barisal,1985,5,35.0,24.2,226.0,88.0,1.7,5.1,7.9,41950,4,1985.05,1,29.60,5130.20,20421.36,0.003950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20539,Teknaf,2013,8,32.5,25.4,851.0,88.0,2.5,6.2,3.8,41998,4,2013.08,1,28.95,17760.37,78513.26,0.001473
20540,Teknaf,2013,9,32.6,25.7,329.0,88.0,3.0,6.1,4.2,41998,4,2013.09,0,29.15,6866.23,30353.54,0.001473
20541,Teknaf,2013,10,33.1,24.8,271.0,85.0,2.5,4.4,5.6,41998,4,2013.10,0,28.95,5655.77,25002.46,0.001473
20542,Teknaf,2013,11,32.5,20.0,0.0,79.0,2.1,1.7,8.4,41998,4,2013.11,0,26.25,0.00,0.00,0.001473


In [53]:
# Merge the river discharge with the from 1985 df
full_merged_df = pd.merge(
    from_1985_df, river_discharge_df, on=["Month", "Year", "Station_Names"], how="inner"
)
full_merged_df

Unnamed: 0,Station_Names,Year,Month,Max_Temp,Min_Temp,Rainfall,Relative_Humidity,Wind_Speed,Cloud_Coverage,Bright_Sunshine,...,Period,Flood,Avg_Temp,rain_latitude,rain_longitude,dist_to_water,date,river_discharge,LATITUDE,LONGITUDE
0,Barisal,1985,1,30.2,12.6,1.0,84.0,0.2,1.4,7.5,...,1985.01,0,21.40,22.70,90.36,0.003950,1985-01-31 00:00:00+00:00,0.217383,22.70,90.36
1,Barisal,1985,2,31.4,14.3,2.0,77.0,0.6,1.1,8.6,...,1985.02,0,22.85,45.40,180.72,0.003950,1985-02-28 00:00:00+00:00,0.186330,22.70,90.36
2,Barisal,1985,3,36.7,22.4,45.0,81.0,1.8,3.6,7.5,...,1985.03,0,29.55,1021.50,4066.20,0.003950,1985-03-31 00:00:00+00:00,0.261097,22.70,90.36
3,Barisal,1985,4,36.6,24.6,64.0,81.0,1.9,4.7,8.4,...,1985.04,0,30.60,1452.80,5783.04,0.003950,1985-04-30 00:00:00+00:00,0.240555,22.70,90.36
4,Barisal,1985,5,35.0,24.2,226.0,88.0,1.7,5.1,7.9,...,1985.05,1,29.60,5130.20,20421.36,0.003950,1985-05-31 00:00:00+00:00,0.846616,22.70,90.36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11107,Teknaf,2013,8,32.5,25.4,851.0,88.0,2.5,6.2,3.8,...,2013.08,1,28.95,17760.37,78513.26,0.001473,2013-08-31 00:00:00+00:00,7.071018,20.87,92.26
11108,Teknaf,2013,9,32.6,25.7,329.0,88.0,3.0,6.1,4.2,...,2013.09,0,29.15,6866.23,30353.54,0.001473,2013-09-30 00:00:00+00:00,3.317700,20.87,92.26
11109,Teknaf,2013,10,33.1,24.8,271.0,85.0,2.5,4.4,5.6,...,2013.10,0,28.95,5655.77,25002.46,0.001473,2013-10-31 00:00:00+00:00,2.120073,20.87,92.26
11110,Teknaf,2013,11,32.5,20.0,0.0,79.0,2.1,1.7,8.4,...,2013.11,0,26.25,0.00,0.00,0.001473,2013-11-30 00:00:00+00:00,0.810497,20.87,92.26


**Select the best features**

In [88]:
# Separate features (X) and target variable (y)
X = full_merged_df.drop(
    columns=["Flood", "Station_Names", "date", "Period"]
)  # Features are all columns except "Flood"
y = full_merged_df["Flood"]  # Target variable is "Flood"

# Initialize a classifier
clf = XGBClassifier()

# Initialize RFE with the classifier and number of desired features
rfe = RFE(estimator=clf, n_features_to_select=10)

# Fit RFE to your full_merged_df
rfe.fit(X, y)

# Get the selected features
selected_features = X.columns[rfe.support_]

print("Selected Features:")
print(selected_features)

Selected Features:
Index(['Month', 'Rainfall', 'Relative_Humidity', 'Station_Number', 'ALT',
       'rain_latitude', 'rain_longitude', 'dist_to_water', 'river_discharge',
       'LONGITUDE'],
      dtype='object')


In [131]:
# Instantiate features set
train_features_set = [
    "Min_Temp",
    "Rainfall",
    "Cloud_Coverage",
    "Station_Number",
    "ALT",
    "rain_latitude",
    "rain_longitude",
    "dist_to_water",
    "Flood",
]

We can now train the model using the set of features obtained

## Split the data

In [132]:
# Split the data
modelling_data = merged_df[train_features_set]

train, test = train_test_split(
    modelling_data, test_size=0.2, random_state=42, stratify=modelling_data["Flood"]
)

In [130]:
# Save the data
train.to_csv(config["data"]["train_data_path"], index=False)
test.to_csv(config["data"]["test_data_path"], index=False)

**Test with additional data**

In [66]:
# Get station data and the rain data
station_df = pd.read_csv(config["data"]["reference_data_path"])
rain_df = pd.read_csv("data/raw/rain_data.csv")

In [67]:
station_df

Unnamed: 0,Station_Names,LATITUDE_y,LONGITUDE_y,ALT,dist_to_water,Station_Number
0,Barisal,22.7,90.36,4,0.00395,41950
1,Bhola,22.7,90.66,5,0.021247,41951
2,Bogra,24.88,89.36,20,0.00355,41883
3,Chandpur,23.26,90.67,7,0.058314,41941
4,Chittagong (City-Ambagan),22.35,91.8166,0,0.029867,41977
5,Chittagong (IAP-Patenga),22.34,91.79,6,0.034664,41978
6,Comilla,23.48,91.19,10,0.001917,41933
7,Cox's Bazar,21.46,91.98,4,0.002844,41992
8,Dhaka,23.78,90.39,9,0.046219,41923
9,Dinajpur,25.63,88.66,37,0.011919,41863


In [68]:
rain_df

Unnamed: 0.1,Unnamed: 0,date,rain,latitude,longitude
0,0,1949-12-31 00:00:00+00:00,0.000000,22.70,90.36
1,1,1950-01-31 00:00:00+00:00,0.600000,22.70,90.36
2,2,1950-02-28 00:00:00+00:00,107.100006,22.70,90.36
3,3,1950-03-31 00:00:00+00:00,138.599990,22.70,90.36
4,4,1950-04-30 00:00:00+00:00,112.300000,22.70,90.36
...,...,...,...,...,...
29365,29365,2023-09-30 00:00:00+00:00,411.300000,20.87,92.26
29366,29366,2023-10-31 00:00:00+00:00,144.200000,20.87,92.26
29367,29367,2023-11-30 00:00:00+00:00,153.000000,20.87,92.26
29368,29368,2023-12-31 00:00:00+00:00,55.099990,20.87,92.26


In [69]:
# Extract year and month from date
rain_df["date"] = pd.to_datetime(rain_df["date"])
rain_df["Month"] = rain_df["date"].dt.month
rain_df["Year"] = rain_df["date"].dt.year

In [70]:
rain_df

Unnamed: 0.1,Unnamed: 0,date,rain,latitude,longitude,Month,Year
0,0,1949-12-31 00:00:00+00:00,0.000000,22.70,90.36,12,1949
1,1,1950-01-31 00:00:00+00:00,0.600000,22.70,90.36,1,1950
2,2,1950-02-28 00:00:00+00:00,107.100006,22.70,90.36,2,1950
3,3,1950-03-31 00:00:00+00:00,138.599990,22.70,90.36,3,1950
4,4,1950-04-30 00:00:00+00:00,112.300000,22.70,90.36,4,1950
...,...,...,...,...,...,...,...
29365,29365,2023-09-30 00:00:00+00:00,411.300000,20.87,92.26,9,2023
29366,29366,2023-10-31 00:00:00+00:00,144.200000,20.87,92.26,10,2023
29367,29367,2023-11-30 00:00:00+00:00,153.000000,20.87,92.26,11,2023
29368,29368,2023-12-31 00:00:00+00:00,55.099990,20.87,92.26,12,2023


In [71]:
# Rename the columns
station_df.rename(
    columns={"LATITUDE_y": "latitude", "LONGITUDE_y": "longitude"}, inplace=True
)

In [72]:
# Merge the dfs

rain_merged_df = pd.merge(
    rain_df, station_df, on=["latitude", "longitude"], how="inner"
)

In [73]:
rain_merged_df

Unnamed: 0.1,Unnamed: 0,date,rain,latitude,longitude,Month,Year,Station_Names,ALT,dist_to_water,Station_Number
0,0,1949-12-31 00:00:00+00:00,0.000000,22.70,90.36,12,1949,Barisal,4,0.003950,41950
1,1,1950-01-31 00:00:00+00:00,0.600000,22.70,90.36,1,1950,Barisal,4,0.003950,41950
2,2,1950-02-28 00:00:00+00:00,107.100006,22.70,90.36,2,1950,Barisal,4,0.003950,41950
3,3,1950-03-31 00:00:00+00:00,138.599990,22.70,90.36,3,1950,Barisal,4,0.003950,41950
4,4,1950-04-30 00:00:00+00:00,112.300000,22.70,90.36,4,1950,Barisal,4,0.003950,41950
...,...,...,...,...,...,...,...,...,...,...,...
29365,29365,2023-09-30 00:00:00+00:00,411.300000,20.87,92.26,9,2023,Teknaf,4,0.001473,41998
29366,29366,2023-10-31 00:00:00+00:00,144.200000,20.87,92.26,10,2023,Teknaf,4,0.001473,41998
29367,29367,2023-11-30 00:00:00+00:00,153.000000,20.87,92.26,11,2023,Teknaf,4,0.001473,41998
29368,29368,2023-12-31 00:00:00+00:00,55.099990,20.87,92.26,12,2023,Teknaf,4,0.001473,41998


In [74]:
# Get original df
original_df = pd.read_csv(config["data"]["processed_data_path"])

In [75]:
original_df

Unnamed: 0,Sl,Station_Names,Year,Month,Max_Temp,Min_Temp,Rainfall,Relative_Humidity,Wind_Speed,Cloud_Coverage,Bright_Sunshine,Station_Number,X_COR,Y_COR,LATITUDE,LONGITUDE,ALT,Period,Flood
0,0,Barisal,1949,1,29.4,12.3,0.0,68.0,0.453704,0.6,7.831915,41950,536809.8,510151.9,22.70,90.36,4,1949.01,0
1,1,Barisal,1949,2,33.9,15.2,9.0,63.0,0.659259,0.9,8.314894,41950,536809.8,510151.9,22.70,90.36,4,1949.02,0
2,2,Barisal,1949,3,36.7,20.2,8.0,59.0,1.085185,1.5,8.131915,41950,536809.8,510151.9,22.70,90.36,4,1949.03,0
3,3,Barisal,1949,4,33.9,23.9,140.0,71.0,1.772222,3.9,8.219149,41950,536809.8,510151.9,22.70,90.36,4,1949.04,0
4,4,Barisal,1949,5,35.6,25.0,217.0,76.0,1.703704,4.1,7.046809,41950,536809.8,510151.9,22.70,90.36,4,1949.05,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20539,20539,Teknaf,2013,8,32.5,25.4,851.0,88.0,2.500000,6.2,3.800000,41998,734765.4,308914.1,20.87,92.26,4,2013.08,1
20540,20540,Teknaf,2013,9,32.6,25.7,329.0,88.0,3.000000,6.1,4.200000,41998,734765.4,308914.1,20.87,92.26,4,2013.09,0
20541,20541,Teknaf,2013,10,33.1,24.8,271.0,85.0,2.500000,4.4,5.600000,41998,734765.4,308914.1,20.87,92.26,4,2013.10,0
20542,20542,Teknaf,2013,11,32.5,20.0,0.0,79.0,2.100000,1.7,8.400000,41998,734765.4,308914.1,20.87,92.26,4,2013.11,0


In [76]:
# Get subset
subset_df = original_df[["Year", "Month", "Station_Names", "Flood"]]

# Merge the datasets
rain_with_label_df = pd.merge(
    rain_merged_df, subset_df, on=["Year", "Month", "Station_Names"], how="outer"
)

In [77]:
rain_with_label_df

Unnamed: 0.1,Unnamed: 0,date,rain,latitude,longitude,Month,Year,Station_Names,ALT,dist_to_water,Station_Number,Flood
0,,NaT,,,,1,1948,Bogra,,,,0.0
1,,NaT,,,,1,1948,Comilla,,,,0.0
2,,NaT,,,,1,1948,Cox's Bazar,,,,0.0
3,,NaT,,,,1,1948,Dinajpur,,,,0.0
4,,NaT,,,,1,1948,Faridpur,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
29617,25809.0,2024-01-31 00:00:00+00:00,13.700002,22.64,91.64,1,2024,Sitakunda,4.0,0.002559,41965.0,
29618,26699.0,2024-01-31 00:00:00+00:00,0.800000,24.29,91.73,1,2024,Srimangal,23.0,0.013079,41915.0,
29619,27589.0,2024-01-31 00:00:00+00:00,0.200000,24.88,91.93,1,2024,Sylhet,35.0,0.004670,41891.0,
29620,28479.0,2024-01-31 00:00:00+00:00,1.000000,24.15,89.55,1,2024,Tangail,10.0,0.003868,41909.0,


In [78]:
# Drop rows with NaN values in the "rain" column
df = rain_with_label_df.dropna(subset=["rain"])

# Reset index after dropping rows
df.reset_index(drop=True, inplace=True)

In [79]:
df

Unnamed: 0.1,Unnamed: 0,date,rain,latitude,longitude,Month,Year,Station_Names,ALT,dist_to_water,Station_Number,Flood
0,0.0,1949-12-31 00:00:00+00:00,0.000000,22.70,90.3600,12,1949,Barisal,4.0,0.003950,41950.0,0.0
1,890.0,1949-12-31 00:00:00+00:00,0.000000,22.70,90.6600,12,1949,Bhola,5.0,0.021247,41951.0,
2,1780.0,1949-12-31 00:00:00+00:00,0.000000,24.88,89.3600,12,1949,Bogra,20.0,0.003550,41883.0,0.0
3,2670.0,1949-12-31 00:00:00+00:00,0.000000,23.26,90.6700,12,1949,Chandpur,7.0,0.058314,41941.0,
4,3560.0,1949-12-31 00:00:00+00:00,0.000000,22.35,91.8166,12,1949,Chittagong (City-Ambagan),0.0,0.029867,41977.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
29365,25809.0,2024-01-31 00:00:00+00:00,13.700002,22.64,91.6400,1,2024,Sitakunda,4.0,0.002559,41965.0,
29366,26699.0,2024-01-31 00:00:00+00:00,0.800000,24.29,91.7300,1,2024,Srimangal,23.0,0.013079,41915.0,
29367,27589.0,2024-01-31 00:00:00+00:00,0.200000,24.88,91.9300,1,2024,Sylhet,35.0,0.004670,41891.0,
29368,28479.0,2024-01-31 00:00:00+00:00,1.000000,24.15,89.5500,1,2024,Tangail,10.0,0.003868,41909.0,


In [80]:
labelled_df = df[df["Flood"].notna()].copy()

In [81]:
# Extract where df flood is none:
unlabelled_df = df[df["Flood"].isna()].copy()

In [33]:
unlabelled_df

Unnamed: 0.1,Unnamed: 0,date,rain,latitude,longitude,Month,Year,Station_Names,ALT,dist_to_water,Station_Number,Flood
1,890.0,1949-12-31 00:00:00+00:00,0.000000,22.70,90.6600,12,1949,Bhola,5.0,0.021247,41951.0,
3,2670.0,1949-12-31 00:00:00+00:00,0.000000,23.26,90.6700,12,1949,Chandpur,7.0,0.058314,41941.0,
4,3560.0,1949-12-31 00:00:00+00:00,0.000000,22.35,91.8166,12,1949,Chittagong (City-Ambagan),0.0,0.029867,41977.0,
8,7120.0,1949-12-31 00:00:00+00:00,0.000000,23.78,90.3900,12,1949,Dhaka,9.0,0.046219,41923.0,
11,9790.0,1949-12-31 00:00:00+00:00,0.000000,23.01,91.3700,12,1949,Feni,8.0,0.014935,41943.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
29365,25809.0,2024-01-31 00:00:00+00:00,13.700002,22.64,91.6400,1,2024,Sitakunda,4.0,0.002559,41965.0,
29366,26699.0,2024-01-31 00:00:00+00:00,0.800000,24.29,91.7300,1,2024,Srimangal,23.0,0.013079,41915.0,
29367,27589.0,2024-01-31 00:00:00+00:00,0.200000,24.88,91.9300,1,2024,Sylhet,35.0,0.004670,41891.0,
29368,28479.0,2024-01-31 00:00:00+00:00,1.000000,24.15,89.5500,1,2024,Tangail,10.0,0.003868,41909.0,


In [82]:
labelled_df

Unnamed: 0.1,Unnamed: 0,date,rain,latitude,longitude,Month,Year,Station_Names,ALT,dist_to_water,Station_Number,Flood
0,0.0,1949-12-31 00:00:00+00:00,0.0,22.70,90.36,12,1949,Barisal,4.0,0.003950,41950.0,0.0
2,1780.0,1949-12-31 00:00:00+00:00,0.0,24.88,89.36,12,1949,Bogra,20.0,0.003550,41883.0,0.0
5,4450.0,1949-12-31 00:00:00+00:00,0.0,22.34,91.79,12,1949,Chittagong (IAP-Patenga),6.0,0.034664,41978.0,0.0
6,5340.0,1949-12-31 00:00:00+00:00,0.0,23.48,91.19,12,1949,Comilla,10.0,0.001917,41933.0,0.0
7,6230.0,1949-12-31 00:00:00+00:00,0.0,21.46,91.98,12,1949,Cox's Bazar,4.0,0.002844,41992.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
25372,25688.0,2013-12-31 00:00:00+00:00,0.0,22.64,91.64,12,2013,Sitakunda,4.0,0.002559,41965.0,0.0
25373,26578.0,2013-12-31 00:00:00+00:00,0.8,24.29,91.73,12,2013,Srimangal,23.0,0.013079,41915.0,0.0
25374,27468.0,2013-12-31 00:00:00+00:00,2.8,24.88,91.93,12,2013,Sylhet,35.0,0.004670,41891.0,0.0
25375,28358.0,2013-12-31 00:00:00+00:00,0.4,24.15,89.55,12,2013,Tangail,10.0,0.003868,41909.0,0.0


In [83]:
labelled_df["rain_latitude"] = labelled_df["rain"] * labelled_df["latitude"]
labelled_df["rain_longitude"] = labelled_df["rain"] * labelled_df["longitude"]

In [84]:
labelled_df

Unnamed: 0.1,Unnamed: 0,date,rain,latitude,longitude,Month,Year,Station_Names,ALT,dist_to_water,Station_Number,Flood,rain_latitude,rain_longitude
0,0.0,1949-12-31 00:00:00+00:00,0.0,22.70,90.36,12,1949,Barisal,4.0,0.003950,41950.0,0.0,0.000000,0.000000
2,1780.0,1949-12-31 00:00:00+00:00,0.0,24.88,89.36,12,1949,Bogra,20.0,0.003550,41883.0,0.0,0.000000,0.000000
5,4450.0,1949-12-31 00:00:00+00:00,0.0,22.34,91.79,12,1949,Chittagong (IAP-Patenga),6.0,0.034664,41978.0,0.0,0.000000,0.000000
6,5340.0,1949-12-31 00:00:00+00:00,0.0,23.48,91.19,12,1949,Comilla,10.0,0.001917,41933.0,0.0,0.000000,0.000000
7,6230.0,1949-12-31 00:00:00+00:00,0.0,21.46,91.98,12,1949,Cox's Bazar,4.0,0.002844,41992.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25372,25688.0,2013-12-31 00:00:00+00:00,0.0,22.64,91.64,12,2013,Sitakunda,4.0,0.002559,41965.0,0.0,0.000000,0.000000
25373,26578.0,2013-12-31 00:00:00+00:00,0.8,24.29,91.73,12,2013,Srimangal,23.0,0.013079,41915.0,0.0,19.432000,73.384000
25374,27468.0,2013-12-31 00:00:00+00:00,2.8,24.88,91.93,12,2013,Sylhet,35.0,0.004670,41891.0,0.0,69.664000,257.404000
25375,28358.0,2013-12-31 00:00:00+00:00,0.4,24.15,89.55,12,2013,Tangail,10.0,0.003868,41909.0,0.0,9.660000,35.820000


In [85]:
# Load data

label = "Flood"
train_data = labelled_df[
    [
        "rain",
        "Station_Number",
        "ALT",
        "rain_latitude",
        "rain_longitude",
        "dist_to_water",
        "Flood",
    ]
]

# Split the data into train and validation sets
train, val = train_test_split(
    train_data, test_size=0.1, stratify=train_data[label], random_state=42
)
train = TabularDataset(train)
val = TabularDataset(val)
# Run AutoML with stratified cross-validation
predictor = TabularPredictor(
    label=label, eval_metric="f1", sample_weight="auto_weight"
).fit(train_data, presets=["medium_quality", "optimize_for_deployment"])

# Get the leaderboard of models
predictor.leaderboard(val)

No path specified. Models will be saved in: "AutogluonModels\ag-20240527_124543"
Presets specified: ['medium_quality', 'optimize_for_deployment']
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20240527_124543"
AutoGluon Version:  1.1.0
Python Version:     3.11.5
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          8
Memory Avail:       2.12 GB / 15.85 GB (13.4%)
Disk Space Avail:   95.84 GB / 475.53 GB (20.2%)
Train Data Rows:    20292
Train Data Columns: 6
Label Column:       Flood
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0.0, 1.0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Problem Type:       binary
Preprocessing data ...
Selected class <--> label mapping: 

	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 6 | ['rain', 'Station_Number', 'ALT', 'rain_latitude', 'rain_longitude', ...]
	Types of features in processed data (raw dtype, special dtypes):
		('float', []) : 6 | ['rain', 'Station_Number', 'ALT', 'rain_latitude', 'rain_longitude', ...]
	0.1s = Fit runtime
	6 features in original data used to generate 6 features in processed data.
	Train Data (Processed) Memory Usage: 0.93 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.15s ...
AutoGluon

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.702703,0.750894,f1,0.116825,0.17266,136.998566,0.004999,0.003999,1.144157,2,True,3
1,NeuralNetTorch,0.70184,0.746163,f1,0.030175,0.037045,84.425326,0.030175,0.037045,84.425326,1,True,2
2,NeuralNetFastAI,0.694894,0.746411,f1,0.081651,0.131615,51.429084,0.081651,0.131615,51.429084,1,True,1


In [35]:
unlabelled_df.rename(columns={"rain": "Rainfall"}, inplace=True)
unlabelled_df["rain_latitude"] = unlabelled_df["Rainfall"] * unlabelled_df["latitude"]
unlabelled_df["rain_longitude"] = unlabelled_df["Rainfall"] * unlabelled_df["longitude"]
to_label_df = unlabelled_df[
    [
        "Rainfall",
        "Station_Number",
        "ALT",
        "rain_latitude",
        "rain_longitude",
        "dist_to_water",
        "Flood",
        "Month",
        "Year",
    ]
]
to_label_df

Unnamed: 0,Rainfall,Station_Number,ALT,rain_latitude,rain_longitude,dist_to_water,Flood,Month,Year
1,0.000000,41951.0,5.0,0.000000,0.000000,0.021247,,12,1949
3,0.000000,41941.0,7.0,0.000000,0.000000,0.058314,,12,1949
4,0.000000,41977.0,0.0,0.000000,0.000000,0.029867,,12,1949
8,0.000000,41923.0,9.0,0.000000,0.000000,0.046219,,12,1949
11,0.000000,41943.0,8.0,0.000000,0.000000,0.014935,,12,1949
...,...,...,...,...,...,...,...,...,...
29365,13.700002,41965.0,4.0,310.168045,1255.468183,0.002559,,1,2024
29366,0.800000,41915.0,23.0,19.432002,73.384009,0.013079,,1,2024
29367,0.200000,41891.0,35.0,4.976000,18.386000,0.004670,,1,2024
29368,1.000000,41909.0,10.0,24.150000,89.550000,0.003868,,1,2024


In [37]:
predict_df = to_label_df[
    [
        "Rainfall",
        "Station_Number",
        "ALT",
        "rain_latitude",
        "rain_longitude",
        "dist_to_water",
    ]
]
predict_df = TabularDataset(predict_df)
predictor = TabularPredictor.load("AutogluonModels/ag-20240527_110824")
predictions = predictor.predict(predict_df)
to_label_df.loc[:, "Flood"] = predictions
to_label_df

Unnamed: 0,Rainfall,Station_Number,ALT,rain_latitude,rain_longitude,dist_to_water,Flood,Month,Year
1,0.000000,41951.0,5.0,0.000000,0.000000,0.021247,0,12,1949
3,0.000000,41941.0,7.0,0.000000,0.000000,0.058314,0,12,1949
4,0.000000,41977.0,0.0,0.000000,0.000000,0.029867,0,12,1949
8,0.000000,41923.0,9.0,0.000000,0.000000,0.046219,0,12,1949
11,0.000000,41943.0,8.0,0.000000,0.000000,0.014935,0,12,1949
...,...,...,...,...,...,...,...,...,...
29365,13.700002,41965.0,4.0,310.168045,1255.468183,0.002559,0,1,2024
29366,0.800000,41915.0,23.0,19.432002,73.384009,0.013079,0,1,2024
29367,0.200000,41891.0,35.0,4.976000,18.386000,0.004670,0,1,2024
29368,1.000000,41909.0,10.0,24.150000,89.550000,0.003868,0,1,2024


In [38]:
to_label_df["Flood"].value_counts()

Flood
0    7447
1    1631
Name: count, dtype: int64

In [40]:
df

Unnamed: 0.1,Unnamed: 0,date,rain,latitude,longitude,Month,Year,Station_Names,ALT,dist_to_water,Station_Number,Flood
0,0.0,1949-12-31 00:00:00+00:00,0.000000,22.70,90.3600,12,1949,Barisal,4.0,0.003950,41950.0,0.0
1,890.0,1949-12-31 00:00:00+00:00,0.000000,22.70,90.6600,12,1949,Bhola,5.0,0.021247,41951.0,
2,1780.0,1949-12-31 00:00:00+00:00,0.000000,24.88,89.3600,12,1949,Bogra,20.0,0.003550,41883.0,0.0
3,2670.0,1949-12-31 00:00:00+00:00,0.000000,23.26,90.6700,12,1949,Chandpur,7.0,0.058314,41941.0,
4,3560.0,1949-12-31 00:00:00+00:00,0.000000,22.35,91.8166,12,1949,Chittagong (City-Ambagan),0.0,0.029867,41977.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
29365,25809.0,2024-01-31 00:00:00+00:00,13.700002,22.64,91.6400,1,2024,Sitakunda,4.0,0.002559,41965.0,
29366,26699.0,2024-01-31 00:00:00+00:00,0.800000,24.29,91.7300,1,2024,Srimangal,23.0,0.013079,41915.0,
29367,27589.0,2024-01-31 00:00:00+00:00,0.200000,24.88,91.9300,1,2024,Sylhet,35.0,0.004670,41891.0,
29368,28479.0,2024-01-31 00:00:00+00:00,1.000000,24.15,89.5500,1,2024,Tangail,10.0,0.003868,41909.0,


In [43]:
to_label_df.index

Index([    1,     3,     4,     8,    11,    12,    13,    15,    17,    18,
       ...
       29360, 29361, 29362, 29363, 29364, 29365, 29366, 29367, 29368, 29369],
      dtype='int64', length=9078)

In [44]:
df.loc[to_label_df.index, "Flood"] = predictions

In [48]:
df

Unnamed: 0.1,Unnamed: 0,date,rain,latitude,longitude,Month,Year,Station_Names,ALT,dist_to_water,Station_Number,Flood
0,0.0,1949-12-31 00:00:00+00:00,0.000000,22.70,90.3600,12,1949,Barisal,4.0,0.003950,41950.0,0.0
1,890.0,1949-12-31 00:00:00+00:00,0.000000,22.70,90.6600,12,1949,Bhola,5.0,0.021247,41951.0,0.0
2,1780.0,1949-12-31 00:00:00+00:00,0.000000,24.88,89.3600,12,1949,Bogra,20.0,0.003550,41883.0,0.0
3,2670.0,1949-12-31 00:00:00+00:00,0.000000,23.26,90.6700,12,1949,Chandpur,7.0,0.058314,41941.0,0.0
4,3560.0,1949-12-31 00:00:00+00:00,0.000000,22.35,91.8166,12,1949,Chittagong (City-Ambagan),0.0,0.029867,41977.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
29365,25809.0,2024-01-31 00:00:00+00:00,13.700002,22.64,91.6400,1,2024,Sitakunda,4.0,0.002559,41965.0,0.0
29366,26699.0,2024-01-31 00:00:00+00:00,0.800000,24.29,91.7300,1,2024,Srimangal,23.0,0.013079,41915.0,0.0
29367,27589.0,2024-01-31 00:00:00+00:00,0.200000,24.88,91.9300,1,2024,Sylhet,35.0,0.004670,41891.0,0.0
29368,28479.0,2024-01-31 00:00:00+00:00,1.000000,24.15,89.5500,1,2024,Tangail,10.0,0.003868,41909.0,0.0


In [50]:
df = df.copy()
df["rain_latitude"] = df["rain"] * df["latitude"]
df["rain_longitude"] = df["rain"] * df["longitude"]

In [51]:
df

Unnamed: 0.1,Unnamed: 0,date,rain,latitude,longitude,Month,Year,Station_Names,ALT,dist_to_water,Station_Number,Flood,rain_latitude,rain_longitude
0,0.0,1949-12-31 00:00:00+00:00,0.000000,22.70,90.3600,12,1949,Barisal,4.0,0.003950,41950.0,0.0,0.000000,0.000000
1,890.0,1949-12-31 00:00:00+00:00,0.000000,22.70,90.6600,12,1949,Bhola,5.0,0.021247,41951.0,0.0,0.000000,0.000000
2,1780.0,1949-12-31 00:00:00+00:00,0.000000,24.88,89.3600,12,1949,Bogra,20.0,0.003550,41883.0,0.0,0.000000,0.000000
3,2670.0,1949-12-31 00:00:00+00:00,0.000000,23.26,90.6700,12,1949,Chandpur,7.0,0.058314,41941.0,0.0,0.000000,0.000000
4,3560.0,1949-12-31 00:00:00+00:00,0.000000,22.35,91.8166,12,1949,Chittagong (City-Ambagan),0.0,0.029867,41977.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29365,25809.0,2024-01-31 00:00:00+00:00,13.700002,22.64,91.6400,1,2024,Sitakunda,4.0,0.002559,41965.0,0.0,310.168045,1255.468183
29366,26699.0,2024-01-31 00:00:00+00:00,0.800000,24.29,91.7300,1,2024,Srimangal,23.0,0.013079,41915.0,0.0,19.432002,73.384009
29367,27589.0,2024-01-31 00:00:00+00:00,0.200000,24.88,91.9300,1,2024,Sylhet,35.0,0.004670,41891.0,0.0,4.976000,18.386000
29368,28479.0,2024-01-31 00:00:00+00:00,1.000000,24.15,89.5500,1,2024,Tangail,10.0,0.003868,41909.0,0.0,24.150000,89.550000


In [53]:
supplementary_df = df[
    [
        "rain",
        "Station_Number",
        "ALT",
        "rain_latitude",
        "rain_longitude",
        "dist_to_water",
        "Flood",
    ]
]

In [60]:
# Split the data
supplementary_modelling_data = supplementary_df.copy()

train, test = train_test_split(
    supplementary_modelling_data,
    test_size=0.1,
    random_state=42,
    stratify=supplementary_modelling_data["Flood"],
)

In [64]:
# Save the data
train.to_csv(config["data"]["train_data_v2"], index=False)
test.to_csv(config["data"]["test_data_v2"], index=False)