# Feature Engineering
In this notebook, the features used for modelling are created and the train/test datasets are saved.

## Import Libraries

In [19]:
# Set configuration for notebook
import os

# Run the notebook from the project root; the `src` import and the relative
# "data/..." paths used further down presumably rely on this.
# The root can be overridden per machine through the FLOOD_PREDICTION_ROOT
# environment variable. The original hard-coded location stays as the default,
# so existing setups behave exactly as before.
PROJECT_ROOT = os.environ.get("FLOOD_PREDICTION_ROOT", 'c:\\Users\\Spectra\\flood-prediction')
os.chdir(PROJECT_ROOT)
os.getcwd()

'c:\\Users\\Spectra\\flood-prediction'

In [87]:
import pandas as pd
import numpy as np
from src.utils import load_config
from sklearn.feature_selection import RFE 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [21]:
# Load the config
# `load_config` comes from src.utils; later cells read file locations from
# config["data"] (processed, train and test data paths).
config = load_config()


# Load the data

In [22]:
# Read the processed dataset from the location given in the config
processed_data_path = config["data"]["processed_data_path"]
df = pd.read_csv(processed_data_path)

In [81]:
# Inspect the available columns.
# NOTE(review): the saved output already lists the engineered columns
# (Avg_Temp, rain_latitude, rain_longitude), so this cell was last executed
# after the feature cells further down. Re-run from a fresh kernel to see the
# schema as loaded from disk.
df.columns

Index(['Station_Names', 'Year', 'Month', 'Max_Temp', 'Min_Temp', 'Rainfall',
       'Relative_Humidity', 'Wind_Speed', 'Cloud_Coverage', 'Bright_Sunshine',
       'Station_Number', 'LATITUDE', 'LONGITUDE', 'ALT', 'Period', 'Flood',
       'Avg_Temp', 'rain_latitude', 'rain_longitude'],
      dtype='object')

## Feature Engineering

**Drop highly correlated columns**

In [24]:
# Remove the columns flagged as highly correlated.
# Rebinding `df` with errors="ignore" (instead of an in-place drop) keeps this
# cell idempotent: re-running it once the columns are gone is a no-op rather
# than a KeyError.
df = df.drop(columns=["X_COR", "Y_COR", "Sl"], errors="ignore")

**Create Average Temperature**


In [25]:
# Average temperature = midpoint of the maximum and minimum temperature
df["Avg_Temp"] = df["Max_Temp"].add(df["Min_Temp"]).div(2)

**Create spatial features**


In [28]:
# Interaction terms: rainfall scaled by each station coordinate
rainfall = df['Rainfall']
df['rain_latitude'] = rainfall.mul(df['LATITUDE'])
df['rain_longitude'] = rainfall.mul(df['LONGITUDE'])

**Create distance to waterbodies feature**

In [26]:
# One row per distinct (station name, latitude, longitude) combination
station_columns = ['Station_Names', 'LATITUDE', 'LONGITUDE']
station_df = df.loc[:, station_columns].drop_duplicates()

In [29]:
import os

import geopandas as gpd


# Local shapefile holding the river geometries used as the "water bodies" layer.
RIVERS_SHAPEFILE = "data/raw/bgd_hyd_rivers_lged.shp"

# Build one point geometry per station (x = longitude, y = latitude).
gdf = gpd.GeoDataFrame(
    station_df,
    geometry=gpd.points_from_xy(station_df.LONGITUDE, station_df.LATITUDE),
)

# Load the river geometries.
try:
    water_bodies = gpd.read_file(RIVERS_SHAPEFILE)
except Exception:
    # The previous fallback repeated the identical call, so it could never
    # recover. Ask GDAL to rebuild a missing/unreadable .shx index file and
    # retry once; any other failure (e.g. the .shp itself is absent) still
    # raises from the retry instead of being hidden.
    os.environ["SHAPE_RESTORE_SHX"] = "YES"
    water_bodies = gpd.read_file(RIVERS_SHAPEFILE)

# Distance from each station to its nearest river geometry.
# NOTE(review): `gdf` has no CRS and is built from latitude/longitude, so these
# distances are presumably in degrees rather than metres — confirm. Reprojecting
# to a metric CRS would change the stored feature values, so it is not done here.
gdf['dist_to_water'] = gdf.geometry.apply(lambda point: water_bodies.distance(point).min())

# Copy the distances back onto the station table (aligned on the index).
station_df['dist_to_water'] = gdf['dist_to_water']


In [30]:
# Preview the per-station table, which now carries the dist_to_water column.
station_df

Unnamed: 0,Station_Names,LATITUDE,LONGITUDE,dist_to_water
0,Barisal,22.7,90.36,0.00395
780,Bhola,22.7,90.66,0.021247
1356,Bogra,24.88,89.36,0.00355
2148,Chandpur,23.26,90.67,0.058314
2748,Chittagong (City-Ambagan),22.35,91.8166,0.029867
2796,Chittagong (IAP-Patenga),22.34,91.79,0.034664
3576,Comilla,23.48,91.19,0.001917
4368,Cox's Bazar,21.46,91.98,0.002844
5160,Dhaka,23.78,90.39,0.046219
5892,Dinajpur,25.63,88.66,0.011919


In [103]:
# Attach the per-station table (including dist_to_water) to every observation.
# Both frames carry LATITUDE/LONGITUDE, so pandas suffixes them _x (left) and
# _y (right); only the station_df copies (_y) are kept.
# NOTE(review): this assumes Station_Names is unique in station_df — a station
# listed with more than one coordinate pair would duplicate rows here.
merged_df = (
    df.merge(station_df, how="left", on="Station_Names")
      .drop(columns=["LATITUDE_x", "LONGITUDE_x"])
)

In [104]:
# Preview the observations after the station-level merge.
merged_df

Unnamed: 0,Station_Names,Year,Month,Max_Temp,Min_Temp,Rainfall,Relative_Humidity,Wind_Speed,Cloud_Coverage,Bright_Sunshine,Station_Number,ALT,Period,Flood,Avg_Temp,rain_latitude,rain_longitude,LATITUDE_y,LONGITUDE_y,dist_to_water
0,Barisal,1949,1,29.4,12.3,0.0,68.0,0.453704,0.6,7.831915,41950,4,1949.01,0,20.85,0.00,0.00,22.70,90.36,0.003950
1,Barisal,1949,2,33.9,15.2,9.0,63.0,0.659259,0.9,8.314894,41950,4,1949.02,0,24.55,204.30,813.24,22.70,90.36,0.003950
2,Barisal,1949,3,36.7,20.2,8.0,59.0,1.085185,1.5,8.131915,41950,4,1949.03,0,28.45,181.60,722.88,22.70,90.36,0.003950
3,Barisal,1949,4,33.9,23.9,140.0,71.0,1.772222,3.9,8.219149,41950,4,1949.04,0,28.90,3178.00,12650.40,22.70,90.36,0.003950
4,Barisal,1949,5,35.6,25.0,217.0,76.0,1.703704,4.1,7.046809,41950,4,1949.05,0,30.30,4925.90,19608.12,22.70,90.36,0.003950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20539,Teknaf,2013,8,32.5,25.4,851.0,88.0,2.500000,6.2,3.800000,41998,4,2013.08,1,28.95,17760.37,78513.26,20.87,92.26,0.001473
20540,Teknaf,2013,9,32.6,25.7,329.0,88.0,3.000000,6.1,4.200000,41998,4,2013.09,0,29.15,6866.23,30353.54,20.87,92.26,0.001473
20541,Teknaf,2013,10,33.1,24.8,271.0,85.0,2.500000,4.4,5.600000,41998,4,2013.10,0,28.95,5655.77,25002.46,20.87,92.26,0.001473
20542,Teknaf,2013,11,32.5,20.0,0.0,79.0,2.100000,1.7,8.400000,41998,4,2013.11,0,26.25,0.00,0.00,20.87,92.26,0.001473


In [127]:
# Station-level reference table: one row per distinct combination of the
# static station attributes.
# NOTE(review): this rebinds `station_df` to a frame with a different schema
# than the one merged into `df` earlier, so re-running the merge cell after
# this one would not behave the same.
reference_columns = [
    'Station_Names',
    'LATITUDE_y',
    'LONGITUDE_y',
    'ALT',
    'dist_to_water',
    'Station_Number',
]
station_df = merged_df.loc[:, reference_columns].drop_duplicates()
station_df

Unnamed: 0,Station_Names,LATITUDE_y,LONGITUDE_y,ALT,dist_to_water,Station_Number
0,Barisal,22.7,90.36,4,0.00395,41950
780,Bhola,22.7,90.66,5,0.021247,41951
1356,Bogra,24.88,89.36,20,0.00355,41883
2148,Chandpur,23.26,90.67,7,0.058314,41941
2748,Chittagong (City-Ambagan),22.35,91.8166,0,0.029867,41977
2796,Chittagong (IAP-Patenga),22.34,91.79,6,0.034664,41978
3576,Comilla,23.48,91.19,10,0.001917,41933
4368,Cox's Bazar,21.46,91.98,4,0.002844,41992
5160,Dhaka,23.78,90.39,9,0.046219,41923
5892,Dinajpur,25.63,88.66,37,0.011919,41863


In [128]:
# Persist the station reference table without the index column
reference_csv_path = "data/reference_data/reference.csv"
station_df.to_csv(reference_csv_path, index=False)

**Get river discharge data**

In [42]:
# Read the raw river discharge table
river_discharge_path = "data/raw/river_discharge.csv"
river_discharge_df = pd.read_csv(river_discharge_path)

In [43]:
# Preview the river discharge table as loaded from disk.
river_discharge_df

Unnamed: 0,date,river_discharge,LATITUDE,LONGITUDE,Station_Names
0,1985-01-31 00:00:00+00:00,0.217383,22.70,90.36,Barisal
1,1985-02-28 00:00:00+00:00,0.186330,22.70,90.36,Barisal
2,1985-03-31 00:00:00+00:00,0.261097,22.70,90.36,Barisal
3,1985-04-30 00:00:00+00:00,0.240555,22.70,90.36,Barisal
4,1985-05-31 00:00:00+00:00,0.846616,22.70,90.36,Barisal
...,...,...,...,...,...
11512,2013-09-30 00:00:00+00:00,3.317700,20.87,92.26,Teknaf
11513,2013-10-31 00:00:00+00:00,2.120073,20.87,92.26,Teknaf
11514,2013-11-30 00:00:00+00:00,0.810497,20.87,92.26,Teknaf
11515,2013-12-31 00:00:00+00:00,0.285147,20.87,92.26,Teknaf


In [46]:
# Parse the date column once, then derive the Month and Year join keys from it
discharge_dates = pd.to_datetime(river_discharge_df['date'])
river_discharge_df['date'] = discharge_dates
river_discharge_df['Month'] = discharge_dates.dt.month
river_discharge_df['Year'] = discharge_dates.dt.year

In [47]:
# Check that the Month and Year columns were added.
river_discharge_df

Unnamed: 0,date,river_discharge,LATITUDE,LONGITUDE,Station_Names,Month,Year
0,1985-01-31 00:00:00+00:00,0.217383,22.70,90.36,Barisal,1,1985
1,1985-02-28 00:00:00+00:00,0.186330,22.70,90.36,Barisal,2,1985
2,1985-03-31 00:00:00+00:00,0.261097,22.70,90.36,Barisal,3,1985
3,1985-04-30 00:00:00+00:00,0.240555,22.70,90.36,Barisal,4,1985
4,1985-05-31 00:00:00+00:00,0.846616,22.70,90.36,Barisal,5,1985
...,...,...,...,...,...,...,...
11512,2013-09-30 00:00:00+00:00,3.317700,20.87,92.26,Teknaf,9,2013
11513,2013-10-31 00:00:00+00:00,2.120073,20.87,92.26,Teknaf,10,2013
11514,2013-11-30 00:00:00+00:00,0.810497,20.87,92.26,Teknaf,11,2013
11515,2013-12-31 00:00:00+00:00,0.285147,20.87,92.26,Teknaf,12,2013


In [52]:
# Keep observations from 1985 onwards (the river discharge table displayed
# above starts in January 1985)
first_discharge_year = 1985
from_1985_df = merged_df[merged_df["Year"].ge(first_discharge_year)]
from_1985_df

Unnamed: 0,Station_Names,Year,Month,Max_Temp,Min_Temp,Rainfall,Relative_Humidity,Wind_Speed,Cloud_Coverage,Bright_Sunshine,Station_Number,ALT,Period,Flood,Avg_Temp,rain_latitude,rain_longitude,dist_to_water
432,Barisal,1985,1,30.2,12.6,1.0,84.0,0.2,1.4,7.5,41950,4,1985.01,0,21.40,22.70,90.36,0.003950
433,Barisal,1985,2,31.4,14.3,2.0,77.0,0.6,1.1,8.6,41950,4,1985.02,0,22.85,45.40,180.72,0.003950
434,Barisal,1985,3,36.7,22.4,45.0,81.0,1.8,3.6,7.5,41950,4,1985.03,0,29.55,1021.50,4066.20,0.003950
435,Barisal,1985,4,36.6,24.6,64.0,81.0,1.9,4.7,8.4,41950,4,1985.04,0,30.60,1452.80,5783.04,0.003950
436,Barisal,1985,5,35.0,24.2,226.0,88.0,1.7,5.1,7.9,41950,4,1985.05,1,29.60,5130.20,20421.36,0.003950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20539,Teknaf,2013,8,32.5,25.4,851.0,88.0,2.5,6.2,3.8,41998,4,2013.08,1,28.95,17760.37,78513.26,0.001473
20540,Teknaf,2013,9,32.6,25.7,329.0,88.0,3.0,6.1,4.2,41998,4,2013.09,0,29.15,6866.23,30353.54,0.001473
20541,Teknaf,2013,10,33.1,24.8,271.0,85.0,2.5,4.4,5.6,41998,4,2013.10,0,28.95,5655.77,25002.46,0.001473
20542,Teknaf,2013,11,32.5,20.0,0.0,79.0,2.1,1.7,8.4,41998,4,2013.11,0,26.25,0.00,0.00,0.001473


In [53]:
# Join river discharge onto the post-1985 observations; the inner join keeps
# only station/month/year combinations present in both tables
join_keys = ["Month", "Year", "Station_Names"]
full_merged_df = from_1985_df.merge(river_discharge_df, how='inner', on=join_keys)
full_merged_df

Unnamed: 0,Station_Names,Year,Month,Max_Temp,Min_Temp,Rainfall,Relative_Humidity,Wind_Speed,Cloud_Coverage,Bright_Sunshine,...,Period,Flood,Avg_Temp,rain_latitude,rain_longitude,dist_to_water,date,river_discharge,LATITUDE,LONGITUDE
0,Barisal,1985,1,30.2,12.6,1.0,84.0,0.2,1.4,7.5,...,1985.01,0,21.40,22.70,90.36,0.003950,1985-01-31 00:00:00+00:00,0.217383,22.70,90.36
1,Barisal,1985,2,31.4,14.3,2.0,77.0,0.6,1.1,8.6,...,1985.02,0,22.85,45.40,180.72,0.003950,1985-02-28 00:00:00+00:00,0.186330,22.70,90.36
2,Barisal,1985,3,36.7,22.4,45.0,81.0,1.8,3.6,7.5,...,1985.03,0,29.55,1021.50,4066.20,0.003950,1985-03-31 00:00:00+00:00,0.261097,22.70,90.36
3,Barisal,1985,4,36.6,24.6,64.0,81.0,1.9,4.7,8.4,...,1985.04,0,30.60,1452.80,5783.04,0.003950,1985-04-30 00:00:00+00:00,0.240555,22.70,90.36
4,Barisal,1985,5,35.0,24.2,226.0,88.0,1.7,5.1,7.9,...,1985.05,1,29.60,5130.20,20421.36,0.003950,1985-05-31 00:00:00+00:00,0.846616,22.70,90.36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11107,Teknaf,2013,8,32.5,25.4,851.0,88.0,2.5,6.2,3.8,...,2013.08,1,28.95,17760.37,78513.26,0.001473,2013-08-31 00:00:00+00:00,7.071018,20.87,92.26
11108,Teknaf,2013,9,32.6,25.7,329.0,88.0,3.0,6.1,4.2,...,2013.09,0,29.15,6866.23,30353.54,0.001473,2013-09-30 00:00:00+00:00,3.317700,20.87,92.26
11109,Teknaf,2013,10,33.1,24.8,271.0,85.0,2.5,4.4,5.6,...,2013.10,0,28.95,5655.77,25002.46,0.001473,2013-10-31 00:00:00+00:00,2.120073,20.87,92.26
11110,Teknaf,2013,11,32.5,20.0,0.0,79.0,2.1,1.7,8.4,...,2013.11,0,26.25,0.00,0.00,0.001473,2013-11-30 00:00:00+00:00,0.810497,20.87,92.26


In [55]:
# Preview merged_df again (the frame without river discharge).
# NOTE(review): the saved output shows no LATITUDE_y/LONGITUDE_y columns even
# though the merge cell above leaves them in merged_df, so this output is
# presumably stale from an earlier run. Re-run from a fresh kernel to confirm.
merged_df

Unnamed: 0,Station_Names,Year,Month,Max_Temp,Min_Temp,Rainfall,Relative_Humidity,Wind_Speed,Cloud_Coverage,Bright_Sunshine,Station_Number,ALT,Period,Flood,Avg_Temp,rain_latitude,rain_longitude,dist_to_water
0,Barisal,1949,1,29.4,12.3,0.0,68.0,0.453704,0.6,7.831915,41950,4,1949.01,0,20.85,0.00,0.00,0.003950
1,Barisal,1949,2,33.9,15.2,9.0,63.0,0.659259,0.9,8.314894,41950,4,1949.02,0,24.55,204.30,813.24,0.003950
2,Barisal,1949,3,36.7,20.2,8.0,59.0,1.085185,1.5,8.131915,41950,4,1949.03,0,28.45,181.60,722.88,0.003950
3,Barisal,1949,4,33.9,23.9,140.0,71.0,1.772222,3.9,8.219149,41950,4,1949.04,0,28.90,3178.00,12650.40,0.003950
4,Barisal,1949,5,35.6,25.0,217.0,76.0,1.703704,4.1,7.046809,41950,4,1949.05,0,30.30,4925.90,19608.12,0.003950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20539,Teknaf,2013,8,32.5,25.4,851.0,88.0,2.500000,6.2,3.800000,41998,4,2013.08,1,28.95,17760.37,78513.26,0.001473
20540,Teknaf,2013,9,32.6,25.7,329.0,88.0,3.000000,6.1,4.200000,41998,4,2013.09,0,29.15,6866.23,30353.54,0.001473
20541,Teknaf,2013,10,33.1,24.8,271.0,85.0,2.500000,4.4,5.600000,41998,4,2013.10,0,28.95,5655.77,25002.46,0.001473
20542,Teknaf,2013,11,32.5,20.0,0.0,79.0,2.100000,1.7,8.400000,41998,4,2013.11,0,26.25,0.00,0.00,0.001473


**Select the best features**

In [88]:
# Feature matrix: everything except the target and the identifier/date-like
# columns; target vector: the Flood label
non_feature_columns = ["Flood", "Station_Names", "date", "Period"]
X = full_merged_df.drop(columns=non_feature_columns)
y = full_merged_df["Flood"]

# Recursive feature elimination driven by an XGBoost classifier, keeping 10 features.
# NOTE(review): the selector is fitted on every row of full_merged_df, i.e.
# before the train/test split performed later in this notebook.
clf = XGBClassifier()
rfe = RFE(estimator=clf, n_features_to_select=10).fit(X, y)

# Names of the columns RFE retained
selected_features = X.columns[rfe.get_support()]

print("Selected Features:", selected_features, sep="\n")

Selected Features:
Index(['Month', 'Rainfall', 'Relative_Humidity', 'Station_Number', 'ALT',
       'rain_latitude', 'rain_longitude', 'dist_to_water', 'river_discharge',
       'LONGITUDE'],
      dtype='object')


In [95]:
# Candidate feature lists. Each list ends with the target column "Flood" so a
# single selection yields both the features and the label.
# The trailing numbers are presumably scores (%) from earlier experiments —
# TODO confirm which metric and model they refer to.

# Baseline: weather measurements plus the station identifier.
train_features_set = ['Rainfall', 'Cloud_Coverage', 'Bright_Sunshine','Station_Number', "Flood"] # 95
# Weather measurements plus Month.
train_features_set_one =  ['Rainfall', 'Max_Temp', 'Min_Temp', 'Relative_Humidity', 'Wind_Speed', 'Month', 'Flood'] # 96
# Adds the engineered spatial features (rain_latitude, rain_longitude, dist_to_water).
train_features_set_two = ['Min_Temp', 'Rainfall',
       'Cloud_Coverage', 'Station_Number', 'ALT', 'rain_latitude',
       'rain_longitude', 'dist_to_water', 'Flood'] # NeuralNetFastAI 96, XGBoost 96
# Includes river_discharge, a column that exists only in full_merged_df.
train_features_set_three = ['Rainfall', 'Relative_Humidity', 'Cloud_Coverage', 'Station_Number',
       'Min_Temp', 'rain_latitude', 'rain_longitude', 'dist_to_water',
       'river_discharge', 'Flood'] # Doesn't help as much 
# The RFE selection printed above without LONGITUDE, plus the target column.
train_features_set_four = ['Month', 'Rainfall', 'Relative_Humidity', 'Station_Number', 'ALT',
       'rain_latitude', 'rain_longitude', 'dist_to_water', 'river_discharge',
       'Flood']

We can now prepare the modelling data using one of the feature sets defined above; `train_features_set_two` is the set used for the split below.

## Split the data

In [115]:
# Build the modelling frame from the chosen feature list, then make an 80/20
# split stratified on the Flood label with a fixed seed.
# NOTE(review): rows are split at random, not by time or station — confirm
# that this is the intended evaluation setup.
modelling_data = merged_df.loc[:, train_features_set_two]
flood_labels = modelling_data["Flood"]

train, test = train_test_split(
    modelling_data,
    test_size=0.2,
    random_state=42,
    stratify=flood_labels,
)

In [116]:
# Write each split to the path configured for it (train first, then test)
for split_frame, path_key in ((train, "train_data_path"), (test, "test_data_path")):
    split_frame.to_csv(config["data"][path_key], index=False)