# Feature Engineering
This notebook creates the features that are used for modelling.

## Import Libraries

In [1]:
# Run the notebook from the project root so that the `src` package is importable
# and the relative data paths used in later cells resolve.
import os
from pathlib import Path

# Walk up from the current directory to the first folder that contains `src`,
# instead of relying on an absolute path that exists on one machine only.
_start_dir = Path.cwd()
PROJECT_ROOT = next(
    (folder for folder in (_start_dir, *_start_dir.parents) if (folder / "src").is_dir()),
    None,
)
if PROJECT_ROOT is None:
    raise FileNotFoundError(
        f"Could not find a 'src' directory in {_start_dir} or any of its parents; "
        "start Jupyter from inside the project checkout."
    )

os.chdir(PROJECT_ROOT)
os.getcwd()

'c:\\Users\\Spectra\\flood-prediction'

In [2]:
import pandas as pd
import numpy as np
from src.utils import load_config
from sklearn.feature_selection import RFE 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


In [3]:
# Load the project configuration through src.utils.load_config; later cells
# look up their CSV paths under config["data"].
config = load_config()


## Load the data

In [4]:
# Read the processed dataset from the path given in the config.
df = pd.read_csv(config["data"]["processed_data_path"])

# The CSV has a header-less first column that pandas names 'Unnamed: 0'
# (presumably a row index written out by an earlier to_csv call - TODO confirm).
# Drop it here so a plain row counter cannot reach the feature matrix.
df = df.drop(columns=[name for name in df.columns if str(name).startswith("Unnamed:")])

In [5]:
# Preview the loaded frame to check its columns and values.
df

Unnamed: 0,Sl,Station_Names,Year,Month,Max_Temp,Min_Temp,Rainfall,Relative_Humidity,Wind_Speed,Cloud_Coverage,Bright_Sunshine,Station_Number,X_COR,Y_COR,LATITUDE,LONGITUDE,ALT,Period,Flood
0,0,Barisal,1949,1,29.4,12.3,0.0,68.0,0.453704,0.6,7.831915,41950,536809.8,510151.9,22.70,90.36,4,1949.01,0
1,1,Barisal,1949,2,33.9,15.2,9.0,63.0,0.659259,0.9,8.314894,41950,536809.8,510151.9,22.70,90.36,4,1949.02,0
2,2,Barisal,1949,3,36.7,20.2,8.0,59.0,1.085185,1.5,8.131915,41950,536809.8,510151.9,22.70,90.36,4,1949.03,0
3,3,Barisal,1949,4,33.9,23.9,140.0,71.0,1.772222,3.9,8.219149,41950,536809.8,510151.9,22.70,90.36,4,1949.04,0
4,4,Barisal,1949,5,35.6,25.0,217.0,76.0,1.703704,4.1,7.046809,41950,536809.8,510151.9,22.70,90.36,4,1949.05,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20539,20539,Teknaf,2013,8,32.5,25.4,851.0,88.0,2.500000,6.2,3.800000,41998,734765.4,308914.1,20.87,92.26,4,2013.08,1
20540,20540,Teknaf,2013,9,32.6,25.7,329.0,88.0,3.000000,6.1,4.200000,41998,734765.4,308914.1,20.87,92.26,4,2013.09,0
20541,20541,Teknaf,2013,10,33.1,24.8,271.0,85.0,2.500000,4.4,5.600000,41998,734765.4,308914.1,20.87,92.26,4,2013.10,0
20542,20542,Teknaf,2013,11,32.5,20.0,0.0,79.0,2.100000,1.7,8.400000,41998,734765.4,308914.1,20.87,92.26,4,2013.11,0


## Feature Engineering

**Drop highly correlated columns**

In [6]:
# Remove the columns this section discards. Reassigning instead of dropping in
# place, and ignoring labels that are already gone, keeps the cell safe to
# re-run; the in-place version raised KeyError on a second execution.
df = df.drop(columns=["X_COR", "Y_COR", "Sl"], errors="ignore")

**Create Average Temperature**


In [7]:
# Midpoint between Max_Temp and Min_Temp for each row.
df["Avg_Temp"] = df["Max_Temp"].add(df["Min_Temp"]).div(2)

**Create spatial features**


In [8]:
from pykrige.ok import OrdinaryKriging

# Coordinates and observed rainfall for every row of df, as NumPy arrays.
longitudes = df['LONGITUDE'].values
latitudes = df['LATITUDE'].values
values = df['Rainfall'].values

# Fit one ordinary-kriging model with a spherical variogram to all rows at once.
# NOTE(review): every row is passed in, so each station's coordinates appear once
# per month (see the frame preview above) and the system has ~20k points.
# TODO confirm pykrige copes with repeated locations and this size; otherwise
# fit one model per Period instead.
OK = OrdinaryKriging(longitudes, latitudes, values,
                     variogram_model='spherical',
                     verbose=False,
                     enable_plotting=False)

# Predict at the same points that were used for fitting. Only the first element
# of the returned pair is stored; the second is discarded.
# NOTE(review): presumably the discarded element is the kriging variance - confirm.
df['rainfall_interpolated'], _ = OK.execute('points',longitudes, latitudes)

In [9]:
# Interaction features: rainfall multiplied by each geographic coordinate.
rainfall = df["Rainfall"]
interaction_sources = {"rain_latitude": "LATITUDE", "rain_longitude": "LONGITUDE"}
for new_column, coordinate in interaction_sources.items():
    df[new_column] = rainfall * df[coordinate]

**Create distance to waterbodies feature**

In [18]:
import geopandas as gpd

# River shapefile, resolved relative to the current working directory.
# A failed read is allowed to propagate: the previous fallback only repeated the
# identical read_file call, so it could never recover from the error.
water_bodies = gpd.read_file("data/raw/bgd_hyd_rivers_lged.shp")

# Many rows share the same LONGITUDE/LATITUDE pair, so measure each distinct
# location once instead of once per row (the recorded run of the per-row version
# ended in a KeyboardInterrupt).
station_coords = df[["LONGITUDE", "LATITUDE"]].drop_duplicates()
station_points = gpd.points_from_xy(station_coords["LONGITUDE"], station_coords["LATITUDE"])

# Distance from each location to its nearest geometry in the shapefile.
# NOTE(review): the result is expressed in the shapefile's coordinate units; if
# that layer is stored in degrees, reproject both sides to a metric CRS before
# measuring - TODO confirm the layer's CRS.
station_coords = station_coords.assign(
    dist_to_water=[water_bodies.distance(point).min() for point in station_points]
)

# Broadcast the per-location distance back to every row. A left merge keeps the
# row order of df, and to_numpy() avoids index alignment on assignment.
df["dist_to_water"] = (
    df[["LONGITUDE", "LATITUDE"]]
    .merge(station_coords, on=["LONGITUDE", "LATITUDE"], how="left")["dist_to_water"]
    .to_numpy()
)


KeyboardInterrupt: 

In [8]:
# Preview the frame after the feature-engineering steps above.
df

Unnamed: 0,Station_Names,Year,Month,Max_Temp,Min_Temp,Rainfall,Relative_Humidity,Wind_Speed,Cloud_Coverage,Bright_Sunshine,Station_Number,LATITUDE,LONGITUDE,ALT,Period,Flood,Avg_Temp
0,Barisal,1949,1,29.4,12.3,0.0,68.0,0.453704,0.6,7.831915,41950,22.70,90.36,4,1949.01,0,20.85
1,Barisal,1949,2,33.9,15.2,9.0,63.0,0.659259,0.9,8.314894,41950,22.70,90.36,4,1949.02,0,24.55
2,Barisal,1949,3,36.7,20.2,8.0,59.0,1.085185,1.5,8.131915,41950,22.70,90.36,4,1949.03,0,28.45
3,Barisal,1949,4,33.9,23.9,140.0,71.0,1.772222,3.9,8.219149,41950,22.70,90.36,4,1949.04,0,28.90
4,Barisal,1949,5,35.6,25.0,217.0,76.0,1.703704,4.1,7.046809,41950,22.70,90.36,4,1949.05,0,30.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20539,Teknaf,2013,8,32.5,25.4,851.0,88.0,2.500000,6.2,3.800000,41998,20.87,92.26,4,2013.08,1,28.95
20540,Teknaf,2013,9,32.6,25.7,329.0,88.0,3.000000,6.1,4.200000,41998,20.87,92.26,4,2013.09,0,29.15
20541,Teknaf,2013,10,33.1,24.8,271.0,85.0,2.500000,4.4,5.600000,41998,20.87,92.26,4,2013.10,0,28.95
20542,Teknaf,2013,11,32.5,20.0,0.0,79.0,2.100000,1.7,8.400000,41998,20.87,92.26,4,2013.11,0,26.25


**Select the best features**

In [12]:
# Candidate predictors: every column except the target ("Flood") and the
# "Station_Names" and "Period" columns.
X = df.drop(columns=["Flood", "Station_Names", "Period"])
y = df["Flood"]  # target variable

# Seed the forest so that its importances, and therefore the selected features,
# are the same on every run (the train/test split uses seed 42 as well).
clf = RandomForestClassifier(random_state=42)

# Recursive feature elimination down to 4 features, ranked by the forest.
rfe = RFE(estimator=clf, n_features_to_select=4)

# NOTE(review): the selector is fitted on all rows, including those that later
# land in the test split - consider fitting it on the training portion only.
rfe.fit(X, y)

# Names of the columns that RFE kept.
selected_features = X.columns[rfe.support_]

print("Selected Features:")
print(selected_features)

Selected Features:
Index(['Rainfall', 'Cloud_Coverage', 'Bright_Sunshine', 'Station_Number'], dtype='object')


In [18]:
# Target column, appended to each candidate feature list.
target_column = ["Flood"]

# Option A: the four columns printed by the RFE cell, plus the target.
train_features_set = [
    "Rainfall",
    "Cloud_Coverage",
    "Bright_Sunshine",
    "Station_Number",
] + target_column

# Option B: rainfall, temperature, humidity, wind and month columns, plus the target.
train_features_set_one = [
    "Rainfall",
    "Max_Temp",
    "Min_Temp",
    "Relative_Humidity",
    "Wind_Speed",
    "Month",
] + target_column

We can now train the model using the selected features. Note that the split below uses `train_features_set_one` rather than the RFE-based `train_features_set`.

## Split the data

In [19]:
# Keep only the chosen feature columns together with the target.
modelling_data = df[train_features_set_one]

# Hold out 20% of the rows, with a fixed seed and stratified on the Flood column.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": modelling_data["Flood"],
}
train, test = train_test_split(modelling_data, **split_options)

In [15]:
# Write each split to the CSV path configured for it, without the index column.
outputs = {"train_data_path": train, "test_data_path": test}
for path_key, frame in outputs.items():
    frame.to_csv(config["data"][path_key], index=False)