# Feature Engineering
This notebook creates the features used for modelling, selects the most informative ones, and saves the resulting train/test splits.

## Import Libraries

In [2]:
# Run the notebook from the project root (presumably so that the `src` package
# imported in the next cell can be found -- confirm).
import os

# The root used to be hard-coded to one developer's machine. It can now be
# overridden with the FLOOD_PREDICTION_ROOT environment variable; the original
# location stays the default so existing setups keep working unchanged.
PROJECT_ROOT = os.environ.get("FLOOD_PREDICTION_ROOT", 'c:\\Users\\Spectra\\flood-prediction')

os.chdir(PROJECT_ROOT)
os.getcwd()

'c:\\Users\\Spectra\\flood-prediction'

In [22]:
import pandas as pd
import numpy as np
from src.utils import load_config
from sklearn.feature_selection import RFE 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


In [27]:
# Load the project configuration through the helper imported from src.utils.
# The result is indexed as config["data"][...] in later cells to look up the
# input and output file paths.
config = load_config()


## Load the data

In [5]:
# Read the processed dataset from the location given in the config.
processed_data_path = config["data"]["processed_data_path"]
df = pd.read_csv(processed_data_path)

In [6]:
# Show the loaded frame; the rendered output abbreviates it to the first and last rows.
df

Unnamed: 0,Sl,Station_Names,Year,Month,Max_Temp,Min_Temp,Rainfall,Relative_Humidity,Wind_Speed,Cloud_Coverage,Bright_Sunshine,Station_Number,X_COR,Y_COR,LATITUDE,LONGITUDE,ALT,Period,Flood
0,0,Barisal,1949,1,29.4,12.3,0.0,68.0,0.453704,0.6,7.831915,41950,536809.8,510151.9,22.70,90.36,4,1949.01,0
1,1,Barisal,1949,2,33.9,15.2,9.0,63.0,0.659259,0.9,8.314894,41950,536809.8,510151.9,22.70,90.36,4,1949.02,0
2,2,Barisal,1949,3,36.7,20.2,8.0,59.0,1.085185,1.5,8.131915,41950,536809.8,510151.9,22.70,90.36,4,1949.03,0
3,3,Barisal,1949,4,33.9,23.9,140.0,71.0,1.772222,3.9,8.219149,41950,536809.8,510151.9,22.70,90.36,4,1949.04,0
4,4,Barisal,1949,5,35.6,25.0,217.0,76.0,1.703704,4.1,7.046809,41950,536809.8,510151.9,22.70,90.36,4,1949.05,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20539,20539,Teknaf,2013,8,32.5,25.4,851.0,88.0,2.500000,6.2,3.800000,41998,734765.4,308914.1,20.87,92.26,4,2013.08,1
20540,20540,Teknaf,2013,9,32.6,25.7,329.0,88.0,3.000000,6.1,4.200000,41998,734765.4,308914.1,20.87,92.26,4,2013.09,0
20541,20541,Teknaf,2013,10,33.1,24.8,271.0,85.0,2.500000,4.4,5.600000,41998,734765.4,308914.1,20.87,92.26,4,2013.10,0
20542,20542,Teknaf,2013,11,32.5,20.0,0.0,79.0,2.100000,1.7,8.400000,41998,734765.4,308914.1,20.87,92.26,4,2013.11,0


## Feature Engineering

**Drop highly correlated columns**

In [9]:
# Remove the columns that are not carried forward into modelling.
# The original in-place drop raised KeyError when the cell was executed a
# second time (the columns were already gone). errors="ignore" makes the cell
# safe to re-run, and re-assigning avoids inplace mutation.
df = df.drop(columns=["X_COR", "Y_COR", "Sl"], errors="ignore")

**Create Average Temperature**


In [10]:
# Average temperature: midpoint of the recorded maximum and minimum.
df["Avg_Temp"] = df["Max_Temp"].add(df["Min_Temp"]).div(2)

In [11]:
# Show the frame again to confirm the dropped columns are gone and Avg_Temp was added.
df

Unnamed: 0,Station_Names,Year,Month,Max_Temp,Min_Temp,Rainfall,Relative_Humidity,Wind_Speed,Cloud_Coverage,Bright_Sunshine,Station_Number,LATITUDE,LONGITUDE,ALT,Period,Flood,Avg_Temp
0,Barisal,1949,1,29.4,12.3,0.0,68.0,0.453704,0.6,7.831915,41950,22.70,90.36,4,1949.01,0,20.85
1,Barisal,1949,2,33.9,15.2,9.0,63.0,0.659259,0.9,8.314894,41950,22.70,90.36,4,1949.02,0,24.55
2,Barisal,1949,3,36.7,20.2,8.0,59.0,1.085185,1.5,8.131915,41950,22.70,90.36,4,1949.03,0,28.45
3,Barisal,1949,4,33.9,23.9,140.0,71.0,1.772222,3.9,8.219149,41950,22.70,90.36,4,1949.04,0,28.90
4,Barisal,1949,5,35.6,25.0,217.0,76.0,1.703704,4.1,7.046809,41950,22.70,90.36,4,1949.05,0,30.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20539,Teknaf,2013,8,32.5,25.4,851.0,88.0,2.500000,6.2,3.800000,41998,20.87,92.26,4,2013.08,1,28.95
20540,Teknaf,2013,9,32.6,25.7,329.0,88.0,3.000000,6.1,4.200000,41998,20.87,92.26,4,2013.09,0,29.15
20541,Teknaf,2013,10,33.1,24.8,271.0,85.0,2.500000,4.4,5.600000,41998,20.87,92.26,4,2013.10,0,28.95
20542,Teknaf,2013,11,32.5,20.0,0.0,79.0,2.100000,1.7,8.400000,41998,20.87,92.26,4,2013.11,0,26.25


**Select the best features**

In [19]:
# Separate features (X) and target variable (y)
X = df.drop(columns=["Flood", "Station_Names", "Period"])  # Features are all columns except "Flood"
y = df["Flood"]  # Target variable is "Flood"

# Initialize a classifier 
clf = RandomForestClassifier()

# Initialize RFE with the classifier and number of desired features
rfe = RFE(estimator=clf, n_features_to_select=6)  

# Fit RFE to your df
rfe.fit(X, y)

# Get the selected features
selected_features = X.columns[rfe.support_]

print("Selected Features:")
print(selected_features)

Selected Features:
Index(['Rainfall', 'Relative_Humidity', 'Cloud_Coverage', 'Bright_Sunshine',
       'Station_Number', 'LONGITUDE'],
      dtype='object')


In [25]:
# Columns kept for modelling: the chosen predictors followed by the target.
# NOTE(review): the RFE output shown above also lists 'LONGITUDE', which is not
# included here -- confirm the omission is intentional.
train_features_set = [
    "Rainfall",
    "Relative_Humidity",
    "Cloud_Coverage",
    "Bright_Sunshine",
    "Station_Number",
    "Flood",
]


We can now train the model using the selected set of features.

## Split the data

In [26]:
# Restrict the frame to the modelling columns (predictors + target).
modelling_data = df[train_features_set]

# 80/20 hold-out split with a fixed seed, stratified on the target column.
flood_labels = modelling_data["Flood"]
train, test = train_test_split(
    modelling_data,
    stratify=flood_labels,
    random_state=42,
    test_size=0.2,
)

In [28]:
# Write each split to the CSV location configured for it, without the index column.
for split_frame, path_key in ((train, "train_data_path"), (test, "test_data_path")):
    split_frame.to_csv(config["data"][path_key], index=False)