In [4]:
import numpy as np
import pandas as pd
import joblib
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from predicting_forest_fires.data.custom import (
    load_data_or_schema,
    optimum_k,
    hybrid_cluster_outlier_removal,
    embedded_feature_selection,
    filter_feature_selection,
    binary_encoding,
    geographical_encoding,
    feature_binning,
    preprocess_training_set,
    preprocess_test_set
)
from predicting_forest_fires.config.config import (
    RAW_DATA,
    MODEL_TRAINING_DATA,
    MODEL_VALIDATION_DATA,
    EVALUATION_TEST_DATA,
)

In [5]:
df, _ = load_data_or_schema(RAW_DATA)
df

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,F
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,F
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,F
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,F
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,T
513,2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,T
514,7,4,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,T
515,1,4,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,T


In [6]:
df_encoded = binary_encoding(df, features=["area"])
df_encoded

  X[feature] = X[feature].replace({"T": 1, "F": 0})


Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,1
513,2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,1
514,7,4,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,1
515,1,4,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,1


In [7]:
df_geo_encoded = geographical_encoding(df_encoded, spatial_features=["X", "Y"])
df_geo_encoded

Unnamed: 0,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,grid_zone
0,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0,7_5
1,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0,7_4
2,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0,7_4
3,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0,8_6
4,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0,8_6
...,...,...,...,...,...,...,...,...,...,...,...,...
512,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,1,4_3
513,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,1,2_4
514,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,1,7_4
515,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,1,1_4


In [8]:
df_binned = feature_binning(df_geo_encoded, features=["rain"])
df_binned

Unnamed: 0,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,area,grid_zone,rain_binned
0,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,7_5,0
1,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0,7_4,0
2,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0,7_4,0
3,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0,8_6,1
4,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0,8_6,0
...,...,...,...,...,...,...,...,...,...,...,...,...
512,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,1,4_3,0
513,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,1,2_4,0
514,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,1,7_4,0
515,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,1,1_4,0


In [9]:
target = "area"
features = df_binned.columns[df_binned.columns != target]
X = df_binned[features]
y = df_binned[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
y_test

420    1
195    0
12     0
391    1
348    1
      ..
42     0
259    1
0      0
362    1
165    0
Name: area, Length: 104, dtype: int64

In [10]:
df.shape

(517, 12)

In [11]:
df_binned

Unnamed: 0,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,area,grid_zone,rain_binned
0,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,7_5,0
1,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0,7_4,0
2,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0,7_4,0
3,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0,8_6,1
4,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0,8_6,0
...,...,...,...,...,...,...,...,...,...,...,...,...
512,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,1,4_3,0
513,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,1,2_4,0
514,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,1,7_4,0
515,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,1,1_4,0


In [12]:
df_binned.dtypes

month           object
day             object
FFMC           float64
DMC            float64
DC             float64
ISI            float64
temp           float64
RH               int64
wind           float64
area             int64
grid_zone       object
rain_binned      int64
dtype: object

In [13]:
log_transform_features = ["ISI"]
modified_log_transform_features = ["FFMC"]
numerical_features = [
    feature
    for feature in list(df_binned.select_dtypes(exclude=object).columns)
    if feature not in ["area", "rain_binned"]
]
categorical_features = list(df_binned.select_dtypes(include=object).columns)
categorical_features

['month', 'day', 'grid_zone']

In [14]:
X_train_processed, preprocessor = preprocess_training_set(
    pd.concat([X_train, y_train], axis=1),
    log_transform_features=log_transform_features,
    modified_log_transform_features=modified_log_transform_features,
    numerical_features=numerical_features,
    categorical_features=categorical_features
)
X_train_processed

Unnamed: 0,ISI_log_transformed,FFMC_modified_log_transformed,FFMC,DMC,DC,ISI,temp,RH,wind,month_apr,...,grid_zone_7_6,grid_zone_8_3,grid_zone_8_5,grid_zone_8_6,grid_zone_9_4,grid_zone_9_5,grid_zone_9_6,grid_zone_9_9,rain_binned,area
0,2.433613,4.744062,1.384615,0.343152,-0.324991,0.465116,0.197183,0.190476,-0.818182,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.079442,4.705920,-0.269231,0.266568,0.083428,-0.325581,-0.225352,-0.095238,-0.818182,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.902108,4.694096,-0.769231,-1.222386,-2.240425,-0.627907,-0.183099,-0.047619,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2.054124,4.704110,-0.346154,1.262150,-0.081532,-0.372093,-0.436620,1.047619,-0.590909,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2.944439,4.730921,0.807692,0.195876,-1.191885,2.232558,0.478873,-0.047619,2.454545,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,2.091864,4.684905,-1.153846,-0.244477,0.172165,-0.302326,-0.971831,1.523810,1.045455,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
409,1.987874,4.712229,0.000000,2.061856,0.338642,-0.488372,0.154930,0.714286,-0.818182,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
410,1.987874,4.699571,-0.538462,-0.129602,-0.126280,-0.488372,0.211268,-0.380952,-0.590909,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
411,2.140066,4.711330,-0.038462,1.911635,0.250664,-0.209302,-0.225352,1.142857,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [15]:
X_train

Unnamed: 0,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,grid_zone,rain_binned
169,aug,thu,95.2,131.7,578.8,10.4,20.7,45,2.2,8_6,0
173,sep,mon,90.9,126.5,686.5,7.0,17.7,39,2.2,4_4,0
471,may,fri,89.6,25.4,73.7,5.7,18.0,40,4.0,4_3,0
381,aug,thu,90.7,194.1,643.0,6.8,16.2,63,2.7,8_6,0
475,jun,thu,93.7,121.7,350.2,18.0,22.7,40,9.4,2_5,0
...,...,...,...,...,...,...,...,...,...,...,...
217,sep,mon,88.6,91.8,709.9,7.1,12.4,73,6.3,1_3,0
425,aug,thu,91.6,248.4,753.8,6.3,20.4,56,2.2,2_2,0
63,aug,sun,90.2,99.6,631.2,6.3,20.8,33,2.7,2_2,0
452,aug,mon,91.5,238.2,730.6,7.5,17.7,65,4.0,7_4,0


In [16]:
X_test

Unnamed: 0,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,grid_zone,rain_binned
420,aug,wed,91.7,191.4,635.9,7.8,26.2,36,4.5,8_8,0
195,aug,fri,93.9,135.7,586.7,15.1,23.5,36,5.4,2_5,0
12,aug,fri,63.5,70.8,665.3,0.8,17.0,72,6.7,6_5,0
391,sep,fri,91.1,91.3,738.1,7.2,20.7,46,2.7,8_6,0
348,sep,fri,92.1,99.0,745.3,9.6,17.4,57,4.5,3_4,0
...,...,...,...,...,...,...,...,...,...,...,...
42,aug,tue,94.8,108.3,647.1,17.0,16.6,54,5.4,4_4,0
259,aug,sat,91.8,170.9,692.3,13.7,23.7,40,1.8,7_4,0
0,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,7_5,0
362,sep,fri,88.2,55.2,732.3,11.6,15.2,64,3.1,7_4,0


In [17]:
X_test_processed = preprocess_test_set(
    pd.concat([X_test, y_test], axis=1),
    preprocessor=preprocessor
)

In [18]:
X_test_processed

Unnamed: 0,ISI_log_transformed,FFMC_modified_log_transformed,FFMC,DMC,DC,ISI,temp,RH,wind,month_apr,...,grid_zone_7_6,grid_zone_8_3,grid_zone_8_5,grid_zone_8_6,grid_zone_9_4,grid_zone_9_5,grid_zone_9_6,grid_zone_9_9,rain_binned,area
0,2.174752,4.713127,0.038462,1.222386,-0.108457,-0.139535,0.971831,-0.238095,0.227273,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2.778819,4.732684,0.884615,0.402062,-0.295032,1.558140,0.591549,-0.238095,0.636364,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.587787,4.421247,-10.807692,-0.553756,0.003034,-1.767442,-0.323944,1.476190,1.227273,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.104134,4.707727,-0.192308,-0.251841,0.279105,-0.279070,0.197183,0.238095,-0.590909,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2.360854,4.716712,0.192308,-0.138439,0.306409,0.279070,-0.267606,0.761905,0.227273,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,2.890372,4.740575,1.230769,-0.001473,-0.065984,2.000000,-0.380282,0.619048,0.636364,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,2.687847,4.714025,0.076923,0.920471,0.105423,1.232558,0.619718,-0.047619,-1.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
101,1.808289,4.662495,-2.076923,-1.210604,-2.162306,-0.767442,-1.563380,0.476190,1.227273,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102,2.533697,4.681205,-1.307692,-0.783505,0.257110,0.744186,-0.577465,1.095238,-0.409091,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [19]:
_, kmeans_model = optimum_k(data=X_train_processed)
X_train_outliers, _ = hybrid_cluster_outlier_removal(
    data=X_train_processed,
    kmeans_model=kmeans_model,
    dbscan_eps=3.5,
    dbscan_min_samples=2,
)

Number of data points: 413
Number of outliers detected: 5
Percentage of outliers: 1.21%


In [20]:
X_train_indices = X_train_outliers.index
X_train_trimmed = X_train_outliers.loc[X_train_indices].reset_index(drop=True)

In [21]:
X_train_processed

Unnamed: 0,ISI_log_transformed,FFMC_modified_log_transformed,FFMC,DMC,DC,ISI,temp,RH,wind,month_apr,...,grid_zone_8_5,grid_zone_8_6,grid_zone_9_4,grid_zone_9_5,grid_zone_9_6,grid_zone_9_9,rain_binned,area,Cluster,DBSCAN_Outlier
0,2.433613,4.744062,1.384615,0.343152,-0.324991,0.465116,0.197183,0.190476,-0.818182,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3,False
1,2.079442,4.705920,-0.269231,0.266568,0.083428,-0.325581,-0.225352,-0.095238,-0.818182,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,False
2,1.902108,4.694096,-0.769231,-1.222386,-2.240425,-0.627907,-0.183099,-0.047619,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6,False
3,2.054124,4.704110,-0.346154,1.262150,-0.081532,-0.372093,-0.436620,1.047619,-0.590909,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,7,False
4,2.944439,4.730921,0.807692,0.195876,-1.191885,2.232558,0.478873,-0.047619,2.454545,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,2.091864,4.684905,-1.153846,-0.244477,0.172165,-0.302326,-0.971831,1.523810,1.045455,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,False
409,1.987874,4.712229,0.000000,2.061856,0.338642,-0.488372,0.154930,0.714286,-0.818182,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7,False
410,1.987874,4.699571,-0.538462,-0.129602,-0.126280,-0.488372,0.211268,-0.380952,-0.590909,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,False
411,2.140066,4.711330,-0.038462,1.911635,0.250664,-0.209302,-0.225352,1.142857,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7,False


In [22]:
X_train_trimmed

Unnamed: 0,ISI_log_transformed,FFMC_modified_log_transformed,FFMC,DMC,DC,ISI,temp,RH,wind,month_apr,...,grid_zone_7_6,grid_zone_8_3,grid_zone_8_5,grid_zone_8_6,grid_zone_9_4,grid_zone_9_5,grid_zone_9_6,grid_zone_9_9,rain_binned,area
0,2.433613,4.744062,1.384615,0.343152,-0.324991,0.465116,0.197183,0.190476,-0.818182,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.079442,4.705920,-0.269231,0.266568,0.083428,-0.325581,-0.225352,-0.095238,-0.818182,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.902108,4.694096,-0.769231,-1.222386,-2.240425,-0.627907,-0.183099,-0.047619,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2.054124,4.704110,-0.346154,1.262150,-0.081532,-0.372093,-0.436620,1.047619,-0.590909,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2.944439,4.730921,0.807692,0.195876,-1.191885,2.232558,0.478873,-0.047619,2.454545,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,2.091864,4.684905,-1.153846,-0.244477,0.172165,-0.302326,-0.971831,1.523810,1.045455,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
404,1.987874,4.712229,0.000000,2.061856,0.338642,-0.488372,0.154930,0.714286,-0.818182,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
405,1.987874,4.699571,-0.538462,-0.129602,-0.126280,-0.488372,0.211268,-0.380952,-0.590909,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
406,2.140066,4.711330,-0.038462,1.911635,0.250664,-0.209302,-0.225352,1.142857,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [23]:
X_train_embedded_reduced, _ = embedded_feature_selection(X_train_trimmed, target="area")
print(X_train_embedded_reduced.shape)


Important Logistic Regression Features: ['ISI_log_transformed', 'DMC', 'month_apr', 'month_aug', 'month_dec', 'month_feb', 'month_jan', 'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_oct', 'month_sep', 'grid_zone_1_2', 'grid_zone_1_3', 'grid_zone_1_5', 'grid_zone_2_2', 'grid_zone_2_3', 'grid_zone_2_4', 'grid_zone_3_3', 'grid_zone_3_5', 'grid_zone_3_6', 'grid_zone_4_6', 'grid_zone_5_5', 'grid_zone_5_6', 'grid_zone_6_5', 'grid_zone_6_6', 'grid_zone_8_3', 'grid_zone_9_5', 'grid_zone_9_9', 'rain_binned']
Important Random Forest Features: ['ISI_log_transformed', 'FFMC_modified_log_transformed', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'month_apr', 'month_aug', 'month_jul', 'month_jun', 'month_mar', 'month_oct', 'month_sep', 'day_fri', 'day_mon', 'day_sat', 'day_sun', 'day_thu', 'day_tue', 'day_wed', 'grid_zone_1_2', 'grid_zone_2_2', 'grid_zone_2_4', 'grid_zone_3_4', 'grid_zone_4_4', 'grid_zone_6_3', 'grid_zone_6_5', 'grid_zone_8_6']
(408, 50)


In [24]:
X_train_filter_reduced, optimized_features = filter_feature_selection(X_train_embedded_reduced)

  vif = 1. / (1. - r_squared_i)


In [25]:
print(X_train_embedded_reduced.shape)
print(X_train_filter_reduced.shape)

(408, 50)
(408, 27)


In [26]:
l = [i for i in X_train_filter_reduced.columns]
l

['grid_zone_8_3',
 'rain_binned',
 'grid_zone_2_3',
 'grid_zone_3_6',
 'grid_zone_9_9',
 'grid_zone_1_5',
 'grid_zone_3_5',
 'grid_zone_6_6',
 'grid_zone_9_5',
 'grid_zone_3_4',
 'grid_zone_5_6',
 'grid_zone_5_5',
 'grid_zone_2_2',
 'temp',
 'RH',
 'grid_zone_4_4',
 'DMC',
 'grid_zone_8_6',
 'grid_zone_4_6',
 'grid_zone_2_4',
 'wind',
 'grid_zone_1_2',
 'grid_zone_6_3',
 'grid_zone_6_5',
 'grid_zone_1_3',
 'grid_zone_3_3',
 'area']

In [27]:
X_train_filter_reduced.dtypes

grid_zone_8_3    float64
rain_binned      float64
grid_zone_2_3    float64
grid_zone_3_6    float64
grid_zone_9_9    float64
grid_zone_1_5    float64
grid_zone_3_5    float64
grid_zone_6_6    float64
grid_zone_9_5    float64
grid_zone_3_4    float64
grid_zone_5_6    float64
grid_zone_5_5    float64
grid_zone_2_2    float64
temp             float64
RH               float64
grid_zone_4_4    float64
DMC              float64
grid_zone_8_6    float64
grid_zone_4_6    float64
grid_zone_2_4    float64
wind             float64
grid_zone_1_2    float64
grid_zone_6_3    float64
grid_zone_6_5    float64
grid_zone_1_3    float64
grid_zone_3_3    float64
area             float64
dtype: object

In [28]:
X_test_reduced = X_test_processed[optimized_features]
X_test_reduced

Unnamed: 0,grid_zone_8_3,rain_binned,grid_zone_2_3,grid_zone_3_6,grid_zone_9_9,grid_zone_1_5,grid_zone_3_5,grid_zone_6_6,grid_zone_9_5,grid_zone_3_4,...,grid_zone_8_6,grid_zone_4_6,grid_zone_2_4,wind,grid_zone_1_2,grid_zone_6_3,grid_zone_6_5,grid_zone_1_3,grid_zone_3_3,area
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.227273,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.636364,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.227273,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,-0.590909,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.227273,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.636364,0.0,0.0,0.0,0.0,0.0,0.0
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-1.000000,0.0,0.0,0.0,0.0,0.0,1.0
101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.227273,0.0,0.0,0.0,0.0,0.0,0.0
102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.409091,0.0,0.0,0.0,0.0,0.0,1.0


In [29]:
optimized_features

['grid_zone_8_3',
 'rain_binned',
 'grid_zone_2_3',
 'grid_zone_3_6',
 'grid_zone_9_9',
 'grid_zone_1_5',
 'grid_zone_3_5',
 'grid_zone_6_6',
 'grid_zone_9_5',
 'grid_zone_3_4',
 'grid_zone_5_6',
 'grid_zone_5_5',
 'grid_zone_2_2',
 'temp',
 'RH',
 'grid_zone_4_4',
 'DMC',
 'grid_zone_8_6',
 'grid_zone_4_6',
 'grid_zone_2_4',
 'wind',
 'grid_zone_1_2',
 'grid_zone_6_3',
 'grid_zone_6_5',
 'grid_zone_1_3',
 'grid_zone_3_3',
 'area']