# Set Up
This section contains the import of the various libraries used

In [2]:
# Standard libraries
import pandas as pd
import os
import numpy as np

In [3]:
# From sklearn
# Data processing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Classification model
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Evaluation metrics
from sklearn import metrics
from sklearn.metrics import classification_report

In [19]:
# For serialization
import pickle

# Manage dataset
This section is used to read the various datasets and creat the chosen dataset

## Reading the dataset

In [13]:
# Import principal DataFrame
parameters = pd.read_csv('Data/training_parameters.csv')

# Display the principal DataFrame
parameters.head()

Unnamed: 0,SurfaceType,RainfallIntensity,init_max_hour,DrainageSystemCapacity,GreenSpaceRatio,ObservationIndex
0,D,5,3,0.11,0.11,0
1,B,5,4,0.2,0.29,1
2,A,2,5,0.15,0.25,2
3,A,2,4,0.25,0.25,3
4,C,4,5,0.16,0.18,4


In [217]:
# Import information about the town
edges = pd.read_csv('Data/edge_info.csv')

# Display information
edges.head()

Unnamed: 0,head_id,tail_id,longitude,latitude,altitude
0,151779659,153066427,-95.382821,29.79874,957.0
1,151779659,152426116,-95.383237,29.798445,921.2
2,151779659,152136099,-95.382354,29.797764,876.0
3,151779763,152437059,-95.380082,29.795224,897.0
4,151779763,152541741,-95.380394,29.796295,877.6


## Selecting features from all files
All files contain a lot of information, some of them redundant.
The information contained in the 'edge_info' file and the first two columns (head_id and tail_id) of the files in the training folder are the same for each acquisition, so I decided to delete them.

To highlight the 191 roads, I decided to name each input/output (flooded_init/flooded_final) for each road in a separate column.
This way I can also consider the interdependence between the roads.

### Concatenting the selected features in one dataset
Starting from the parameters set, I create two columns, one for flooded_init and the other for flooded_final, to fill with all the corresponding streets values.

In [223]:
# Concatenating flooded_init and flooded_final with the DataFrame
data_folder = 'Data/training'

# Create empty columns for each feature in parameters DataFrame
features = ['flooded_init', 'flooded_final']
for feature in features:
    parameters[feature] = None

for index, row in parameters.iterrows():
    file_path = os.path.join(data_folder, str(row['ObservationIndex']) + '.csv')

    # Assuming the file is a CSV with header and target column
    file_data = pd.read_csv(file_path)

    new_features = file_data[['flooded_init', 'flooded_final']].copy()

    new_features['flooded_init'] = new_features['flooded_init'].astype(int)
    new_features['flooded_final'] = new_features['flooded_final'].astype(int)

    # Assign list of values to respective columns in parameters DataFrame
    for feature in features:
        parameters.at[index, feature] = list(new_features[feature].values)

# Display the updated DataFrame
parameters.head()

Unnamed: 0,SurfaceType,RainfallIntensity,init_max_hour,DrainageSystemCapacity,GreenSpaceRatio,ObservationIndex,flooded_init,flooded_final
0,D,4,4,0.22,0.28,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,A,3,5,0.16,0.21,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
2,A,4,5,0.15,0.12,2,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, ..."
3,D,3,5,0.29,0.13,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,D,3,5,0.2,0.13,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."


### Transforming the dataset into matrix format
I convert the last two list colunms into separate columns to allow a correct use for model selection.

In [224]:
# Convert lists in flooded_init and flooded_final column to separate columns
flooded_init_df = pd.DataFrame(parameters['flooded_init'].tolist(), columns=[f'flooded_init_{i}' for i in range(len(parameters['flooded_init'][0]))])
flooded_final_df = pd.DataFrame(parameters['flooded_final'].tolist(), columns=[f'flooded_final_{i}' for i in range(len(parameters['flooded_final'][0]))])

# Display the convertion
flooded_init_df.head()

Unnamed: 0,flooded_init_0,flooded_init_1,flooded_init_2,flooded_init_3,flooded_init_4,flooded_init_5,flooded_init_6,flooded_init_7,flooded_init_8,flooded_init_9,...,flooded_init_181,flooded_init_182,flooded_init_183,flooded_init_184,flooded_init_185,flooded_init_186,flooded_init_187,flooded_init_188,flooded_init_189,flooded_init_190
0,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [225]:
# Concatenate the new columns with the original DataFrame
dataset = pd.concat([parameters, flooded_init_df, flooded_final_df], axis=1)

# Display the updated DataFrame
dataset.head()

Unnamed: 0,SurfaceType,RainfallIntensity,init_max_hour,DrainageSystemCapacity,GreenSpaceRatio,ObservationIndex,flooded_init,flooded_final,flooded_init_0,flooded_init_1,...,flooded_final_181,flooded_final_182,flooded_final_183,flooded_final_184,flooded_final_185,flooded_final_186,flooded_final_187,flooded_final_188,flooded_final_189,flooded_final_190
0,D,4,4,0.22,0.28,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,...,1,1,0,0,0,0,0,0,0,0
1,A,3,5,0.16,0.21,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",0,0,...,0,0,0,0,0,0,0,0,0,1
2,A,4,5,0.15,0.12,2,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, ...",0,0,...,0,0,0,0,0,1,0,0,0,0
3,D,3,5,0.29,0.13,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,...,0,0,0,0,0,0,0,0,0,0
4,D,3,5,0.2,0.13,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...",0,0,...,1,1,0,0,0,0,0,1,0,0


### Dropping columns

In [226]:
# Drop the repetitive and unnecessary column
dataset.drop('flooded_init', axis=1, inplace=True)
dataset.drop('flooded_final', axis=1, inplace=True)
dataset.drop('ObservationIndex', axis=1, inplace=True)

# Display the updated DataFrame
dataset.head()

Unnamed: 0,SurfaceType,RainfallIntensity,init_max_hour,DrainageSystemCapacity,GreenSpaceRatio,flooded_init_0,flooded_init_1,flooded_init_2,flooded_init_3,flooded_init_4,...,flooded_final_181,flooded_final_182,flooded_final_183,flooded_final_184,flooded_final_185,flooded_final_186,flooded_final_187,flooded_final_188,flooded_final_189,flooded_final_190
0,D,4,4,0.22,0.28,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
1,A,3,5,0.16,0.21,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,A,4,5,0.15,0.12,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,D,3,5,0.29,0.13,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,D,3,5,0.2,0.13,0,0,0,0,0,...,1,1,0,0,0,0,0,1,0,0


### Saving the dataset

In [227]:
# Save the created dataset
dataset.to_csv('Data/dataset.csv', index=False)

## Transforming the categorical features into numericals
For proper use of the classification model I transform the categorical feature 'surface' into numerical, in particular I separate in each column a surface type to give a different weight to them.

In [228]:
# OneHot encode the 'SurfaceType' column
one_hot_encoded = pd.get_dummies(dataset['SurfaceType'], prefix='Surface', dtype=int)
dataset = dataset.drop('SurfaceType', axis=1)
dataset = pd.concat([one_hot_encoded, dataset], axis=1)

# Display the final DataFrame
dataset.head()

Unnamed: 0,Surface_A,Surface_B,Surface_C,Surface_D,RainfallIntensity,init_max_hour,DrainageSystemCapacity,GreenSpaceRatio,flooded_init_0,flooded_init_1,...,flooded_final_181,flooded_final_182,flooded_final_183,flooded_final_184,flooded_final_185,flooded_final_186,flooded_final_187,flooded_final_188,flooded_final_189,flooded_final_190
0,0,0,0,1,4,4,0.22,0.28,0,0,...,1,1,0,0,0,0,0,0,0,0
1,1,0,0,0,3,5,0.16,0.21,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,0,0,0,4,5,0.15,0.12,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,1,3,5,0.29,0.13,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,3,5,0.2,0.13,0,0,...,1,1,0,0,0,0,0,1,0,0


## Checking unknown values

In [229]:
#Check if there is unknown
unknown = dataset.isnull().sum()

unknown

Surface_A            0
Surface_B            0
Surface_C            0
Surface_D            0
RainfallIntensity    0
                    ..
flooded_final_186    0
flooded_final_187    0
flooded_final_188    0
flooded_final_189    0
flooded_final_190    0
Length: 390, dtype: int64

## Scaling the whole dataset
I scale the whole dataset to consider each features in the same range. I use MinMaxScaler because there are many boolean features.

In [22]:
# Scale the whole DataFrame with MinMaxScaler
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(dataset)
data_scaled = pd.DataFrame(data_scaled)

data_scaled.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,380,381,382,383,384,385,386,387,388,389
0,0.0,0.0,0.0,1.0,1.0,0.0,0.05,0.05,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,1.0,0.5,0.5,0.95,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.997012,1.0,0.25,0.75,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.997012,0.5,0.75,0.75,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.999004,1.0,0.3,0.4,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.997012,0.0,0.85,0.8,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,1.0,0.0,0.997012,0.0,0.3,0.65,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
7,0.0,0.0,1.0,0.0,0.999004,1.0,0.7,0.55,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,1.0,0.998008,0.5,0.5,0.7,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,1.0,0.0,0.997012,1.0,0.75,0.55,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


## Saving the final numerical and scaled datset

In [23]:
# Save the scaled dataset               
data_scaled.to_csv('Data/data_scaled.csv', index=False)

# Data preparation
This section is used to split the dataset.

## Separating train and test sets
To evaluate the performance of the generated model I divide the scaled dataset into training and test set.

In [8]:
# First 199 attributes are part of the input
X = data_scaled.iloc[:,:199]

# Last 191 attributes are the target
Y = data_scaled.iloc[:,199:]

# Separate train/test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

# Display size of train/test sets
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

# Display training set
X_train

(2250, 199) (750, 199) (2250, 191) (750, 191)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,189,190,191,192,193,194,195,196,197,198
2384,1.0,0.0,0.0,0.0,0.998008,1.0,0.75,0.75,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2719,1.0,0.0,0.0,0.0,0.999004,0.0,1.00,0.90,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1366,0.0,0.0,1.0,0.0,0.999004,0.5,0.15,0.95,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
107,0.0,1.0,0.0,0.0,0.998008,1.0,0.90,0.30,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2916,0.0,0.0,1.0,0.0,1.000000,0.5,0.75,0.20,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559,0.0,0.0,0.0,1.0,0.999004,0.0,0.30,0.05,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1804,1.0,0.0,0.0,0.0,0.998008,0.0,0.70,0.10,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1399,0.0,1.0,0.0,0.0,0.999004,0.0,0.50,0.70,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
291,0.0,1.0,0.0,0.0,0.998008,0.5,0.65,0.35,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Model selection
This section is created to evaluate most significant models and check their performance.
For random forest and support vector machien I used a multioutput classifier due to the target matrix compose by 191 element to predict.

## Random forest

In [83]:
# Define the classifier and the parameters grid
base_classifier = RandomForestClassifier()
multi_output_classifier = MultiOutputClassifier(base_classifier)

pipeline = Pipeline([('classifier', multi_output_classifier)])
param_grid = {
    'classifier__estimator__n_estimators': [100, 200, 300],
    'classifier__estimator__criterion': ['entropy', 'gini'],
    'classifier__estimator__max_depth': [50, 100],
    'classifier__estimator__min_samples_split': [4, 5, 10],
    'classifier__estimator__min_samples_leaf': [4, 5, 10],
}

gs_rf = GridSearchCV(pipeline,
                  param_grid,
                  cv=3,
                  scoring = 'f1_macro',
                  verbose=50,
                  n_jobs=-1,
                  refit=True)

# Train classifiers
gs_rf.fit(X_train, Y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [84]:
# Summarize the results of the gridsearch
print('***GRIDSEARCH RESULTS***')

print("Best score: %f using %s" % (gs_rf.best_score_, gs_rf.best_params_))
means = gs_rf.cv_results_['mean_test_score']
stds = gs_rf.cv_results_['std_test_score']
params = gs_rf.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

***GRIDSEARCH RESULTS***
Best score: 0.495368 using {'classifier__estimator__criterion': 'entropy', 'classifier__estimator__max_depth': 100, 'classifier__estimator__min_samples_leaf': 4, 'classifier__estimator__min_samples_split': 4, 'classifier__estimator__n_estimators': 300}
0.491242 (0.015389) with: {'classifier__estimator__criterion': 'entropy', 'classifier__estimator__max_depth': 50, 'classifier__estimator__min_samples_leaf': 4, 'classifier__estimator__min_samples_split': 4, 'classifier__estimator__n_estimators': 100}
0.493236 (0.012713) with: {'classifier__estimator__criterion': 'entropy', 'classifier__estimator__max_depth': 50, 'classifier__estimator__min_samples_leaf': 4, 'classifier__estimator__min_samples_split': 4, 'classifier__estimator__n_estimators': 200}
0.493160 (0.014371) with: {'classifier__estimator__criterion': 'entropy', 'classifier__estimator__max_depth': 50, 'classifier__estimator__min_samples_leaf': 4, 'classifier__estimator__min_samples_split': 4, 'classifier__

In [85]:
# Predict on the test set
best_model_rf = gs_rf.best_estimator_

predictions_rf = best_model_rf.predict(X_test)

In [86]:
# Evaluate predictions
metric = classification_report(Y_test, predictions_rf)

# Display metrics
print(metric)

              precision    recall  f1-score   support

           0       0.97      0.32      0.48       123
           1       1.00      0.48      0.65       149
           2       0.98      0.34      0.50       151
           3       0.96      0.42      0.58       122
           4       0.92      0.34      0.50       106
           5       0.97      0.30      0.46       102
           6       0.95      0.37      0.54       153
           7       0.95      0.32      0.47       117
           8       0.98      0.38      0.55       111
           9       0.92      0.46      0.62       175
          10       0.97      0.31      0.47       181
          11       0.94      0.42      0.58       153
          12       0.98      0.53      0.69       151
          13       0.98      0.32      0.48       165
          14       0.98      0.36      0.52       152
          15       0.97      0.56      0.71       132
          16       0.95      0.44      0.60       131
          17       0.96    

  _warn_prf(average, modifier, msg_start, len(result))


In [212]:
# Calculate the AUC
Y_probs_rf = best_model_rf.predict_proba(X_test)
Y_probs_array = np.array(Y_probs_rf)
Y_test_array = np.array(Y_test)
Y_test_trans = Y_test_array.T

auc = metrics.roc_auc_score(Y_test_trans,
                            Y_probs_array[:,:,1],
                            multi_class='ovr',
                            average='weighted')

# Display the AUC value
print('AUC: %.2f' % auc)

AUC: 0.95


## Neural network
I also tryed with more then 3 layers, but the results are lower than just 3.

In [9]:
# Define the classifier and the parameters grid
classifier = MLPClassifier()
parameters = {"hidden_layer_sizes":[(199, 192, 191),( 199, 191, 191)],
              "max_iter": [500000],
              "alpha": [0.1, 1, 10]}

gs_nn = GridSearchCV(classifier,
                  parameters,
                  cv=3,
                  scoring = 'f1_macro',
                  verbose=50,
                  n_jobs=-1,
                  refit=True)

# Train classifiers
gs_nn.fit(X_train, Y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


In [11]:
# Summarize the results of your gridsearch
print('***GRIDSEARCH RESULTS***')

print("Best score: %f using %s" % (gs_nn.best_score_, gs_nn.best_params_))
means = gs_nn.cv_results_['mean_test_score']
stds = gs_nn.cv_results_['std_test_score']
params = gs_nn.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

***GRIDSEARCH RESULTS***
Best score: 0.760384 using {'alpha': 1, 'hidden_layer_sizes': (199, 191, 191), 'max_iter': 500000}
0.743149 (0.002946) with: {'alpha': 0.1, 'hidden_layer_sizes': (199, 192, 191), 'max_iter': 500000}
0.744316 (0.002057) with: {'alpha': 0.1, 'hidden_layer_sizes': (199, 191, 191), 'max_iter': 500000}
0.756603 (0.003486) with: {'alpha': 1, 'hidden_layer_sizes': (199, 192, 191), 'max_iter': 500000}
0.760384 (0.001795) with: {'alpha': 1, 'hidden_layer_sizes': (199, 191, 191), 'max_iter': 500000}
0.628215 (0.015321) with: {'alpha': 10, 'hidden_layer_sizes': (199, 192, 191), 'max_iter': 500000}
0.602116 (0.005063) with: {'alpha': 10, 'hidden_layer_sizes': (199, 191, 191), 'max_iter': 500000}


In [12]:
# Predict on the test set
best_model_nn = gs_nn.best_estimator_
predictions_nn = best_model_nn.predict(X_test)

In [13]:
# Evaluate predictions
metric = classification_report(Y_test, predictions_nn)

# Display metrics
print(metric)

              precision    recall  f1-score   support

           0       0.78      0.71      0.75       111
           1       0.81      0.84      0.83       132
           2       0.84      0.79      0.81       141
           3       0.77      0.77      0.77       124
           4       0.80      0.80      0.80       123
           5       0.75      0.82      0.78       111
           6       0.74      0.82      0.78       147
           7       0.79      0.73      0.76       112
           8       0.78      0.81      0.79       124
           9       0.75      0.82      0.78       165
          10       0.81      0.81      0.81       158
          11       0.81      0.85      0.83       164
          12       0.84      0.81      0.83       163
          13       0.81      0.81      0.81       156
          14       0.77      0.76      0.76       147
          15       0.83      0.74      0.78       141
          16       0.80      0.81      0.81       135
          17       0.81    

  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# Calculate the AUC
Y_probs_nn = best_model_nn.predict_proba(X_test)

auc = metrics.roc_auc_score(Y_test,
                            Y_probs_nn,
                            multi_class="ovr",
                            average="weighted")

# Display the AUC value
print('AUC: %.2f' % auc)

AUC: 0.97


## Support vector machine
I also tryed with rbf kernel, but the results are lower than others.

In [94]:
# Define the classifier and the parameters grid
base_classifier = SVC(probability=True)
multi_output_classifier = MultiOutputClassifier(base_classifier)

pipeline = Pipeline([('classifier', multi_output_classifier)])
param_grid = {
    'classifier__estimator__kernel': ['linear', 'poly'],
    'classifier__estimator__C': [.1, 1],
    'classifier__estimator__gamma': [1],
    'classifier__estimator__degree': [2, 3],
}

gs_svm = GridSearchCV(pipeline,
                      param_grid,
                      cv=3,
                      scoring = 'f1_macro',
                      verbose=50,
                      n_jobs=-1,
                      refit=True)

# Train classifiers
gs_svm.fit(X_train, Y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [95]:
 # Summarize the results of your gridsearch
print('***GRIDSEARCH RESULTS***')

print("Best score: %f using %s" % (gs_svm.best_score_, gs_svm.best_params_))
means = gs_svm.cv_results_['mean_test_score']
stds = gs_svm.cv_results_['std_test_score']
params = gs_svm.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

***GRIDSEARCH RESULTS***
Best score: 0.751686 using {'classifier__estimator__C': 1, 'classifier__estimator__degree': 2, 'classifier__estimator__gamma': 1, 'classifier__estimator__kernel': 'linear'}
0.568701 (0.004266) with: {'classifier__estimator__C': 0.1, 'classifier__estimator__degree': 2, 'classifier__estimator__gamma': 1, 'classifier__estimator__kernel': 'linear'}
0.746758 (0.001327) with: {'classifier__estimator__C': 0.1, 'classifier__estimator__degree': 2, 'classifier__estimator__gamma': 1, 'classifier__estimator__kernel': 'poly'}
0.568701 (0.004266) with: {'classifier__estimator__C': 0.1, 'classifier__estimator__degree': 3, 'classifier__estimator__gamma': 1, 'classifier__estimator__kernel': 'linear'}
0.694132 (0.004276) with: {'classifier__estimator__C': 0.1, 'classifier__estimator__degree': 3, 'classifier__estimator__gamma': 1, 'classifier__estimator__kernel': 'poly'}
0.751686 (0.000693) with: {'classifier__estimator__C': 1, 'classifier__estimator__degree': 2, 'classifier__est

In [139]:
# Predict on the test set
best_model_svm = gs_svm.best_estimator_

predictions_svm = best_model_svm.predict(X_test)

In [97]:
# Evaluate predictions
metric = classification_report(Y_test, predictions_svm)

# Display metrics
print(metric)

              precision    recall  f1-score   support

           0       0.84      0.61      0.71       123
           1       0.85      0.72      0.78       149
           2       0.86      0.72      0.78       151
           3       0.81      0.65      0.72       122
           4       0.83      0.69      0.75       106
           5       0.79      0.57      0.66       102
           6       0.83      0.77      0.80       153
           7       0.77      0.71      0.74       117
           8       0.81      0.76      0.78       111
           9       0.82      0.70      0.76       175
          10       0.86      0.69      0.77       181
          11       0.82      0.74      0.78       153
          12       0.82      0.74      0.78       151
          13       0.85      0.70      0.77       165
          14       0.87      0.66      0.75       152
          15       0.90      0.81      0.85       132
          16       0.82      0.72      0.77       131
          17       0.80    

  _warn_prf(average, modifier, msg_start, len(result))


In [214]:
# Calculate the AUC
Y_probs_svm = best_model_svm.predict_proba(X_test)
Y_probs_array = np.array(Y_probs_svm)
Y_test_array = np.array(Y_test)
Y_test_trans = Y_test_array.T

auc = metrics.roc_auc_score(Y_test_trans,
                            Y_probs_array[:,:,1],
                            multi_class='ovr',
                            average='weighted')

# Display the AUC value
print('AUC: %.2f' % auc)

AUC: 0.95


# Model comparison
This section summarize results of previous one.

## Results summarize

These are best models:

**Random forest**

Best score: 0.495368

Using:
* n_estimators: 300
* criterion: entropy
* max_depth: 100
* min_samples_split: 4
* min_samples_leaf: 4

**Neural network**

Best score: 0.760384

Using:
* alpha: 1
* hidden_layer_sizes: (199, 191, 191)
* max_iter: 500000

**Support vector machine**

Best score: 0.751686

Using:
* kernel: linear
* C: 1
* gamma: 1


---

The table below shows the results for evaluating the tested models.

Precision, recall and f1-score are calculated by weighted averaging method; instead the AUC is calculated with one-over-rest multiclass and weighted average.

MODEL | PRECISION | RECALL | F1-SCORE | AUC |
------|-----------|--------|----------|-----|
Random forest | 0.96 | 0.43 | 0.59 | 0.95 |
Neural network | 0.82 | 0.78 | 0.80 | 0.97 |
Support vector machine | 0.84 | 0.72 | 0.77 | 0.95 |


## Best model selection

According to the table, the best models are Neural network and SVM, instead random forest is not at the others level. The last model could be improved by changing the parameters due to the low recall index.

I decide to choose neural network with:
* alpha: 1
* hidden_layer_sizes: (199, 191, 191)
* max_iter: 500000



In [15]:
# Select the best model
model = best_model_nn

# Model training
After selecting the best model, I train it with all the given dataset.

In [16]:
# Train classifier
model.fit(X,Y)

## Serializing the model

In [25]:
# Serialize the classifier to a file named 'my_classifier.pkl'
with open('my_classifier.pkl', 'wb') as file:
    pickle.dump(model, file)

# Test set preparation
Starting from the test dataset, I recreate the dataset choosen in the section 'manage dataset'.

## Menage test dataset

In [230]:
# Import principal DataFrame
parameter = pd.read_csv('Data/test_parameters.csv')
# Concatenating flooded_init and flooded_final with the DataFrame
data_folder = 'Data/test'

# Create empty columns for each feature in parameters DataFrame
features = ['flooded_init']
for feature in features:
    parameter[feature] = None

for index, row in parameter.iterrows():
    file_path = os.path.join(data_folder, str(row['ObservationIndex']) + '.csv')

    # Assuming the file is a CSV with header and target column
    file_data = pd.read_csv(file_path)

    new_features = file_data[['flooded_init']].copy()

    new_features['flooded_init'] = new_features['flooded_init'].astype(int)

    # Assign list of values to respective columns in parameters DataFrame
    for feature in features:
        parameter.at[index, feature] = list(new_features[feature].values)

# Display dataset
parameter

Unnamed: 0,SurfaceType,RainfallIntensity,init_max_hour,DrainageSystemCapacity,GreenSpaceRatio,ObservationIndex,flooded_init
0,D,4,4,0.22,0.28,0,"[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,A,3,5,0.16,0.21,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,A,4,5,0.15,0.12,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,D,3,5,0.29,0.13,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,D,3,5,0.20,0.13,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...
195,D,5,5,0.17,0.28,195,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
196,D,3,3,0.19,0.21,196,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
197,C,3,3,0.16,0.21,197,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
198,C,4,5,0.20,0.20,198,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [231]:
# Convert lists in flooded_init and flooded_final column to separate columns
flooded_init_df = pd.DataFrame(parameter['flooded_init'].tolist(), columns=[f'flooded_init_{i}' for i in range(len(parameter['flooded_init'][0]))])

# Concatenate the new columns with the original DataFrame
testset = pd.concat([parameter, flooded_init_df], axis=1)

# Display concatenated set
testset

Unnamed: 0,SurfaceType,RainfallIntensity,init_max_hour,DrainageSystemCapacity,GreenSpaceRatio,ObservationIndex,flooded_init,flooded_init_0,flooded_init_1,flooded_init_2,...,flooded_init_181,flooded_init_182,flooded_init_183,flooded_init_184,flooded_init_185,flooded_init_186,flooded_init_187,flooded_init_188,flooded_init_189,flooded_init_190
0,D,4,4,0.22,0.28,0,"[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A,3,5,0.16,0.21,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,A,4,5,0.15,0.12,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,D,3,5,0.29,0.13,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,D,3,5,0.20,0.13,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,D,5,5,0.17,0.28,195,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,...,0,0,0,0,0,0,0,0,0,0
196,D,3,3,0.19,0.21,196,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,...,0,0,0,0,0,0,0,0,0,0
197,C,3,3,0.16,0.21,197,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,...,0,0,0,0,0,0,0,0,0,0
198,C,4,5,0.20,0.20,198,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [232]:
# Drop the unnecessary column
testset.drop('flooded_init', axis=1, inplace=True)
testset.drop('ObservationIndex', axis=1, inplace=True)

# OneHot encode the 'SurfaceType' column
one_hot_encoded = pd.get_dummies(testset['SurfaceType'], prefix='Surface', dtype=int)
testset = testset.drop('SurfaceType', axis=1)
testset = pd.concat([one_hot_encoded, testset], axis=1)

#Display dataset
testset

Unnamed: 0,Surface_A,Surface_B,Surface_C,Surface_D,RainfallIntensity,init_max_hour,DrainageSystemCapacity,GreenSpaceRatio,flooded_init_0,flooded_init_1,...,flooded_init_181,flooded_init_182,flooded_init_183,flooded_init_184,flooded_init_185,flooded_init_186,flooded_init_187,flooded_init_188,flooded_init_189,flooded_init_190
0,0,0,0,1,4,4,0.22,0.28,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,3,5,0.16,0.21,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,4,5,0.15,0.12,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,3,5,0.29,0.13,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,1,3,5,0.20,0.13,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,0,0,1,5,5,0.17,0.28,0,0,...,0,0,0,0,0,0,0,0,0,0
196,0,0,0,1,3,3,0.19,0.21,0,0,...,0,0,0,0,0,0,0,0,0,0
197,0,0,1,0,3,3,0.16,0.21,0,0,...,0,0,0,0,0,0,0,0,0,0
198,0,0,1,0,4,5,0.20,0.20,0,0,...,0,0,0,0,0,0,0,0,0,0


## Test set preparation

In [233]:
# Scale the whole DataFrame with MinMaxScaler
scaler = MinMaxScaler()
test_scaled = scaler.fit_transform(testset)
test_scaled = pd.DataFrame(test_scaled)

#Save the testset
test_scaled.to_csv('Data/test_scaled.csv', index=False)

#Display scaled set
test_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,189,190,191,192,193,194,195,196,197,198
0,0.0,0.0,0.0,1.0,0.666667,0.5,0.60,0.90,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.333333,1.0,0.30,0.55,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.666667,1.0,0.25,0.10,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.333333,1.0,0.95,0.15,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.333333,1.0,0.50,0.15,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,0.0,1.0,1.000000,1.0,0.35,0.90,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196,0.0,0.0,0.0,1.0,0.333333,0.0,0.45,0.55,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
197,0.0,0.0,1.0,0.0,0.333333,0.0,0.30,0.55,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198,0.0,0.0,1.0,0.0,0.666667,1.0,0.50,0.50,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Model predicting
Predict the output with the test set.

In [22]:
# Create predictions
predictions = model.predict(test_scaled)

# Display predictions
predictions

array([[1, 1, 1, ..., 1, 0, 0],
       [1, 1, 1, ..., 1, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]])

# Creating predicitions files
Create similar training files with predictions.

In [26]:
# Define principal folders
data_folder = 'Data/test'
folder_path = 'predictions'

for index in np.arange(200):
    file_path = os.path.join(data_folder, str(index) + '.csv')

    # Read the file
    file_data = pd.read_csv(file_path)

    # Create a column of predicted flooded_final in boolean format
    pred = predictions[index, :].T
    df = pd.DataFrame({'flooded_predicted': pred.flatten()})
    df = df.astype(bool)

    # Concatenate the new columns with the original DataFrame
    predset = pd.concat([file_data,  df], axis=1)

    # Save the file in the specific folder
    file_name = str(index) + '.csv'
    save_path = f"{folder_path}/{file_name}"
    predset.to_csv(save_path, index=False)

# Display a file
predset

Unnamed: 0,head_id,tail_id,flooded_init,flooded_predicted
0,151779659,153066427,False,False
1,151779659,152426116,False,False
2,151779659,152136099,False,False
3,151779763,152437059,False,False
4,151779763,152541741,False,False
...,...,...,...,...
186,153295319,153341528,False,False
187,153444946,153444951,False,False
188,153456725,153456737,False,True
189,1191806314,1191806355,False,False
