# Build model to estimate the best Hyperparameters for the forest

In [None]:
# Follow literature advice and use 10-fold cross validation to avoid for overfitting


def calculate_hyperparameters(dataset, target, features):
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.ensemble import RandomForestClassifier
    from pprint import pprint
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    
    
    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}


    X=dataset[features] # Has to be an array]  # Features
    y=dataset[target]  # Labels


    # Use the random grid to search for best hyperparameters

    rf = RandomForestClassifier()
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 10, verbose=2, random_state=42, n_jobs = -1)
    # Fit the random search model
    rf_random.fit(X, y)

    print(rf_random.best_params_)
    
    

In [1]:
rf_random.best_params_

# Run Hyperparameter Calculation for Flood as label (target)

In [2]:
# Set features
import pandas as pd
import numpy as np

target_hyperparameter_calculation = 'IsFloodingPeriode' # Can be any kind of data type
features_hyperparameter_calculation = ['CO2','abnormal_Co2_leakage','PM25', 'PM10', 'Temperature', 'Humidity', 'Pressure', 'Precipitation','mine_water_level', 'ground_water_level','RhineWaterLevel','Stream_water_level','Discharge'] # Has to be an array

# Call hyperparameter model
dataset = pd.read_csv('/Users/jan-philippviefhues/Desktop/UNI/Maastricht/um/Thesis/data/datasets/cleaned_datasets/total_dataset_with_precipitation_and_dummies_hourly.csv')

calculate_hyperparameters(dataset, target_hyperparameter_calculation,features_hyperparameter_calculation)

In [3]:
# Train the model with the defined hyperparameters and calculate the accuracy

print(f'Train Accuracy - : {rf_random.score(X_train, y_train):.3f}')
print(f'Test Accuracy - : {rf_random.score(X_test, y_test):.3f}')

# The accuracy rates on both sets are euqal enough. We dont have overfitting issue here

### Hyperparameter Calculation for Flood only 8 Features

In [4]:
target_hyperparameter_calculation = 'IsFloodingPeriode' # Can be any kind of data type
eight_features_hyperparameter_calculation = ['mine_water_level','ground_water_level','Stream_water_level', 'RhineWaterLevel', 'Humidity','Temperature','Discharge','Pressure'] # Has to be an array

dataset = pd.read_csv('/Users/jan-philippviefhues/Desktop/UNI/Maastricht/um/Thesis/data/datasets/cleaned_datasets/total_dataset_with_precipitation_and_dummies_hourly.csv')

calculate_hyperparameters(dataset, target_hyperparameter_calculation,eight_features_hyperparameter_calculation)

# Run Hyperparameter Calculation for CO2 leakage as label (target)

In [5]:
# Before Data set

# Set features
target_hyperparameter_calculation_abnormalCO2 = 'abnormal_Co2_leakage' # Can be any kind of data type
features_hyperparameter_calculation_abnormalCO2 = ['IsFloodingPeriode','PM25', 'PM10', 'Temperature', 'Humidity', 'Pressure', 'Precipitation','mine_water_level', 'ground_water_level','RhineWaterLevel','Stream_water_level','Discharge'] # Has to be an array

# Call hyperparameter model
#dataset = pd.read_csv('/Users/jan-philippviefhues/Desktop/UNI/Maastricht/um/Thesis/data/datasets/cleaned_datasets/total_dataset_with_precipitation_and_dummies_hourly.csv')
dataset = pd.read_csv('/Users/jan-philippviefhues/Desktop/UNI/Maastricht/um/Thesis/data/datasets/cleaned_datasets/hourly_dataset.before.flooding_with_precipitation_and_dummies.csv')
calculate_hyperparameters(dataset, target_hyperparameter_calculation_abnormalCO2,features_hyperparameter_calculation_abnormalCO2)


# Result
#Fitting 10 folds for each of 100 candidates, totalling 1000 fits
# {'n_estimators': 1600, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': True}


In [None]:
# During Data set

# Set features
target_hyperparameter_calculation_abnormalCO2 = 'abnormal_Co2_leakage' # Can be any kind of data type
features_hyperparameter_calculation_abnormalCO2 = ['IsFloodingPeriode','PM25', 'PM10', 'Temperature', 'Humidity', 'Pressure', 'Precipitation','mine_water_level', 'ground_water_level','RhineWaterLevel','Stream_water_level','Discharge'] # Has to be an array

# Call hyperparameter model
#dataset = pd.read_csv('/Users/jan-philippviefhues/Desktop/UNI/Maastricht/um/Thesis/data/datasets/cleaned_datasets/total_dataset_with_precipitation_and_dummies_hourly.csv')
dataset = pd.read_csv('/Users/jan-philippviefhues/Desktop/UNI/Maastricht/um/Thesis/data/datasets/cleaned_datasets/hourly_dataset.during.flooding_with_precipitation_and_dummies.csv')
calculate_hyperparameters(dataset, target_hyperparameter_calculation_abnormalCO2,features_hyperparameter_calculation_abnormalCO2)


# Result
#Fitting 10 folds for each of 100 candidates, totalling 1000 fits
# best_parameters = {'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 70, 'bootstrap': True}


In [6]:
# After Data set

# Set features
target_hyperparameter_calculation_abnormalCO2 = 'abnormal_Co2_leakage' # Can be any kind of data type
features_hyperparameter_calculation_abnormalCO2 = ['IsFloodingPeriode','PM25', 'PM10', 'Temperature', 'Humidity', 'Pressure', 'Precipitation','mine_water_level', 'ground_water_level','RhineWaterLevel','Stream_water_level','Discharge'] # Has to be an array

# Call hyperparameter model
#dataset = pd.read_csv('/Users/jan-philippviefhues/Desktop/UNI/Maastricht/um/Thesis/data/datasets/cleaned_datasets/total_dataset_with_precipitation_and_dummies_hourly.csv')
dataset = pd.read_csv('/Users/jan-philippviefhues/Desktop/UNI/Maastricht/um/Thesis/data/datasets/cleaned_datasets/hourly_dataset.after.flooding_with_precipitation_and_dummies.csv')
calculate_hyperparameters(dataset, target_hyperparameter_calculation_abnormalCO2,features_hyperparameter_calculation_abnormalCO2)


# Result
#Fitting 10 folds for each of 100 candidates, totalling 1000 fits
# {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 80, 'bootstrap': True}


In [7]:
# Total Data set

# Set features
target_hyperparameter_calculation_abnormalCO2 = 'abnormal_Co2_leakage' # Can be any kind of data type
features_hyperparameter_calculation_abnormalCO2 = ['IsFloodingPeriode','PM25', 'PM10', 'Temperature', 'Humidity', 'Pressure', 'Precipitation','mine_water_level', 'ground_water_level','RhineWaterLevel','Stream_water_level','Discharge'] # Has to be an array

# Call hyperparameter model

total_dataset = pd.read_csv('/Users/jan-philippviefhues/Desktop/UNI/Maastricht/um/Thesis/data/datasets/cleaned_datasets/total_dataset_with_precipitation_and_dummies_hourly.csv')
calculate_hyperparameters(total_dataset, target_hyperparameter_calculation_abnormalCO2,features_hyperparameter_calculation_abnormalCO2)


# Result
#Fitting 10 folds for each of 100 candidates, totalling 1000 fits
# {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 80, 'bootstrap': True}


In [8]:
# Train the model with the defined hyperparameters and calculate the accuracy

print(f'Train Accuracy - : {rf_random.score(X_train, y_train):.3f}')
print(f'Train Accuracy - : {rf_random.score(X_test, y_test):.3f}')

# The accuracy rates on both sets are euqal enough. We dont have overfitting issue here