In [312]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt
import seaborn as sns

In [313]:
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = 12, 8
plt.rcParams.update({'font.size': 10})

In [314]:
# Aquifers
Auser_path = pd.read_csv('./scaled_datasets/Aquifer_Auser_scaled.csv')
Doganella_path = pd.read_csv('./scaled_datasets/Aquifer_Doganella_scaled.csv')
Luco_path = pd.read_csv('C:/Users/shahi/Desktop/BW-2/Acea_Smart_Water_Analytics_ML/FeatureEngineering_Notebook/new_datasets/Aquifer_Luco.csv')

Petrignano_path = pd.read_csv('./scaled_datasets/Aquifer_Petrignano_scaled.csv')

In [315]:
# Lake
Bilancino_path = pd.read_csv('./scaled_datasets/Lake_Bilancino_scaled.csv')

In [316]:
# River
Arno_path = pd.read_csv('./scaled_datasets/River_Arno_scaled.csv',index_col=False)

In [317]:
# Water springs
Amiata_path = pd.read_csv('./scaled_datasets/Water_Spring_Amiata_scaled.csv')
Lupa_path = pd.read_csv('./scaled_datasets/Water_Spring_Lupa_scaled.csv')
Madonna_path = pd.read_csv('./scaled_datasets/Water_Spring_Madonna_di_Canneto_scaled.csv')

In [318]:
Arno_path

Unnamed: 0,Rainfall_Le_Croci,Rainfall_Cavallina,Rainfall_S_Agata,Rainfall_Mangona,Rainfall_S_Piero,Rainfall_Vernio,Temperature_Firenze,Hydrometry_Nave_di_Rosano
0,0.147368,0.147368,0.147110,0.184057,0.136155,0.188232,0.914890,0.100275
1,0.147267,0.147267,0.147009,0.183931,0.136061,0.188103,0.914264,0.106813
2,0.147113,0.147113,0.146856,0.183739,0.135920,0.187907,0.913311,0.116052
3,0.147351,0.147351,0.147093,0.184036,0.136139,0.188211,0.914788,0.101365
4,0.147318,0.147318,0.147060,0.183995,0.136109,0.188168,0.914582,0.103546
...,...,...,...,...,...,...,...,...
8212,0.000000,0.000000,0.000000,0.000000,0.000000,0.200895,0.976439,0.078795
8213,0.000000,0.000000,0.000000,0.000000,0.000000,0.201010,0.976999,0.071192
8214,0.000000,0.000000,0.000000,0.000000,0.000000,0.200932,0.976617,0.076457
8215,0.000000,0.000000,0.000000,0.000000,0.000000,0.201027,0.977080,0.070021


In [319]:
# Column names for target variables

# targets = {
#     'Auser': [
#         'Depth_to_Groundwater_SAL',
#         'Depth_to_Groundwater_CoS',
#         'Depth_to_Groundwater_LT2'
#         ],
#     'Doganella': [
#         'Depth_to_Groundwater_Pozzo_1',
#         'Depth_to_Groundwater_Pozzo_2',
#         'Depth_to_Groundwater_Pozzo_3',
#         'Depth_to_Groundwater_Pozzo_4',
#         'Depth_to_Groundwater_Pozzo_5',
#         'Depth_to_Groundwater_Pozzo_6',
#         'Depth_to_Groundwater_Pozzo_7',
#         'Depth_to_Groundwater_Pozzo_8',
#         'Depth_to_Groundwater_Pozzo_9'
#         ],
#     'Luco': [
#         'Depth_to_Groundwater_Podere_Casetta'
#         ],
#     'Petrignano': [
#         'Depth_to_Groundwater_P24',
#         'Depth_to_Groundwater_P25'
#         ],
#     'Bilancino': [
#         'Lake_Level', 
#         'Flow_Rate'
#         ],
#     'Arno': [
#         'Hydrometry_Nave_di_Rosano'
#         ],
#     'Amiata': [
#         'Flow_Rate_Bugnano',
#         'Flow_Rate_Arbure',
#         'Flow_Rate_Ermicciolo',
#         'Flow_Rate_Galleria_Alta'
#         ],
#     'Lupa': [
#         'Flow_Rate_Lupa'
#         ],
#     'Madonna': [
#         'Flow_Rate_Madonna_di_Canneto'
#         ]
#     }

In [320]:
Arno_X = Arno_path.drop(['Hydrometry_Nave_di_Rosano'],axis=1)

In [321]:
Arno_X.shape

(8217, 7)

In [322]:
Arno_y = Arno_path['Hydrometry_Nave_di_Rosano']

In [323]:
X_train,X_test,y_train,y_test = train_test_split(Arno_X,Arno_y,test_size=0.2,random_state=0)

In [324]:
from sklearn.tree          import DecisionTreeRegressor
from sklearn.ensemble      import RandomForestRegressor
from sklearn.ensemble      import ExtraTreesRegressor
from sklearn.ensemble      import AdaBoostRegressor
from sklearn.ensemble      import GradientBoostingRegressor
from xgboost               import XGBRegressor
from lightgbm              import LGBMRegressor
from catboost              import CatBoostRegressor

from sklearn               import metrics

import time

In [325]:
from sklearn.linear_model import LinearRegression,LogisticRegression
tree_classifiers = {
  "Regression":    LinearRegression(),
  "Decision Tree": DecisionTreeRegressor(),
  "Extra Trees":   ExtraTreesRegressor(n_estimators=100),
  "Random Forest": RandomForestRegressor(n_estimators=100),
  "AdaBoost":      AdaBoostRegressor(n_estimators=100),
  "Skl GBM":       GradientBoostingRegressor(n_estimators=100),
  "XGBoost":       XGBRegressor(n_estimators=100),
  "LightGBM":      LGBMRegressor(n_estimators=100),
  "CatBoost":      CatBoostRegressor(n_estimators=100)
}

In [326]:
rang = abs(y_train.max()) + abs(y_train.min())

In [327]:
results = pd.DataFrame({'Model': [], 'MSE': [], 'MAB': [], " % error": [], 'Time': []})

for model_name, model in tree_classifiers.items():
    
    start_time = time.time()
    model.fit(X_train, y_train)
    total_time = time.time() - start_time
        
    pred = model.predict(X_test)
    
    results = results.append({"Model":    model_name,
                              "MSE": metrics.mean_squared_error(y_test, pred),
                              "MAB": metrics.mean_absolute_error(y_test, pred),
                              " % error": metrics.mean_squared_error(y_test, pred) / rang,
                              "Time":     total_time},
                              ignore_index=True)

Learning rate set to 0.358415
0:	learn: 0.0756201	total: 3.44ms	remaining: 340ms
1:	learn: 0.0639810	total: 7.11ms	remaining: 349ms
2:	learn: 0.0567613	total: 10.4ms	remaining: 336ms
3:	learn: 0.0522302	total: 13.5ms	remaining: 324ms
4:	learn: 0.0487116	total: 17ms	remaining: 322ms
5:	learn: 0.0466078	total: 20ms	remaining: 314ms
6:	learn: 0.0448047	total: 23.8ms	remaining: 317ms
7:	learn: 0.0432126	total: 27ms	remaining: 311ms
8:	learn: 0.0423218	total: 30.4ms	remaining: 308ms
9:	learn: 0.0415285	total: 33.3ms	remaining: 300ms
10:	learn: 0.0408133	total: 36.8ms	remaining: 297ms
11:	learn: 0.0398504	total: 39.9ms	remaining: 293ms
12:	learn: 0.0393382	total: 43.3ms	remaining: 290ms
13:	learn: 0.0388591	total: 46.4ms	remaining: 285ms
14:	learn: 0.0382309	total: 49.7ms	remaining: 282ms
15:	learn: 0.0377449	total: 53.4ms	remaining: 280ms
16:	learn: 0.0372146	total: 57.2ms	remaining: 279ms
17:	learn: 0.0366855	total: 60.3ms	remaining: 275ms
18:	learn: 0.0363070	total: 63.4ms	remaining: 270m

In [328]:
results_ord = results.sort_values(by=['MSE'], ascending=True, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['MSE', 'MAE'], vmin=0, vmax=100, color='#5fba7d')

print(results_ord)

           Model       MSE       MAB   % error      Time
1    Extra Trees  0.000958  0.011956  0.000966  1.404242
2       CatBoost  0.001246  0.017144  0.001255  0.362211
3        XGBoost  0.001450  0.015195  0.001462  0.789886
4       LightGBM  0.001491  0.015706  0.001502  0.130651
5  Random Forest  0.001653  0.014287  0.001666  2.755598
6        Skl GBM  0.001803  0.021127  0.001817  1.110067
7  Decision Tree  0.003107  0.018763  0.003132  0.049865
8       AdaBoost  0.005551  0.062598  0.005594  0.921538
9     Regression  0.007359  0.059364  0.007417  0.003992
