# Supervised methods 

Preparing data

In [None]:
# Common imports
import os
import glob
import numpy as np
import pandas as pd
import xarray as xr
from utils_data import * 
from utils_ml import *
import matplotlib.pyplot as plt

In [None]:
def plotprediction_TS(test_dates, final_predictions, test_labels):
    """Draw two side-by-side panels comparing predictions with observations.

    Left panel: seaborn regression plot of "Predicted" against "Actual".
    Right panel: both series as lines over ``test_dates``.

    Parameters
    ----------
    test_dates : dates used for the x-axis of the right panel.
    final_predictions : predicted values (plotted as "Predicted").
    test_labels : observed values (plotted as "Actual").

    Returns nothing; the figure is created through matplotlib.
    """
    import seaborn as sns

    # Wide table: one row per date, one column per series.
    wide = pd.DataFrame({
        'date': test_dates,
        'Actual': test_labels,
        'Predicted': final_predictions,
    })
    # Long table so seaborn can colour the two series through `hue`.
    long = wide.melt(id_vars=['date'],
                     value_vars=['Actual', 'Predicted'],
                     var_name='data',
                     value_name='precip')

    _, (ax_scatter, ax_series) = plt.subplots(1, 2, figsize=(12, 5), sharey=True)

    sns.regplot(data=wide, x="Actual", y="Predicted", ax=ax_scatter)
    sns.lineplot(data=long, x='date', y='precip', hue='data', ax=ax_series)

In [None]:
# Paths
# The data directory can be overridden with the CAS_ML_DATADIR environment
# variable so the notebook is not tied to one machine; the original absolute
# path is kept as the default for backward compatibility.
# os.path.join(..., '') guarantees the trailing separator that later string
# concatenations such as DATADIR + 'TS_CH/' rely on.
DATADIR = os.path.join(
    os.environ.get('CAS_ML_DATADIR', '/Users/noeliaotero/Documents/CAS_ML/data/'),
    '',
)

# Some constants
# First and last date passed to the data-loading helpers.
DATE_START = '1979-01-01'
DATE_END = '2020-12-31'

In [None]:
# Load the local time-series CSV files (df*.csv) found under TS_CH/.
DIRCSV  = DATADIR + 'TS_CH/'
# glob.glob returns matches in arbitrary, filesystem-dependent order. Sort the
# list so read_csv_files receives the files in the same order on every run;
# later cells select columns by position (df_all.columns[1:9]).
l_files = sorted(glob.glob(os.path.join(DIRCSV, 'df*.csv')))
df_vars =  read_csv_files(l_files, DATE_START, DATE_END)
df_vars.head()

In [None]:
# Lagged predictor: T2MMEAN of the previous row.
# shift leaves a missing value in the first row, which has no predecessor.
df_vars['T2MLag'] = df_vars['T2MMEAN'].shift(periods=1)
df_vars.head()

In [None]:
# Read precipitation and keep only the date and reg_tot (all country) columns.
df_prec = get_precipitation_data(
    DATADIR + 'TS_CH/precip_regions.csv', DATE_START, DATE_END
)[['date', 'reg_tot']]

In [None]:
# Read large-scale atmospheric PCs and clusters
df_PCs = pd.read_csv(DATADIR + 'ERA5/PCdf.csv')
# Normalize timestamps to midnight so 'date' can be used as a merge key.
df_PCs['date'] = pd.DatetimeIndex(df_PCs['date']).normalize()
df_clusters =  pd.read_csv(DATADIR + 'ERA5/Cluster_spatialmean.csv')
# Take an explicit copy: assigning into a column selection of df_clusters
# raises SettingWithCopyWarning and may not update the intended object.
df_clus    = df_clusters[['date','Cluster']].copy()
df_clus['date'] = pd.DatetimeIndex(df_clus['date']).normalize()

In [None]:
# Combine the predictor table and the precipitation table into one frame.
l_all = [df_vars, df_prec]
df_all = concat_dataframes(l_all)

Data exploration

In [None]:
import seaborn

# Pairwise scatter plots with regression fits for the columns at
# positions 1 to 8 of df_all.
seaborn.pairplot(data=df_all, vars=df_all.columns[1:9], kind='reg')

In [None]:
# from pandas.tools.plotting import scatter_matrix # For older versions of Pandas
#from pandas.plotting import scatter_matrix

#scatter_matrix(df_input[df_input.columns[1:9]], figsize=(12, 8))

Prepare the data 

In [None]:
# Year ranges handed to split_data for the training and test periods
# (presumably [first_year, last_year] — confirm against split_data).
yy_train, yy_test = [1979, 2015], [2016, 2020]
# Name of the target column: the second column of df_prec.
ylabel = df_prec.columns[1]

In [None]:
# Build the model input by merging the local variables with the PCs.
# Alternative with the categorical cluster variable (currently disabled):
#df_input = pd.merge(df_all, df_clus)
df_input = df_all.merge(df_PCs)
names_col = df_input.columns
# Attributes (covariates): every column except the date and the target.
attributes = names_col.drop(['date', 'reg_tot'])
df_input.head()

In [None]:
# Quick look at the target time series.
from matplotlib import pyplot
df_input.loc[:, 'reg_tot'].plot()
pyplot.show()

In [None]:
attributes

In [None]:
# Split features, labels and dates into the training and test periods.
(train_dataset, train_labels,
 test_dataset, test_labels,
 train_dates, test_dates) = split_data(df_input, yy_train, yy_test, attributes, ylabel)

In [None]:
# Name of the categorical column. It is not passed to prepareData here
# (None is passed instead).
cat_var='Cluster'
fpipeline = prepareData(train_dataset, None)
# Fit the preprocessing on the training data only ...
X_prep_train = fpipeline.fit_transform(train_dataset)
# ... and apply the already-fitted transformation to the test data.
# Calling fit_transform here would refit the pipeline on the test set, so
# train and test features would be transformed with different parameters and
# test-set statistics would leak into the evaluation.
X_prep_test = fpipeline.transform(test_dataset)

In [None]:
X_prep_train

# Multiple linear regression

In [None]:
# Import some libraries
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import scale
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

In [None]:
# Ordinary least squares on the preprocessed training features.
lr = LinearRegression(n_jobs=16)
lr.fit(X=X_prep_train, y=train_labels)


In [None]:
# Mean squared error of the linear model on the training and test splits.
mse_train = mean_squared_error(train_labels, lr.predict(X_prep_train))
mse_test = mean_squared_error(test_labels, lr.predict(X_prep_test))
print(f'Train MSE = {mse_train}', f'Test MSE = {mse_test}', sep='\n')
# RMSE is reported as well because it is in the units of the target.
print(f'Train RMSE = {np.sqrt(mse_train)}', f'Test RMSE = {np.sqrt(mse_test)}', sep='\n')

Do we want to apply model selection?
RFE (Recursive feature elimination)

In [None]:
# Recursive feature elimination wrapped around the linear model; fit()
# returns the fitted selector itself.
rfe = RFE(lr).fit(X_prep_train, train_labels)
# Training MSE of the model restricted to the selected features.
mean_squared_error(train_labels, rfe.predict(X_prep_train))

In [None]:
def plot_result(x, y):
    """Scatter predicted values against measured values with a 1:1 line.

    Parameters
    ----------
    x : array-like of measured (observed) values, drawn on the x-axis.
    y : array-like of predicted values, drawn on the y-axis.

    The function previously ignored both arguments and read the notebook
    globals ``test_labels`` and ``preds``; ``preds`` is not defined in the
    notebook, so every call failed with NameError. It now plots exactly
    what it is given.
    """
    fig, ax = plt.subplots()
    ax.scatter(x, y)
    # Dashed diagonal spanning the measured range: points on this line are
    # perfect predictions.
    lo, hi = np.min(x), np.max(x)
    ax.plot([lo, hi], [lo, hi], 'k--', lw=1)
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    plt.show()

Apply cross-validation

In [None]:
# cross_val_predict returns an array of the same size as `y` where each entry
# is a prediction obtained by cross validated:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split


In [None]:
# 10-fold cross-validation of the linear model on the training data.
# scikit-learn scorers follow "greater is better", so the MSE is returned
# with a negative sign (minimizing MSE == maximizing negative MSE).
lr_cv_mse = cross_val_score(
    estimator=lr,
    X=X_prep_train,
    y=train_labels,
    scoring='neg_mean_squared_error',
    cv=10,
)
# Mean negative MSE over the folds; flip the sign to compare it with the
# MSE values computed earlier.
lr_cv_mse.mean()
In [None]:
# Per-fold RMSE: undo the sign convention of the scorer, then take the root.
lin_rmse_scores = np.sqrt(np.negative(lr_cv_mse))
pd.Series(lin_rmse_scores).describe()

In [None]:
# One-line summary: number of folds, mean absolute score (MSE) and its spread.
print(f"Folds: {len(lr_cv_mse)}, MSE: {np.mean(np.abs(lr_cv_mse))}, STD: {np.std(lr_cv_mse)}")


In [None]:
# Fitted regression coefficients. This expression is not the last line of
# the cell, so the notebook does not display it.
lr.coef_
#coeff_df = pd.DataFrame(lr.coef_, attributes, columns=['Coefficient'])
# Predictions of the linear model for the test period.
y_pred = lr.predict(X=X_prep_test)

In [None]:
plotprediction_TS(test_dates, y_pred, test_labels)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline random forest: 100 trees, fixed seed for reproducibility.
forest_reg = RandomForestRegressor(random_state=42, n_estimators=100)
forest_reg.fit(X=X_prep_train, y=train_labels)
# Predictions for the test period.
y_rf_pred = forest_reg.predict(X=X_prep_test)

In [None]:
# Mean squared error of the baseline forest on the training and test splits.
mse_rf_train = mean_squared_error(train_labels, forest_reg.predict(X_prep_train))
mse_rf_test = mean_squared_error(test_labels, forest_reg.predict(X_prep_test))
print(f'Train MSE = {mse_rf_train}', f'Test MSE = {mse_rf_test}', sep='\n')
print(f'Train RMSE = {np.sqrt(mse_rf_train)}', f'Test RMSE = {np.sqrt(mse_rf_test)}', sep='\n')

In [None]:
# Note the overfitting: the RMSE is much higher on the test set than on the training set.

In [None]:
# Hyperparameter tuning with an exhaustive grid search
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 9 (3×3) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]
# NOTE: this rebinds forest_reg, replacing the fitted baseline forest from the
# earlier cell with a fresh, unfitted estimator used only as the search template.

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (9+6)*5=75 rounds of training
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(X_prep_train, train_labels)
# Parameter combination with the best (highest, i.e. least negative) CV score.
best_params = grid_search.best_params_

In [None]:
# Best cross-validated score (negative MSE) and the parameters that achieved it.
print(f"Best Score: {grid_search.best_score_}")
print(f"Best params: {best_params}")

In [None]:
# Refit a forest on the full training set with the best grid-search parameters.
# random_state=42 matches the estimator used inside the grid search: without
# it the refit is seeded randomly, so results change on every run and the
# model is not the one whose CV score was reported above.
forest_GCV_reg = RandomForestRegressor(n_jobs=-1, random_state=42).set_params(**best_params)
forest_GCV_reg.fit(X_prep_train,train_labels)

In [None]:
# Test-period predictions of the tuned forest (used for plotting).
y_rf_cv_predict = forest_GCV_reg.predict(X_prep_test)
# Mean squared error of the tuned forest on the training and test splits.
mse_rf_cv_train = mean_squared_error(train_labels, forest_GCV_reg.predict(X_prep_train))
mse_rf_cv_test = mean_squared_error(test_labels, forest_GCV_reg.predict(X_prep_test))
print(f'Train MSE = {mse_rf_cv_train}', f'Test MSE = {mse_rf_cv_test}', sep='\n')
print(f'Train RMSE = {np.sqrt(mse_rf_cv_train)}', f'Test RMSE = {np.sqrt(mse_rf_cv_test)}', sep='\n')

In [None]:
plotprediction_TS(test_dates, y_rf_cv_predict, test_labels)

In [None]:
features_importance = forest_GCV_reg.feature_importances_

In [None]:
# Pair each importance with a feature name and sort from most to least important.
# NOTE(review): this assumes the columns produced by the preprocessing pipeline
# are in the same order and number as `attributes` — confirm against prepareData.
sorted_features_importance = sorted(zip(features_importance, attributes), reverse=True)

In [None]:
sorted_features_importance

# Predicting extremes

Logistic regression

In [None]:
# Derive the exceedance series from the precipitation table.
# NOTE(review): precip_exceedance is defined outside this notebook; it
# presumably flags days above a threshold — confirm its definition and whether
# it modifies df_prec in place.
df_prec_ex = precip_exceedance(df_prec)
# Multiplying by 1 presumably converts boolean flags to 0/1 integers — confirm.
df_prec_ex['reg_tot'] = df_prec_ex['reg_tot']*1

In [None]:
# Work on a copy. A plain assignment would only alias df_input, so replacing
# 'reg_tot' below would also overwrite the continuous precipitation target
# used by the regression cells above and break any re-run of them.
df_input_ex = df_input.copy()
# Replace reg_tot by the exceedances
# NOTE(review): this assignment aligns rows by index label, not by date. It is
# only correct if df_input and df_prec_ex share the same row index — confirm,
# or merge on 'date' instead.
df_input_ex['reg_tot'] = df_prec_ex['reg_tot']

In [None]:
# Re-split with the exceedance labels. This rebinds the train/test variables
# of the regression section. Only the labels have been replaced by the
# exceedances; the features are the same as before.
(train_dataset, train_labels,
 test_dataset, test_labels,
 train_dates, test_dates) = split_data(df_input_ex, yy_train, yy_test, attributes, ylabel)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the exceedance labels; every parameter not listed
# here keeps its scikit-learn default.
# The preprocessed features (X_prep_train) computed in the regression
# section are reused.
logisticRegr = LogisticRegression(max_iter=1000, solver='lbfgs')
logisticRegr.fit(X=X_prep_train, y=train_labels)

In [None]:
y_ex_pred=logisticRegr.predict(X_prep_test)

In [None]:
# Confusion matrix of observed against predicted exceedances on the test set
# (rows: true class, columns: predicted class).
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_true=test_labels, y_pred=y_ex_pred)
cnf_matrix