In [3]:
#Importar librerias para el uso de Random Forest con sklearn regression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import median_absolute_error

from sklearn.metrics import classification_report

from sklearn.model_selection import GridSearchCV

In [4]:
# Load the CSV with energy consumption and temperature readings (the original
# note describes it as covering 23 provinces + CABA). The first column is parsed
# as day-first dates and used as the index.
df = pd.read_csv("BASE.csv", header=0, parse_dates=[0], index_col=0, dayfirst=True)
# Derive calendar features (year, month, quarter) from the date index.
timestamps = pd.DatetimeIndex(df.index)
df = df.assign(**{'Año': timestamps.year, 'Mes': timestamps.month, 'Trim': timestamps.quarter})
# Drop Energia BSAS
df = df.drop(columns=['Energia BSAS'])
df.tail()

Unnamed: 0_level_0,Hora,Tipo Dia,Clima BSAS,Energia GBA,Clima GBA,Año,Mes,Trim
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-09-30 19:00:00,19,5,16.8,,16.8,2021,9,3
2021-09-30 20:00:00,20,5,16.9,,16.9,2021,9,3
2021-09-30 21:00:00,21,5,16.9,,16.9,2021,9,3
2021-09-30 22:00:00,22,5,16.9,,16.9,2021,9,3
2021-09-30 23:00:00,23,5,16.4,,16.4,2021,9,3


In [5]:
# Keep only the rows whose index falls before September 2021
cutoff = '2021-09-01 00:00:00'
is_before_cutoff = df.index < cutoff
df_before = df[is_before_cutoff]
df_before.tail()

Unnamed: 0_level_0,Hora,Tipo Dia,Clima BSAS,Energia GBA,Clima GBA,Año,Mes,Trim
Fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-08-31 19:00:00,19,3,19.5,5511.642,19.5,2021,8,3
2021-08-31 20:00:00,20,3,19.4,5980.252,19.4,2021,8,3
2021-08-31 21:00:00,21,3,20.0,6096.688,20.0,2021,8,3
2021-08-31 22:00:00,22,3,19.1,5955.714,19.1,2021,8,3
2021-08-31 23:00:00,23,3,19.0,5490.539,19.0,2021,8,3


In [28]:
# Count the missing values in each column of the pre-September frame
df_before.isna().sum()

Hora           0
Tipo Dia       0
Clima BSAS     0
Energia GBA    0
Clima GBA      0
Año            0
Mes            0
Trim           0
dtype: int64

In [30]:
# Split df_before into X_train, y_train and X_test, y_test for Energia GBA.
# Columns are selected by name: the previous positional slices used 'Hora' as
# the only feature and 'Tipo Dia' (column 1) as the target, not the energy series.
target_col = 'Energia GBA'
X_train, X_test, y_train, y_test = train_test_split(
    df_before.drop(columns=[target_col]),  # every remaining column is a feature
    df_before[target_col],
    test_size=0.2,
    random_state=0,
)



In [31]:
def train_model(X_train, y_train, X_test, y_test, param_grid=None, cv=5, scoring=None):
    """Tune a RandomForestRegressor with a cross-validated grid search.

    Prints the best parameters, the best score and the score of every
    candidate, and writes the full ``cv_results_`` table to ``results.csv``.

    Parameters
    ----------
    X_train : 2-D array-like or DataFrame
        Training features.
    y_train : array-like
        Training target.
    X_test, y_test :
        Not used here; kept so existing calls keep working.
    param_grid : dict, optional
        Hyperparameter grid passed to GridSearchCV. When omitted, a small
        default grid of valid RandomForestRegressor settings is used.
    cv : int, default 5
        Number of cross-validation folds.
    scoring : str or callable, optional
        Single metric passed to GridSearchCV. ``None`` keeps the estimator's
        own ``score`` method.

    Returns
    -------
    The best estimator found by the search, refit on the training data.
    """
    if param_grid is None:
        n_features = np.shape(X_train)[1]
        # Only estimator hyperparameters belong in the grid. The previous grid
        # could not run: 'scoring' is a GridSearchCV argument rather than a
        # forest parameter, min_samples_split=1 is invalid (integers must be
        # >= 2), oob_score=True is rejected when bootstrap=False, and
        # random_state is a seed rather than something to tune. Its full
        # product was also hundreds of millions of candidates.
        param_grid = {
            'n_estimators': [50, 100],
            # Integer max_features values are capped at the number of features
            # (larger values are rejected by some scikit-learn versions).
            'max_features': sorted({1, max(1, n_features // 2), n_features}),
            'max_depth': [5, 10, None],
            'min_samples_split': [2, 10],
            'min_samples_leaf': [1, 5],
        }
    # Fixed seed so repeated runs of the search give the same forests.
    rf = RandomForestRegressor(random_state=0)
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv,
                               scoring=scoring, n_jobs=-1)
    # Fit the GridSearchCV object to the training data
    grid_search.fit(X_train, y_train)
    # Print the best parameters
    print('Best parameters: {}'.format(grid_search.best_params_))
    # Print the best score
    print('Best score: {}'.format(grid_search.best_score_))
    # Print the score for each parameter combination
    means = grid_search.cv_results_['mean_test_score']
    stds = grid_search.cv_results_['std_test_score']
    params = grid_search.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
    # Save the complete results table to a csv file
    results = pd.DataFrame(grid_search.cv_results_)
    results.to_csv('results.csv')
    # Return the best estimator
    return grid_search.best_estimator_
def predict_values(model, X_test):
    """Return the predictions of ``model`` for ``X_test``.

    ``model`` only needs a ``predict`` method; its result is returned as is.
    """
    return model.predict(X_test)
def calculate_error(y_test, y_pred):
    """Return the mean squared error between ``y_test`` and ``y_pred``."""
    return mean_squared_error(y_test, y_pred)
def plot_results(y_test, y_pred):
    """Show a scatter plot of the predictions against the true values."""
    axes = plt.gca()  # same current axes the pyplot helpers would draw on
    axes.scatter(y_test, y_pred)
    axes.set_xlabel('True Values')
    axes.set_ylabel('Predictions')
    plt.show()
def plot_error(error):
    """Plot one or more mean squared error values.

    Parameters
    ----------
    error : float or sequence of float
        A single error value or a series of them, plotted against position.
    """
    # A scalar becomes a single point, and a line plot without markers would
    # draw nothing visible for it, so wrap it in an array and add a marker.
    plt.plot(np.atleast_1d(error), marker='o')
    plt.xlabel('Iterations')
    plt.ylabel('Mean Squared Error')
    plt.show()



In [33]:
# Train the model, predict on the test set, score the predictions and plot
model = train_model(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
y_pred = predict_values(model=model, X_test=X_test)
error = calculate_error(y_test=y_test, y_pred=y_pred)
plot_results(y_test=y_test, y_pred=y_pred)
plot_error(error=error)
