In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import explained_variance_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn import set_config
from sklearn.metrics import explained_variance_score

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV 
from pprint import pprint

In [None]:
# Folder holding the CSV inputs. Defaults to the original JupyterHub location;
# set the ENERGY_DATA_DIR environment variable to run the notebook elsewhere
# without editing this cell.
DATA_DIR = os.environ.get("ENERGY_DATA_DIR", r"/home/jovyan/energy_consumption")

# The working directory is still switched (as before), so the relative file
# names below resolve against DATA_DIR.
os.chdir(DATA_DIR)

# The first CSV column is used as the row index for all three files.
X = pd.read_csv("data_train_base.csv", index_col=0)
y = pd.read_csv("labels_train_base.csv", index_col=0)
X_test_final = pd.read_csv("data_test_base.csv", index_col=0)

# Hold out 33% of the labelled rows for evaluation; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Baseline model: linear regression

In [None]:
# Baseline estimator: ordinary least-squares regression.
clf = LinearRegression()

# 5-fold cross-validated explained variance, computed on the full (X, y) set.
cross_val_score(
    estimator=clf,
    X=X,
    y=y,
    scoring="explained_variance",
    cv=5,
)

In [None]:
# Train the baseline on the training split and predict the held-out split.
# `fit` returns the estimator itself, so `clf` ends up fitted exactly as when
# the two calls are written separately.
y_pred = clf.fit(X_train, y_train).predict(X_test)

In [None]:
# Explained variance of the baseline predictions on the held-out split.
explained_variance_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Sanity check: exactly one prediction per held-out row.
assert len(y_pred) == len(X_test)


# Regression tree

In [None]:
from sklearn import tree

# Shallow (depth-3) regression tree fitted on all labelled rows.
# random_state is pinned because scikit-learn permutes the features at every
# split: when several splits are equally good, an unseeded tree may pick a
# different one on each run, which would change the fitted (and drawn) tree.
clf = tree.DecisionTreeRegressor(max_depth=3, random_state=42)
clf = clf.fit(X, y)

## First approach to feature importance

In [None]:
import graphviz

# Render the fitted tree as a Graphviz diagram.
# - feature_names is passed as a plain list of the training column names.
# - class_names is deliberately not passed: it only applies to classifiers,
#   and y.columns holds the target column name(s), not class labels.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=list(X.columns),
    filled=True,
    rounded=True,
    special_characters=True,
)

graph = graphviz.Source(dot_data)
graph  # last expression of the cell -> rendered inline by the notebook

In [None]:

# Depth sweep for a single decision tree.
# For every depth ONE model is fitted on the training split and that same model
# is scored on both the training and the held-out split, so the two curves are
# directly comparable and no held-out row is ever seen during fitting.
max_depths = range(1, 15)

# NOTE: the list names are kept from the original cell, but both hold
# explained-variance scores (higher is better), not errors.
training_error = []
testing_error = []
for depth in max_depths:
    # Seeded so that ties between equally good splits resolve the same way on every run.
    depth_model = tree.DecisionTreeRegressor(max_depth=depth, random_state=42)
    depth_model.fit(X_train, y_train)
    training_error.append(round(explained_variance_score(y_train, depth_model.predict(X_train)), 5))
    testing_error.append(round(explained_variance_score(y_test, depth_model.predict(X_test)), 5))

fig, ax = plt.subplots()
ax.plot(max_depths, training_error, color='blue', label='Training score')
ax.plot(max_depths, testing_error, color='green', label='Testing score')
ax.set_xlabel('Tree depth')
ax.set_ylabel('Variance explained')
ax.set_title('Hyperparameter Tuning', pad=15, size=15)
ax.legend();  # trailing ';' hides the Legend repr in the cell output

# Gradient Boosting

In [None]:
# Gradient-boosted regression trees; no hyperparameter is set here other than
# the seed, which is fixed for repeatable fits.
reg = GradientBoostingRegressor(
    random_state=0,
)

In [None]:
# y_train is a DataFrame (presumably a single label column - confirm against
# labels_train_base.csv). Flatten it to a 1-D array so the regressor does not
# emit a DataConversionWarning about a column-vector target.
reg.fit(X_train, y_train.values.ravel())

In [None]:
# Predict the held-out split with the boosted model (this overwrites the
# previous contents of y_pred).
y_pred = reg.predict(X=X_test)

In [None]:
# Explained variance of the gradient-boosting predictions on the held-out split.
explained_variance_score(y_true=y_test, y_pred=y_pred)

# Random forest with randomized search

In [None]:

# Candidate values for each random-forest hyperparameter explored by the
# randomized search.

# Number of trees in the forest: 6 evenly spaced values between 2 and 15
n_estimators = [int(x) for x in np.linspace(start=2, stop=15, num=6)]

# Number of features to consider at every split.
# 1.0 (= all features) replaces the former 'auto', which meant the same thing
# for regressors but was deprecated in scikit-learn 1.1 and removed in 1.3.
max_features = [1.0, 'sqrt']

# Maximum number of levels in tree: 7 evenly spaced depths between 2 and 20,
# plus None for an unlimited depth
max_depth = [int(x) for x in np.linspace(2, 20, num=7)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 6]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid.
# 'max_features' was defined above but previously left out of the grid, so it
# was never actually searched; it is included now.
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

In [None]:
# Base model to tune. The forest itself is seeded so every candidate fit is
# repeatable; the search's own random_state only controls which parameter
# combinations are sampled from the grid.
rf = RandomForestRegressor(random_state=42)

# Randomized search: 3 sampled parameter settings, each evaluated with 3-fold
# cross-validation, using all available cores.
# Candidates are ranked by explained variance - the metric used for the
# baseline cross-validation and for every hold-out evaluation in this notebook -
# instead of the estimator's default score.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=3,
    cv=3,
    scoring='explained_variance',
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Fit the random search model on the training split.
# y_train is a DataFrame (presumably a single label column - confirm); it is
# flattened to 1-D so the forest does not raise a DataConversionWarning on
# every cross-validation fit.
rf_random.fit(X_train, y_train.values.ravel())

In [None]:
# Hyperparameter combination that scored best during the randomized search.
best_params = rf_random.best_params_
print(best_params)

In [None]:
# Predict the held-out split through the fitted search object and report the
# explained variance of those predictions.
y_pred = rf_random.predict(X=X_test)
explained_variance_score(y_true=y_test, y_pred=y_pred)