In [3]:
import pandas as pd
import numpy as np

In [4]:
features = pd.read_excel('temps.xlsx')
features.head()

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,actual
0,2016,1,1,Fri,45,45,45.6,45
1,2016,1,2,Sat,44,45,45.7,44
2,2016,1,3,Sun,45,44,45.8,41
3,2016,1,4,Mon,44,41,45.9,40
4,2016,1,5,Tues,41,40,46.0,44


In [6]:
features = pd.get_dummies(features)
features.head()

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed
0,2016,1,1,45,45,45.6,45,True,False,False,False,False,False,False
1,2016,1,2,44,45,45.7,44,False,False,True,False,False,False,False
2,2016,1,3,45,44,45.8,41,False,False,False,True,False,False,False
3,2016,1,4,44,41,45.9,40,False,True,False,False,False,False,False
4,2016,1,5,41,40,46.0,44,False,False,False,False,False,True,False


In [7]:
labels = np.array(features['actual'])

features = features.drop('actual', axis=1)
features_list = list(features.columns)

features = np.array(features)

In [8]:
from sklearn.model_selection import train_test_split

In [11]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.25, random_state=42)

In [15]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

In [14]:
gbr = GradientBoostingRegressor()
gbr.fit(train_features, train_labels)
gbr_pred = gbr.predict(test_features)

In [18]:
r_sq = gbr.score(test_features, test_labels)
print('R²:', r_sq)

print('mae', mean_absolute_error(test_labels, gbr_pred))


R²: 0.7951474192559388
mae 4.073633612352935


In [25]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'learning_rate': [0.02, 0.03, 0.04],
    'subsample': [0.1, 0.2, 0.3],
    'n_estimators': [100, 500, 100, 1500],
    'max_depth': [7, 8, 9]
}

grid_search = GridSearchCV(gbr, parameters, scoring='r2', cv=2, n_jobs=1)

In [26]:
grid_search.fit(train_features, train_labels)

In [27]:
print('Resultados do GridSearch')
print('Melhores estimadores: ', grid_search.best_estimator_)
print('Melhores parametros: ', grid_search.best_params_)

Resultados do GridSearch
Melhores estimadores:  GradientBoostingRegressor(learning_rate=0.04, max_depth=8, subsample=0.2)
Melhores parametros:  {'learning_rate': 0.04, 'max_depth': 8, 'n_estimators': 100, 'subsample': 0.2}


In [28]:
best_model = grid_search.best_estimator_

In [30]:
best_model.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.04,
 'loss': 'squared_error',
 'max_depth': 8,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 0.2,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [33]:
gbr_tunned = GradientBoostingRegressor(
    alpha = 0.9,
    ccp_alpha = 0.0,
    criterion= 'friedman_mse',
    init= None,
    learning_rate= 0.04,
    loss= 'squared_error',
    max_depth= 8,
    max_features= None,
    max_leaf_nodes= None,
    min_impurity_decrease= 0.0,
    min_samples_leaf= 1,
    min_samples_split= 2,
    min_weight_fraction_leaf= 0.0,
    n_estimators= 100,
    n_iter_no_change= None,
    random_state= None,
    subsample= 0.2,
    tol= 0.0001,
    validation_fraction= 0.1,
    verbose= 0,
    warm_start= False
    )

In [35]:
gbr_tunned.fit(train_features, train_labels)
gbr_tunned_pred = gbr_tunned.predict(test_features)

In [37]:
r_sq = gbr_tunned.score(test_features, test_labels)

print('R² :', r_sq)
print('mae: ', mean_absolute_error(test_labels, gbr_tunned_pred))
print('mse: ', mean_squared_error(test_labels, gbr_tunned_pred))


R² : 0.8001330056385156
mae:  3.9580666574546415
mse:  27.79526974089453
