In [1]:
import pandas as pd

DATA_PATH = 'datasets/temps.xlsx'

df = pd.read_excel(DATA_PATH)
df.head(3)

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,actual
0,2016,1,1,Fri,45,45,45.6,45
1,2016,1,2,Sat,44,45,45.7,44
2,2016,1,3,Sun,45,44,45.8,41


In [3]:
df = pd.get_dummies(df)

In [5]:
X = df.drop(columns=['actual'], axis=1)
y = df['actual']

features = X.columns

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [12]:
from sklearn import metrics
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

In [14]:
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)

gbr_pred = gbr.predict(X_test)
print(f'R² -> {(gbr.score(X_test, y_test)).round(2)}\n\
Mean Absolute Error -> {(metrics.mean_absolute_error(y_test, gbr_pred)).round(2)}\n\
Mean Squared  Error -> {(metrics.mean_squared_error(y_test, gbr_pred)).round(2)}')

R² -> 0.79
Mean Absolute Error -> 4.08
Mean Squared  Error -> 28.53


In [17]:
from tables.file import parameters
from sklearn.model_selection import GridSearchCV

parameters  = {'learning_rate': [0.03],
               'subsample'    : [0.2],
               'n_estimators' : [100, 500, 1000, 1500],
               'max_depth'    : [8]
              }
grid_search = GridSearchCV(gbr, parameters, scoring='r2', cv=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [18]:
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid_search.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_search.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_search.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 GradientBoostingRegressor(learning_rate=0.03, max_depth=8, subsample=0.2)

 The best score across ALL searched params:
 0.819905996135966

 The best parameters across ALL searched params:
 {'learning_rate': 0.03, 'max_depth': 8, 'n_estimators': 100, 'subsample': 0.2}


In [19]:
best_model = grid_search.best_estimator_
best_model.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.03,
 'loss': 'squared_error',
 'max_depth': 8,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 0.2,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [21]:
gbr_tunned = GradientBoostingRegressor(alpha=0.9,
ccp_alpha=0.0,
criterion='friedman_mse',
init=None,
learning_rate=0.03,
loss='squared_error',
max_depth=8,
max_features=None,
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators=100,
n_iter_no_change=None,
random_state=None,
subsample=0.2,
tol=0.0001,
validation_fraction=0.1,
verbose=0,
warm_start= False)

gbr_tunned.fit(X_train, y_train)

In [22]:
gbr_tunned_pred = gbr_tunned.predict(X_test)
print(f'R² -> {(gbr_tunned.score(X_test, y_test)).round(2)}\n\
Mean Absolute Error -> {(metrics.mean_absolute_error(y_test, gbr_tunned_pred)).round(2)}\n\
Mean Squared  Error -> {(metrics.mean_squared_error(y_test, gbr_tunned_pred)).round(2)}')

R² -> 0.82
Mean Absolute Error -> 3.8
Mean Squared  Error -> 25.27
