In [2]:
import pandas as pd
import numpy as np

DATA_PATH = 'temps.xlsx'

df = pd.read_excel(DATA_PATH)
df

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,actual
0,2016,1,1,Fri,45,45,45.6,45
1,2016,1,2,Sat,44,45,45.7,44
2,2016,1,3,Sun,45,44,45.8,41
3,2016,1,4,Mon,44,41,45.9,40
4,2016,1,5,Tues,41,40,46.0,44
...,...,...,...,...,...,...,...,...
343,2016,12,27,Tues,42,42,45.2,47
344,2016,12,28,Wed,42,47,45.3,48
345,2016,12,29,Thurs,47,48,45.3,48
346,2016,12,30,Fri,48,48,45.4,57


In [3]:
df.describe()

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual
count,348.0,348.0,348.0,348.0,348.0,348.0,348.0
mean,2016.0,6.477011,15.514368,62.652299,62.701149,59.760632,62.543103
std,0.0,3.49838,8.772982,12.165398,12.120542,10.527306,11.794146
min,2016.0,1.0,1.0,35.0,35.0,45.1,35.0
25%,2016.0,3.0,8.0,54.0,54.0,49.975,54.0
50%,2016.0,6.0,15.0,62.5,62.5,58.2,62.5
75%,2016.0,10.0,23.0,71.0,71.0,69.025,71.0
max,2016.0,12.0,31.0,117.0,117.0,77.4,92.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348 entries, 0 to 347
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   year     348 non-null    int64  
 1   month    348 non-null    int64  
 2   day      348 non-null    int64  
 3   week     348 non-null    object 
 4   temp_2   348 non-null    int64  
 5   temp_1   348 non-null    int64  
 6   average  348 non-null    float64
 7   actual   348 non-null    int64  
dtypes: float64(1), int64(6), object(1)
memory usage: 21.9+ KB


In [5]:
df = pd.get_dummies(df)
df.head()

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed
0,2016,1,1,45,45,45.6,45,True,False,False,False,False,False,False
1,2016,1,2,44,45,45.7,44,False,False,True,False,False,False,False
2,2016,1,3,45,44,45.8,41,False,False,False,True,False,False,False
3,2016,1,4,44,41,45.9,40,False,True,False,False,False,False,False
4,2016,1,5,41,40,46.0,44,False,False,False,False,False,True,False


In [6]:
X = df.drop(columns=['actual'], axis=1)
y = df['actual']

feature_list = list(df.columns)

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [8]:
baseline_preds = X_test['average']

baseline_error = abs(baseline_preds - y_test)
print(f'Baseline error average: {round(np.mean(baseline_error), 2)}')

Baseline error average: 5.06


In [9]:
baseline_preds = X_test.iloc[:, feature_list.index('average')]

baseline_error = abs(baseline_preds - y_test)
print(f'Baseline error average: {round(baseline_error.mean(), 2)}')

Baseline error average: 5.06


In [10]:
from sklearn import metrics
from sklearn.datasets import make_moons
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

In [11]:
rfr = RandomForestRegressor(n_estimators=100, random_state=42)
rfr.fit(X_train, y_train)

rfr_pred   = rfr.predict(X_test)
rfr_error  = abs(rfr_pred - y_test)
print(f'R² -> {(rfr.score(X, y)).round(2)}\n\
Mean Absolute Error -> {(metrics.mean_absolute_error(y_test, rfr_pred)).round(2)}\n\
Mean Squared  Error -> {(metrics.mean_squared_error(y_test, rfr_pred)).round(2)}')

R² -> 0.93
Mean Absolute Error -> 3.95
Mean Squared  Error -> 27.7


In [12]:
ada = AdaBoostRegressor(n_estimators=100, random_state=42)
ada.fit(X_train, y_train)

ada_pred   = ada.predict(X_test)
ada_error  = abs(ada_pred - y_test)
print(f'R² -> {(ada.score(X, y)).round(2)}\n\
Mean Absolute Error -> {(metrics.mean_absolute_error(y_test, ada_pred)).round(2)}\n\
Mean Squared  Error -> {(metrics.mean_squared_error(y_test, ada_pred)).round(2)}')

R² -> 0.88
Mean Absolute Error -> 3.65
Mean Squared  Error -> 23.25


In [13]:
gbr = GradientBoostingRegressor(n_estimators=100, random_state=42)
gbr.fit(X_train, y_train)

gbr_pred   = gbr.predict(X_test)
gbr_error  = abs(gbr_pred - y_test)
print(f'R² -> {(gbr.score(X, y)).round(2)}\n\
Mean Absolute Error -> {(metrics.mean_absolute_error(y_test, gbr_pred)).round(2)}\n\
Mean Squared  Error -> {(metrics.mean_squared_error(y_test, gbr_pred)).round(2)}')

R² -> 0.92
Mean Absolute Error -> 4.08
Mean Squared  Error -> 28.5


In [14]:
rfr_tree = RandomForestRegressor(max_depth=3)
rfr_tree.fit(X_train, y_train)

tree = rfr_tree.estimators_[5]
tree

In [15]:
from sklearn.tree import export_graphviz
import pydot

export_graphviz(tree, out_file='tree.dot', feature_names=X.columns, rounded=True, precision=1)
(graph, ) = pydot.graph_from_dot_file('tree.dot')
graph.write_png('tree.png')

In [16]:
importances = list(rfr_tree.feature_importances_)
feature_importance = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importance = sorted(feature_importance, key=lambda x: x[1], reverse=True)
[print(f'Feature: {feature:20} Importante: {importance}') for feature, importance in feature_importance]

Feature: temp_1               Importante: 0.78
Feature: average              Importante: 0.2
Feature: month                Importante: 0.01
Feature: temp_2               Importante: 0.01
Feature: year                 Importante: 0.0
Feature: day                  Importante: 0.0
Feature: actual               Importante: 0.0
Feature: week_Fri             Importante: 0.0
Feature: week_Mon             Importante: 0.0
Feature: week_Sat             Importante: 0.0
Feature: week_Sun             Importante: 0.0
Feature: week_Thurs           Importante: 0.0
Feature: week_Tues            Importante: 0.0


[None, None, None, None, None, None, None, None, None, None, None, None, None]

In [17]:
importances = list(ada.feature_importances_)
feature_importance = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importance = sorted(feature_importance, key=lambda x: x[1], reverse=True)
[print(f'Feature: {feature:20} Importante: {importance}') for feature, importance in feature_importance]

Feature: temp_1               Importante: 0.47
Feature: average              Importante: 0.28
Feature: temp_2               Importante: 0.1
Feature: month                Importante: 0.06
Feature: day                  Importante: 0.04
Feature: week_Fri             Importante: 0.04
Feature: actual               Importante: 0.01
Feature: week_Sat             Importante: 0.01
Feature: year                 Importante: 0.0
Feature: week_Mon             Importante: 0.0
Feature: week_Sun             Importante: 0.0
Feature: week_Thurs           Importante: 0.0
Feature: week_Tues            Importante: 0.0


[None, None, None, None, None, None, None, None, None, None, None, None, None]

In [18]:
importances = list(gbr.feature_importances_)
feature_importance = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
feature_importance = sorted(feature_importance, key=lambda x: x[1], reverse=True)
[print(f'Feature: {feature:20} Importante: {importance}') for feature, importance in feature_importance]

Feature: temp_1               Importante: 0.63
Feature: average              Importante: 0.3
Feature: day                  Importante: 0.02
Feature: month                Importante: 0.01
Feature: temp_2               Importante: 0.01
Feature: actual               Importante: 0.01
Feature: year                 Importante: 0.0
Feature: week_Fri             Importante: 0.0
Feature: week_Mon             Importante: 0.0
Feature: week_Sat             Importante: 0.0
Feature: week_Sun             Importante: 0.0
Feature: week_Thurs           Importante: 0.0
Feature: week_Tues            Importante: 0.0


[None, None, None, None, None, None, None, None, None, None, None, None, None]

In [21]:
rfr.feature_importances_

array([0.        , 0.01436043, 0.03019373, 0.02529489, 0.729448  ,
       0.17209342, 0.00532598, 0.00475469, 0.00566643, 0.00378077,
       0.00215156, 0.00326134, 0.00366876])