In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
features = pd.read_excel('/content/temps.xlsx')
features.head()

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,actual
0,2016,1,1,Fri,45,45,45.6,45
1,2016,1,2,Sat,44,45,45.7,44
2,2016,1,3,Sun,45,44,45.8,41
3,2016,1,4,Mon,44,41,45.9,40
4,2016,1,5,Tues,41,40,46.0,44


In [3]:
features.describe()

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual
count,348.0,348.0,348.0,348.0,348.0,348.0,348.0
mean,2016.0,6.477011,15.514368,62.652299,62.701149,59.760632,62.543103
std,0.0,3.49838,8.772982,12.165398,12.120542,10.527306,11.794146
min,2016.0,1.0,1.0,35.0,35.0,45.1,35.0
25%,2016.0,3.0,8.0,54.0,54.0,49.975,54.0
50%,2016.0,6.0,15.0,62.5,62.5,58.2,62.5
75%,2016.0,10.0,23.0,71.0,71.0,69.025,71.0
max,2016.0,12.0,31.0,117.0,117.0,77.4,92.0


In [4]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 348 entries, 0 to 347
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   year     348 non-null    int64  
 1   month    348 non-null    int64  
 2   day      348 non-null    int64  
 3   week     348 non-null    object 
 4   temp_2   348 non-null    int64  
 5   temp_1   348 non-null    int64  
 6   average  348 non-null    float64
 7   actual   348 non-null    int64  
dtypes: float64(1), int64(6), object(1)
memory usage: 21.9+ KB


In [5]:
features = pd.get_dummies(features)
features.head()

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed
0,2016,1,1,45,45,45.6,45,1,0,0,0,0,0,0
1,2016,1,2,44,45,45.7,44,0,0,1,0,0,0,0
2,2016,1,3,45,44,45.8,41,0,0,0,1,0,0,0
3,2016,1,4,44,41,45.9,40,0,1,0,0,0,0,0
4,2016,1,5,41,40,46.0,44,0,0,0,0,0,1,0


In [6]:
labels = np.array(features['actual'])

features = features.drop('actual', axis = 1)

feature_list = list(features.columns)

features = np.array(features)

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

In [9]:
from pandas.core import base
baseline_preds = test_features[:, feature_list.index('average')]

baseline_error = abs(baseline_preds - test_labels)
print('Baseline Error Average:', round(np.mean(baseline_error), 2))

Baseline Error Average: 5.06


In [10]:
from sklearn.datasets import make_moons
from sklearn import metrics
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor


In [11]:
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf.fit(train_features, train_labels)

RandomForestRegressor(n_estimators=1000, random_state=42)

In [12]:
prediction_rf = rf.predict(test_features)

error_rf = abs(prediction_rf - test_labels)

r_sq = rf.score(features,labels)
print('R^2: ', r_sq)
print('MAE: ', metrics.mean_absolute_error(test_labels, prediction_rf))
print('MSE: ', metrics.mean_squared_error(test_labels, prediction_rf))

R^2:  0.932094797587982
MAE:  3.932057471264368
MSE:  26.68358100000001


In [16]:
ada = AdaBoostRegressor(n_estimators = 100)
ada.fit(train_features, train_labels)

ada_pred = ada.predict(test_features)

error_ada = abs(ada_pred - test_labels)

r_sq = ada.score(features,labels)
print('R^2: ', r_sq)
print('MAE: ', metrics.mean_absolute_error(test_labels, ada_pred))
print('MSE: ', metrics.mean_squared_error(test_labels, ada_pred))

R^2:  0.876034382300654
MAE:  3.797524345255376
MSE:  24.281903032234425


In [17]:
gbr = GradientBoostingRegressor(n_estimators = 100)
gbr.fit(train_features, train_labels)

gbr_pred = gbr.predict(test_features)

In [18]:
error_gbr = abs(gbr_pred - test_labels)

r_sq = gbr.score(features,labels)
print('R^2: ', r_sq)
print('MAE: ', metrics.mean_absolute_error(test_labels, gbr_pred))
print('MSE: ', metrics.mean_squared_error(test_labels, gbr_pred))

R^2:  0.9175253199780987
MAE:  4.084301837234518
MSE:  28.546187141039944


In [21]:
rf = RandomForestRegressor(max_depth = 3)
rf.fit(train_features, train_labels)

tree = rf.estimators_[5]

tree

DecisionTreeRegressor(max_depth=3, max_features='auto', random_state=189980637)

In [22]:
from sklearn.tree import export_graphviz
import pydot

In [27]:
export_graphviz(tree, out_file = "tree.dot", feature_names = feature_list, rounded = True, precision = 1)
(graph, )= pydot.graph_from_dot_file("tree.dot")
graph.write_png("tree.png")

In [28]:
importances = list(rf.feature_importances_)

feature_importance = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

feature_importance = sorted(feature_importance, key = lambda x: x[1], reverse = True)

In [32]:
[print("Feature: {:20} Importance: {}".format(*pair)) for pair in feature_importance];

Feature: temp_1               Importance: 0.77
Feature: average              Importance: 0.21
Feature: temp_2               Importance: 0.01
Feature: year                 Importance: 0.0
Feature: month                Importance: 0.0
Feature: day                  Importance: 0.0
Feature: week_Fri             Importance: 0.0
Feature: week_Mon             Importance: 0.0
Feature: week_Sat             Importance: 0.0
Feature: week_Sun             Importance: 0.0
Feature: week_Thurs           Importance: 0.0
Feature: week_Tues            Importance: 0.0
Feature: week_Wed             Importance: 0.0


In [33]:
importances = list(ada.feature_importances_)

feature_importance = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

feature_importance = sorted(feature_importance, key = lambda x: x[1], reverse = True)

[print("Feature: {:20} Importance: {}".format(*pair)) for pair in feature_importance];

Feature: temp_1               Importance: 0.51
Feature: average              Importance: 0.25
Feature: temp_2               Importance: 0.08
Feature: month                Importance: 0.07
Feature: week_Mon             Importance: 0.04
Feature: day                  Importance: 0.03
Feature: week_Fri             Importance: 0.01
Feature: week_Sun             Importance: 0.01
Feature: year                 Importance: 0.0
Feature: week_Sat             Importance: 0.0
Feature: week_Thurs           Importance: 0.0
Feature: week_Tues            Importance: 0.0
Feature: week_Wed             Importance: 0.0


In [35]:
importances = list(gbr.feature_importances_)

feature_importance = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

feature_importance = sorted(feature_importance, key = lambda x: x[1], reverse = True)

[print("Feature: {:20} Importance: {}".format(*pair)) for pair in feature_importance];

Feature: temp_1               Importance: 0.63
Feature: average              Importance: 0.3
Feature: day                  Importance: 0.02
Feature: month                Importance: 0.01
Feature: temp_2               Importance: 0.01
Feature: week_Fri             Importance: 0.01
Feature: year                 Importance: 0.0
Feature: week_Mon             Importance: 0.0
Feature: week_Sat             Importance: 0.0
Feature: week_Sun             Importance: 0.0
Feature: week_Thurs           Importance: 0.0
Feature: week_Tues            Importance: 0.0
Feature: week_Wed             Importance: 0.0
