In [25]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Import tools needed for visualization
import matplotlib.pyplot as plt
import pydot
from sklearn.tree import export_graphviz
%matplotlib inline

In [5]:
# Load the temperature records from temps.csv and display the resulting frame.
df = pd.read_csv('temps.csv')
df

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,actual,friend
0,2019,1,1,Fri,45,45,45.6,45,29
1,2019,1,2,Sat,44,45,45.7,44,61
2,2019,1,3,Sun,45,44,45.8,41,56
3,2019,1,4,Mon,44,41,45.9,40,53
4,2019,1,5,Tues,41,40,46.0,44,41
...,...,...,...,...,...,...,...,...,...
343,2019,12,27,Tues,42,42,45.2,47,47
344,2019,12,28,Wed,42,47,45.3,48,58
345,2019,12,29,Thurs,47,48,45.3,48,65
346,2019,12,30,Fri,48,48,45.4,57,42


In [6]:
# Count the missing values in each column.
df.isna().sum()

year       0
month      0
day        0
week       0
temp_2     0
temp_1     0
average    0
actual     0
friend     0
dtype: int64

In [7]:
# Inspect each column's dtype (the output shows `week` is the only object column).
df.dtypes

year         int64
month        int64
day          int64
week        object
temp_2       int64
temp_1       int64
average    float64
actual       int64
friend       int64
dtype: object

In [8]:
# (rows, columns) of the frame before one-hot encoding.
df.shape

(348, 9)

In [9]:
# Expand the categorical columns into one indicator column per category;
# numeric columns pass through unchanged.
df = pd.get_dummies(data=df)
df.head()

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,friend,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed
0,2019,1,1,45,45,45.6,45,29,1,0,0,0,0,0,0
1,2019,1,2,44,45,45.7,44,61,0,0,1,0,0,0,0
2,2019,1,3,45,44,45.8,41,56,0,0,0,1,0,0,0
3,2019,1,4,44,41,45.9,40,53,0,1,0,0,0,0,0
4,2019,1,5,41,40,46.0,44,41,0,0,0,0,0,1,0


In [10]:
# (rows, columns) of the frame after one-hot encoding.
df.shape

(348, 15)

## Features and Labels

In [11]:
# Label: the temperature the model is trained to predict.
y = df['actual']
# Features: every remaining column of the encoded frame.
x = df.drop(columns=['actual'])

In [13]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split,
# and therefore every metric computed from it, reproducible across kernel
# restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [15]:
# Report the dimensions of each piece of the train/test split.
for label, part in (
    ('x_train Shape:', x_train),
    ('y_train Shape:', y_train),
    ('x_test Features Shape:', x_test),
    ('y_test Labels Shape:', y_test),
):
    print(label, part.shape)

x_train Shape: (278, 14)
y_train Shape: (278,)
x_test Features Shape: (70, 14)
y_test Labels Shape: (70,)


## Training the Forest

In [18]:
# Instantiate the model: a forest of 1000 trees. The fixed random_state keeps
# the fitted forest repeatable for a given training set.
rf = RandomForestRegressor(n_estimators=1000, random_state=42)

# Train the model on training data
rf.fit(x_train, y_train)

## Make Predictions on Test Data

In [27]:
# Predict the held-out rows with the fitted forest
predictions = rf.predict(x_test)

# Absolute difference between each prediction and its true value
errors = (predictions - y_test).abs()

# Report the mean absolute error (MAE), rounded to two decimals
print('Mean Absolute Error:', round(errors.mean(), 2), 'degrees.')

Mean Absolute Error: 3.87 degrees.


In [28]:
# Absolute percentage error of each test row; the mean of these values is the
# mean absolute percentage error (MAPE).
mape = (errors / y_test) * 100

# "Accuracy" here is defined as 100 minus the MAPE; display it rounded to
# two decimals.
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 93.83 %.


## Visualizing a Single Decision Tree

In [32]:
# Column names, in training order, used to label the split nodes
feature_list = x.columns.tolist()

# Take a single fitted tree (index 5) out of the ensemble
tree = rf.estimators_[5]

# Dump that tree in Graphviz dot format
export_graphviz(
    tree,
    out_file='tree.dot',
    feature_names=feature_list,
    rounded=True,
    precision=1,
)

# Parse the dot file; exactly one graph is expected, so unpack it directly
[graph] = pydot.graph_from_dot_file('tree.dot')

# Render the graph to a PNG image (the trailing ';' hides the call's return value)
graph.write_png('tree.png');

In [34]:
# Report how deep the extracted tree grew.
print(f'The depth of this tree is: {tree.tree_.max_depth}')

The depth of this tree is: 16
