# How do the accuracies of random forests and decision tree models compare?
1) Build a decision tree. <br>
2) Try to match decision tree with the simplest random forest you can. <br>
3) Measure simplicity with runtime. Compare with decision tree. <br>
<br>
[Daily Demand Forecasting Orders Dataset](https://archive.ics.uci.edu/ml/datasets/Daily+Demand+Forecasting+Orders) <br>
The database was collected during 60 days, this is a real database of a Brazilian logistics company. The dataset has 12 predictive attributes and a target that is the total of orders for daily treatment.

In [101]:
# Import modules.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import time
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import ensemble
from sklearn import tree
from IPython.display import Image

In [102]:
# Load data.
df = pd.read_csv('~/src/data/unit3/Daily_Demand_Forecasting_Orders.csv', sep=';')
print(df.shape)
df.columns

(60, 13)


Index(['Week of the month (first week second third fourth or fifth week',
       'Day of the week (Monday to Friday)', 'Non-urgent order',
       'Urgent order', 'Order type A', 'Order type B', 'Order type C',
       'Fiscal sector orders', 'Orders from the traffic controller sector',
       'Banking orders (1)', 'Banking orders (2)', 'Banking orders (3)',
       'Target (Total orders)'],
      dtype='object')

## Clean the data

In [103]:
newColNames = ['week_of_month', 'day_of_week', 'non_urgent_order',
               'urgent_order', 'order_type_a', 'order_type_b',
               'order_type_c', 'fiscal_sector_orders', 'traffic_orders',
               'banking_1', 'banking_2', 'banking_3', 'target_total_orders']

df.columns = newColNames
print(df.shape)
df.head()

(60, 13)


Unnamed: 0,week_of_month,day_of_week,non_urgent_order,urgent_order,order_type_a,order_type_b,order_type_c,fiscal_sector_orders,traffic_orders,banking_1,banking_2,banking_3,target_total_orders
0,1,4,316.307,223.27,61.543,175.586,302.448,0.0,65556,44914,188411,14793,539.577
1,1,5,128.633,96.042,38.058,56.037,130.58,0.0,40419,21399,89461,7679,224.675
2,1,6,43.651,84.375,21.826,25.125,82.461,1.386,11992,3452,21305,14947,129.412
3,2,2,171.297,127.667,41.542,113.294,162.284,18.156,49971,33703,69054,18423,317.12
4,2,3,90.532,113.526,37.679,56.618,116.22,6.459,48534,19646,16411,20257,210.517


In [104]:
#df.dtypes
df.describe()

Unnamed: 0,week_of_month,day_of_week,non_urgent_order,urgent_order,order_type_a,order_type_b,order_type_c,fiscal_sector_orders,traffic_orders,banking_1,banking_2,banking_3,target_total_orders
count,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0
mean,3.016667,4.033333,172.554933,118.92085,52.112217,109.22985,139.53125,77.396133,44504.35,46640.833333,79401.483333,23114.633333,300.873317
std,1.282102,1.401775,69.505788,27.170929,18.829911,50.741388,41.442932,186.50247,12197.905134,45220.736293,40504.420041,13148.039829,89.602041
min,1.0,2.0,43.651,77.371,21.826,25.125,74.372,0.0,11992.0,3452.0,16411.0,7679.0,129.412
25%,2.0,3.0,125.348,100.888,39.45625,74.91625,113.63225,1.24325,34994.25,20130.0,50680.5,12609.75,238.1955
50%,3.0,4.0,151.0625,113.1145,47.1665,99.482,127.99,7.8315,44312.0,32527.5,67181.0,18011.5,288.0345
75%,4.0,5.0,194.6065,132.10825,58.46375,132.171,160.1075,20.36075,52111.75,45118.75,94787.75,31047.75,334.23725
max,5.0,6.0,435.304,223.27,118.178,267.342,302.448,865.0,71772.0,210508.0,188411.0,73839.0,616.453


In [105]:
# For simplicity, set binary value for target_total_orders less/greater than 300.87
df['target_cat'] = df['target_total_orders'].apply(lambda x: 1 if x > 300.87 else 0)
df.head()

Unnamed: 0,week_of_month,day_of_week,non_urgent_order,urgent_order,order_type_a,order_type_b,order_type_c,fiscal_sector_orders,traffic_orders,banking_1,banking_2,banking_3,target_total_orders,target_cat
0,1,4,316.307,223.27,61.543,175.586,302.448,0.0,65556,44914,188411,14793,539.577,1
1,1,5,128.633,96.042,38.058,56.037,130.58,0.0,40419,21399,89461,7679,224.675,0
2,1,6,43.651,84.375,21.826,25.125,82.461,1.386,11992,3452,21305,14947,129.412,0
3,2,2,171.297,127.667,41.542,113.294,162.284,18.156,49971,33703,69054,18423,317.12,1
4,2,3,90.532,113.526,37.679,56.618,116.22,6.459,48534,19646,16411,20257,210.517,0


## Set our target and features

In [106]:
X = df.drop(['target_total_orders', 'target_cat'], axis=1)
X = pd.get_dummies(X)
Y = df['target_cat']

## Decision tree

In [107]:
# Packages for rendering our tree.
import pydotplus
#import graphviz

# Initialize and train our tree.
maxFeatures = 1 #number of features used per node.
maxDepth = 4 #number of decision levels below the root for our classification.
decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=maxFeatures,
    max_depth=maxDepth
)
decision_tree.fit(X, Y)
print(cross_val_score(decision_tree, X, Y, cv=10))

'''
# Render our tree.

dot_data = tree.export_graphviz(
    decision_tree, out_file=None,
    feature_names=customers.columns,
    class_names=['Not Returning', 'Returning'],
    filled=True
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
'''

# Run-time.
decision_tree_time_start = time.time()
print('Decision tree runtime: {}'.format(time.time() - decision_tree_time_start))

[0.85714286 0.85714286 0.85714286 0.85714286 0.66666667 0.66666667
 1.         0.6        1.         0.4       ]
Decision tree runtime: 0.0


# Standardize

In [108]:
from sklearn.preprocessing import StandardScaler
df = pd.DataFrame(StandardScaler().fit_transform(df))
df.columns = ['week_of_month', 'day_of_week', 'non_urgent_order',
               'urgent_order', 'order_type_a', 'order_type_b',
               'order_type_c', 'fiscal_sector_orders', 'traffic_orders',
               'banking_1', 'banking_2', 'banking_3', 'target_total_orders', 'target_cat']
df.head()

Unnamed: 0,week_of_month,day_of_week,non_urgent_order,urgent_order,order_type_a,order_type_b,order_type_c,fiscal_sector_orders,traffic_orders,banking_1,banking_2,banking_3,target_total_orders,target_cat
0,-1.586212,-0.02398,2.085656,3.87288,0.505067,1.318768,3.964285,-0.418489,1.740406,-0.038509,2.714011,-0.638259,2.686525,1.143544
1,-1.586212,0.695422,-0.63725,-0.84914,-0.752676,-1.05716,-0.217813,-0.418489,-0.337749,-0.562902,0.250452,-1.183895,-0.857585,-0.874475
2,-1.586212,1.414823,-1.870229,-1.282156,-1.621983,-1.671507,-1.388702,-0.410995,-2.687898,-0.963127,-1.446429,-0.626448,-1.929736,-0.874475
3,-0.799661,-1.462783,-0.018251,0.32461,-0.56609,0.080771,0.553647,-0.320318,0.451945,-0.288518,-0.257621,-0.359843,0.182851,1.143544
4,-0.799661,-0.743382,-1.190047,-0.200228,-0.772973,-1.045613,-0.567237,-0.383565,0.333144,-0.601995,-1.568275,-0.219177,-1.016928,-0.874475


## RFC

In [109]:
rfc = ensemble.RandomForestClassifier()
print(cross_val_score(rfc, X, Y, cv=10)) #cv= number of cross-val-scores

# Run-time.
rfc_time_start = time.time()
print('rfc runtime: {}'.format(time.time() - rfc_time_start))

[0.85714286 1.         0.85714286 1.         1.         0.66666667
 1.         0.8        1.         1.        ]
rfc runtime: 0.0
