In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
%matplotlib inline


Pick a dataset. It could be one you've worked with before or it could be a new one. Then build the best decision tree you can.

Now try to match that with the simplest random forest you can. For our purposes measure simplicity with runtime. Compare that to the runtime of the decision tree. This is imperfect but just go with it.

Runtime: https://stackoverflow.com/questions/1557571/how-do-i-get-time-of-a-python-programs-execution

In [2]:
zomato = pd.read_csv('../datasets/zomato/zomato.csv', encoding='ISO-8859-1')
zomato.columns = zomato.columns.str.replace('\s+', '_')
print(zomato.shape)
countries = {
    1 : 'India',
    14 : 'Australia',
    30 : 'Brazil',
    37 : 'Canada',
    94 : 'Indonesia',
    148 : 'New Zealand',
    162 : 'Philippines',
    166 : 'Qatar',
    184 : 'Singapore',
    189 : 'South Africa',
    191 : 'Sri Lanka',
    208 : 'Turkey',
    214 : 'UAE',
    215 : 'United Kingdom',
    216 : 'United States'
}

zomato['Country'] = [countries[code] for code in zomato['Country_Code']]
print(zomato.columns)
print(zomato.Currency.unique())

(9551, 21)
Index(['Restaurant_ID', 'Restaurant_Name', 'Country_Code', 'City', 'Address',
       'Locality', 'Locality_Verbose', 'Longitude', 'Latitude', 'Cuisines',
       'Average_Cost_for_two', 'Currency', 'Has_Table_booking',
       'Has_Online_delivery', 'Is_delivering_now', 'Switch_to_order_menu',
       'Price_range', 'Aggregate_rating', 'Rating_color', 'Rating_text',
       'Votes', 'Country'],
      dtype='object')
['Botswana Pula(P)' 'Brazilian Real(R$)' 'Dollar($)' 'Emirati Diram(AED)'
 'Indian Rupees(Rs.)' 'Indonesian Rupiah(IDR)' 'NewZealand($)'
 'Pounds(\x8cÂ£)' 'Qatari Rial(QR)' 'Rand(R)' 'Sri Lankan Rupee(LKR)'
 'Turkish Lira(TL)']


In [7]:
print(zomato.Rating_text.unique())

['Excellent' 'Very Good' 'Good' 'Average' 'Not rated' 'Poor']


In [5]:
'''#currency translations
1 pula = 0.096 usd
1 real = .27 usd
1 dollar = 1 dollar
1 emirati diram = .27 usd
1 indian rupee = .015 usd
1 rupiah = .000072 usd
1 NZ dollar = .69 usd
1 pound = 1.32 usd
1 rial = .27 usd
1 rand = .073 usd
1 sri lankan rupee = .0063 usd
1 lira = .21 usd
'''
currtrans = dict(zip(zomato.Currency.unique(), [.096, .27, 1, .27, .015, .000072, .69, 1.32, .27, .073, .0063, .21]))

scaled_costs = []
i = 0
while i < len(zomato.index):
    scaled_costs.append(zomato.iloc[i]['Average_Cost_for_two']*currtrans[zomato.iloc[i]['Currency']])
    i+=1
zomato['Scaled_Cost'] = scaled_costs

In [8]:
print(zomato.columns)

Index(['Restaurant_ID', 'Restaurant_Name', 'Country_Code', 'City', 'Address',
       'Locality', 'Locality_Verbose', 'Longitude', 'Latitude', 'Cuisines',
       'Average_Cost_for_two', 'Currency', 'Has_Table_booking',
       'Has_Online_delivery', 'Is_delivering_now', 'Switch_to_order_menu',
       'Price_range', 'Aggregate_rating', 'Rating_color', 'Rating_text',
       'Votes', 'Country', 'Scaled_Cost'],
      dtype='object')


In [43]:
#now time to make a decision tree
from sklearn import tree
zomato_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=1,
    max_depth=20
)

predicted_correlations = ['Country', 'City', 'Locality', 'Cuisines', 'Scaled_Cost', 'Has_Table_booking', 'Has_Online_delivery',
            'Switch_to_order_menu', 'Price_range']

X = pd.get_dummies(zomato)
Y = zomato.Rating_text

zomato_tree.fit(X, Y)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=20,
            max_features=1, max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [44]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

zomato_forest = ensemble.RandomForestClassifier()
zomato_forest.fit(X, Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [45]:
import time

In [49]:
print('Decision tree:')
start_time = time.time()
print(cross_val_score(zomato_tree, X, Y, cv=10))
print('Runtime:')
print("--- %s seconds ---" % (time.time() - start_time))

print('Random forest:')
start_time = time.time()
print(cross_val_score(zomato_forest, X, Y, cv=8))
print('Runtime:')
print("--- %s seconds ---" % (time.time() - start_time))


Decision tree:
[0.38871473 0.39016736 0.39121339 0.39644351 0.39121339 0.39121339
 0.3895288  0.39098532 0.39139559 0.41071429]
Runtime:
--- 16.481951236724854 seconds ---
Random forest:
[0.82873851 0.99832776 1.         1.         1.         0.99748322
 1.         0.98236776]
Runtime:
--- 24.478501081466675 seconds ---


Going with the simplistic runtime comparison, it's taken the random forest model twice as long as the decision tree to run, but also with more than twice the accuracy. I'm still worried about overfitting though since the cross value scores are so high on the random forest. I think a score of 1 should be impossible, but I've seen it occur multiple times. But, the power of the random forest results in a much higher accuracy regardless.