In [2]:
%matplotlib inline

In [98]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor

# Tree and Ensemble Methods
## Regression: Live Demos


In [4]:
# Load the fixed-width Auto MPG file: it has no header row, and "?" marks a missing value.
mpg_data = pd.read_fwf(
    "auto-mpg.data",
    header=None,
    na_values=["?"],
)

In [5]:
# The file was read with header=None, so assign the nine column names explicitly.
mpg_data.columns = [
    "mpg",
    "cylinders",
    "displacement",
    "horsepower",
    "weight",
    "acceleration",
    "model year",
    "origin",
    "car name",
]

In [6]:
# Preview the first rows to check that the assigned column names line up with the values.
mpg_data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,"""chevrolet chevelle malibu"""
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,"""buick skylark 320"""
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,"""plymouth satellite"""
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,"""amc rebel sst"""
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,"""ford torino"""


In [7]:
# Inspect the inferred dtypes; every column except "car name" should be numeric
# because "?" was declared as a missing-value marker when the file was read.
mpg_data.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight          float64
acceleration    float64
model year        int64
origin            int64
car name         object
dtype: object

In [8]:
# List the distinct horsepower values; a NaN among them shows where "?" appeared in the file.
mpg_data["horsepower"].unique()

array([130., 165., 150., 140., 198., 220., 215., 225., 190., 170., 160.,
        95.,  97.,  85.,  88.,  46.,  87.,  90., 113., 200., 210., 193.,
        nan, 100., 105., 175., 153., 180., 110.,  72.,  86.,  70.,  76.,
        65.,  69.,  60.,  80.,  54., 208., 155., 112.,  92., 145., 137.,
       158., 167.,  94., 107., 230.,  49.,  75.,  91., 122.,  67.,  83.,
        78.,  52.,  61.,  93., 148., 129.,  96.,  71.,  98., 115.,  53.,
        81.,  79., 120., 152., 102., 108.,  68.,  58., 149.,  89.,  63.,
        48.,  66., 139., 103., 125., 133., 138., 135., 142.,  77.,  62.,
       132.,  84.,  64.,  74., 116.,  82.])

In [9]:
# (rows, columns) before rows with missing values are removed.
mpg_data.shape

(398, 9)

In [10]:
# Discard every row that contains at least one missing value.
mpg_data = mpg_data.dropna(axis="index", how="any")

In [11]:
# (rows, columns) after dropping incomplete rows; compare with the earlier shape
# to see how many rows were removed.
mpg_data.shape

(392, 9)

In [12]:
# Remove the free-text "car name" column so only numeric columns remain.
# errors="ignore" keeps this cell re-runnable: because the result is assigned back
# to mpg_data, a second execution would otherwise raise KeyError on the
# already-removed column.
mpg_data = mpg_data.drop(columns=["car name"], errors="ignore")

In [13]:
# Separate the prediction target (mpg) from the explanatory columns.
mpg_data_target = mpg_data["mpg"]
mpg_data_attributes = mpg_data.drop(columns=["mpg"])

In [14]:
# Rescale every attribute column to the [0, 1] range.
# NOTE(review): the scaler is fit on all rows, including rows that later land in
# a test split, so the per-column min/max it learns are not training-only
# statistics. Prefer fitting the scaler on training rows only.
mpg_data_attributes_scaled = MinMaxScaler().fit_transform(mpg_data_attributes)

In [18]:
# Split the *unscaled* attributes first, then learn the scaling from the training
# rows only. Fitting the scaler before the split would let the test rows'
# min/max leak into preprocessing.
attributes_train_raw, attributes_test_raw, target_train, target_test = train_test_split(
    mpg_data_attributes,
    mpg_data_target,
    test_size=0.2,
    random_state=42,  # fixed seed so the same rows land in each split on every run
)

# Fit on the training split, then apply the same transform to both splits.
# Test values may fall slightly outside [0, 1] when they exceed the training range.
attribute_scaler = MinMaxScaler().fit(attributes_train_raw)
attributes_train = attribute_scaler.transform(attributes_train_raw)
attributes_test = attribute_scaler.transform(attributes_test_raw)

In [94]:
# Depth-limited regression tree; min_samples_leaf=2 prevents single-sample leaves.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split, so ties between equally good splits can otherwise be resolved
# differently from run to run.
tree = DecisionTreeRegressor(max_depth=7, min_samples_leaf=2, random_state=42)

In [95]:
# Train the tree on the training split only.
tree.fit(attributes_train, target_train)

DecisionTreeRegressor(max_depth=7, min_samples_leaf=2)

In [96]:
# R^2 on the data the tree was trained on (an optimistic estimate of fit quality).
tree.score(attributes_train, target_train)

0.9582774943090832

In [97]:
# R^2 on the held-out split; compare with the training score to gauge overfitting.
tree.score(attributes_test, target_test)

0.8847103302964388

In [32]:
# Depth the fitted tree actually reached. A deep tree has many branches, and the
# gap between the training and test scores suggests it is overfitting; a
# shallower tree may generalize better.
tree.get_depth()

7

In [33]:
# Per-feature importance values of the fitted tree, one entry per attribute column.
tree.feature_importances_

array([0.00434421, 0.64383106, 0.17436089, 0.05706424, 0.01697957,
       0.09968802, 0.00373201])

In [34]:
# Pair each importance with its column name. This assumes the features the tree
# was trained on are in the same column order as mpg_data_attributes.
list(zip(mpg_data_attributes.columns, tree.feature_importances_))

[('cylinders', 0.004344209438807405),
 ('displacement', 0.6438310604105031),
 ('horsepower', 0.17436089185468442),
 ('weight', 0.05706424107290886),
 ('acceleration', 0.016979569813597353),
 ('model year', 0.09968801756368674),
 ('origin', 0.00373200984581214)]

## Forest bit
