# Decision Tree

In [36]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn import tree

## Data

In [20]:
df = pd.read_csv('Auto.csv')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [21]:
# Variable Types
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
year              int64
origin            int64
name             object
dtype: object

### Data Cleaning

In [22]:
# Cylinders should be Categorical
df['cylinders'] = df['cylinders'].astype(object)

In [27]:
# Change ? in horsepower should to 0
df['horsepower'].replace('?',0, inplace=True)

In [29]:
# Change horsepower to numeric
df['horsepower'] = df['horsepower'].astype(int)

In [31]:
# Origin should be Categorical
df['origin'] = df['origin'].astype(object)

### Data Splitting

In [None]:
# Get Features
X = df.drop(['mpg','name'], axis=1).copy()
X.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,year,origin
0,8,307.0,130,3504,12.0,70,1
1,8,350.0,165,3693,11.5,70,1
2,8,318.0,150,3436,11.0,70,1
3,8,304.0,150,3433,12.0,70,1
4,8,302.0,140,3449,10.5,70,1


In [41]:
# Get Target Variable
y = df['mpg'].copy()
y.head()

0    18.0
1    15.0
2    18.0
3    16.0
4    17.0
Name: mpg, dtype: float64

In [43]:
# Split the data for Fitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)

## Fit

In [45]:
# Initialize the model
clf = tree.DecisionTreeRegressor(criterion='squared_error',
                                 splitter='best',
                                 max_depth=3,
                                 max_features=None,
                                 min_impurity_decrease=0,
                                 ccp_alpha=0,
                                 random_state=1)
clf.get_params()

{'ccp_alpha': 0,
 'criterion': 'squared_error',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': 1,
 'splitter': 'best'}

In [47]:

clf = clf.fit(X=X, y=y)

### Feature Importance

In [54]:
# What are the most important variables
features = pd.DataFrame(clf.feature_importances_, index=X_train.columns)
features

Unnamed: 0,0
cylinders,0.704571
displacement,0.0
horsepower,0.179465
weight,0.0
acceleration,0.0
year,0.115965
origin,0.0
