# Week7 - Decision Tree Lab

* Train-test split
* Train a decison tree model
* Train a random forest model
* Evaluate the models
* Explain findings

In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

df = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/log_reg/employee-turnover-balanced.csv')
y = df['left_company']
X = df.iloc[:, 1:]

In [82]:
numerical_vars=[ 'age','commuting_distance','last_raise_pct','total_years_working', 'years_at_company', 
                'years_in_current_job','years_since_last_promotion', 'years_with_current_supervisor']

categorical_vars=['frequency_of_travel', 'department','education', 'satisfaction_with_environment',
    'gender', 'seniority_level', 'position', 'satisfaction_with_job','married_or_single',  'last_performance_rating']

In [83]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y)

## Using Decision Tree

In [84]:
# insert code here
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([('impute_missing', SimpleImputer(strategy='median')),
                           ('standardize_num', StandardScaler())
                        ])

cat_pipeline = Pipeline([('impute_missing_cats', SimpleImputer(strategy='most_frequent')),
                          ('create_dummies_cats', OneHotEncoder(handle_unknown='ignore', drop='first'))])

processing_pipeline = ColumnTransformer(transformers=[('proc_numeric', num_pipeline, numerical_vars),
                                                      ('create_dummies', cat_pipeline, categorical_vars)])

print(processing_pipeline)

ColumnTransformer(transformers=[('proc_numeric',
                                 Pipeline(steps=[('impute_missing',
                                                  SimpleImputer(strategy='median')),
                                                 ('standardize_num',
                                                  StandardScaler())]),
                                 ['age', 'commuting_distance', 'last_raise_pct',
                                  'total_years_working', 'years_at_company',
                                  'years_in_current_job',
                                  'years_since_last_promotion',
                                  'years_with_current_supervisor']),
                                ('create_dummies',
                                 Pipeline(steps=[('impute_missing_cats',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('create_dummies_cats',
                    

In [85]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier(criterion='gini', random_state=1)
modeling_pipeline = Pipeline([
        ('data_processing', processing_pipeline),
        ('rf', tree_model)])
modeling_pipeline.fit(X_train, y_train)

Pipeline(steps=[('data_processing',
                 ColumnTransformer(transformers=[('proc_numeric',
                                                  Pipeline(steps=[('impute_missing',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardize_num',
                                                                   StandardScaler())]),
                                                  ['age', 'commuting_distance',
                                                   'last_raise_pct',
                                                   'total_years_working',
                                                   'years_at_company',
                                                   'years_in_current_job',
                                                   'years_since_last_promotion',
                                                   'years_with_current_su...
          

In [86]:
print(f'Test accuracy: {modeling_pipeline.score(X_test, y_test):.2f}')

Test accuracy: 0.80


## Using Random Forest


In [87]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
classifier_rf = RandomForestClassifier(random_state=42, n_jobs=-1, max_depth=5,
                                       n_estimators=100, oob_score=True)
modeling_pipeline = Pipeline([
        ('data_processing', processing_pipeline),
        ('rf', classifier_rf)])
modeling_pipeline.fit(X_train, y_train)

Pipeline(steps=[('data_processing',
                 ColumnTransformer(transformers=[('proc_numeric',
                                                  Pipeline(steps=[('impute_missing',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardize_num',
                                                                   StandardScaler())]),
                                                  ['age', 'commuting_distance',
                                                   'last_raise_pct',
                                                   'total_years_working',
                                                   'years_at_company',
                                                   'years_in_current_job',
                                                   'years_since_last_promotion',
                                                   'years_with_current_su...
          

In [88]:
print(f'Test accuracy: {modeling_pipeline.score(X_test, y_test):.2f}')

Test accuracy: 0.76


In [89]:
classifier_rf.oob_score_

0.7306666666666667

In [90]:
from sklearn.model_selection import GridSearchCV
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
params = {
    'max_depth': [2,3,5,10,20],
    'min_samples_leaf': [5,10,20,50,100,200],
    'n_estimators': [10,25,30,50,100,200]
}
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf,
                           param_grid=params,
                           cv = 4,
                           n_jobs=-1, verbose=1, scoring="accuracy")
modeling_pipeline = Pipeline([
        ('data_processing', processing_pipeline),
        ('rf', grid_search)])
modeling_pipeline.fit(X_train, y_train)


Fitting 4 folds for each of 180 candidates, totalling 720 fits


Pipeline(steps=[('data_processing',
                 ColumnTransformer(transformers=[('proc_numeric',
                                                  Pipeline(steps=[('impute_missing',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardize_num',
                                                                   StandardScaler())]),
                                                  ['age', 'commuting_distance',
                                                   'last_raise_pct',
                                                   'total_years_working',
                                                   'years_at_company',
                                                   'years_in_current_job',
                                                   'years_since_last_promotion',
                                                   'years_with_current_su...
          

In [91]:
grid_search.best_score_

0.7826686767550347

In [92]:
rf_best = grid_search.best_estimator_
rf_best

RandomForestClassifier(max_depth=20, min_samples_leaf=5, n_estimators=25,
                       n_jobs=-1, random_state=42)

# Using Decision Tree the accuracy is 81 while with using only Random Forest it is 76 and random forest with Grid Search the best score is 78. So it is almost giving same performance using both. Decision tree is giving more accuracy than random forest.