# Visualizing Trees, Feature importance and Random Forest
- - -

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
# Download the iris dataset (CSV hosted on GitHub) into a DataFrame.
iris = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/4data-lab/datasets/master/iris.csv"
)

In [None]:
# Preview the first five rows (five is the default of DataFrame.head).
iris.head(n=5)

In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes, memory usage).
iris.info()

In [None]:
# Split the table into the feature matrix X (every column except the label)
# and the target vector y (the "variety" label column).
# `columns=` is used instead of a positional axis argument: pandas deprecated
# positional `axis` in DataFrame.drop and newer releases reject it outright.
X = iris.drop(columns="variety")
y = iris["variety"]

In [None]:
# Hold out half of the rows for testing. `stratify=y` keeps the class
# proportions equal in both halves and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.50,
    random_state=2,
    stratify=y,
)

In [None]:
# Build and train a single decision tree on the training half; the seed fixes
# the tie-breaking so the fitted tree is reproducible.
# `fit` returns the estimator itself, so construction and training are chained.
DTC = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
# Echo the fitted estimator as the cell output.
DTC

In [None]:
# Predict the class of every held-out sample with the fitted decision tree.
y_test_pred = DTC.predict(X_test)

In [None]:
# Report the decision tree's test accuracy, rounded to two decimals.
print(
    "Accuracy Score:",
    round(accuracy_score(y_true=y_test, y_pred=y_test_pred), 2),
)

In [None]:
# Confusion matrix for the decision tree: rows are true classes, columns are
# predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

- - -
### Graphviz


In [None]:
from sklearn.tree import export_graphviz

In [None]:
from graphviz import Source

In [None]:
# Show how many samples the tree was trained on; the root node of the exported
# tree should report the same sample count.
print("Número de filas de X_train:", len(X_train))

In [None]:
# Write the fitted decision tree to "file.dot" in Graphviz DOT format.
# The feature names are passed as a plain list rather than a pandas Index:
# some scikit-learn releases validate this parameter strictly and reject an
# Index, while a list is accepted by all of them.
# `class_names` must follow the ascending order of the fitted class labels.
export_graphviz(
    DTC,
    out_file="file.dot",
    class_names=["Setosa", "Versicolor", "Virginica"],
    feature_names=list(X.columns),
    impurity=False,  # hide the impurity value in each node
    filled=True,  # colour nodes by majority class
)

In [None]:
# Load the DOT file written by export_graphviz; as the last expression of the
# cell the resulting graph is rendered inline by the notebook.
Source.from_file('file.dot')

### Feature importance

In [None]:
# Print the importance the fitted tree assigns to each feature
# (one value per column of X, in column order).
print("Feature importances:", DTC.feature_importances_)

In [None]:
# Plotting setup: import pyplot and ask the inline backend for "retina"
# (high-resolution) figures.
# NOTE(review): IPython.display.set_matplotlib_formats appears to be deprecated
# in recent IPython releases in favour of
# matplotlib_inline.backend_inline.set_matplotlib_formats — confirm against the
# installed IPython version before upgrading.
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
set_matplotlib_formats("retina")

    Ya tenemos los valores de feature importance;
    ahora obtendremos los nombres de las features de X y una lista con sus posiciones (de 0 al número de features menos uno).

In [None]:
# Display the feature names (column labels of the training matrix).
X_train.columns

In [None]:
# Display the number of features (columns of the training matrix).
X_train.shape[1]

In [None]:
# Positions 0..n_features-1, used as the y coordinates of the bars and ticks
# in the feature-importance charts.
number_of_features = [*range(X_train.shape[1])]

In [None]:
# Horizontal bar chart of the decision tree's feature importances:
# one bar per feature, labelled with the feature's column name.
plt.barh(y=number_of_features, width=DTC.feature_importances_)
plt.yticks(number_of_features, X.columns)
plt.ylabel("Features")
plt.xlabel("Feature Importances")
plt.show()

- - -

## Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Build and train a forest of 100 trees, each limited to depth 10, on the
# training half; the seed makes the ensemble reproducible.
# `fit` returns the estimator itself, so construction and training are chained.
RF = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=0,
).fit(X_train, y_train)
# Echo the fitted estimator as the cell output.
RF

In [None]:
# Predict the class of every held-out sample with the fitted random forest.
y_test_rf = RF.predict(X_test)

In [None]:
# Report the random forest's test accuracy, rounded to two decimals.
print(
    "Accuracy Score:",
    round(accuracy_score(y_true=y_test, y_pred=y_test_rf), 2),
)

In [None]:
# Confusion matrix for the random forest: rows are true classes, columns are
# predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_test_rf)

In [None]:
# Print the forest's feature importances, then draw them as a horizontal bar
# chart with one bar per feature, labelled with the feature's column name.
print("Feature importances Random forest:", RF.feature_importances_)
plt.barh(y=number_of_features, width=RF.feature_importances_)
plt.yticks(number_of_features, X.columns)
plt.ylabel("Features")
plt.xlabel("Feature Importances")
plt.show()

In [None]:
# Reminder: scikit-learn can also draw trees directly with matplotlib.
# Plot the first five trees of the forest side by side and save the figure
# to 'rf_5trees.png'.
from sklearn import tree

# Pass the feature names as a plain list: some scikit-learn releases validate
# `feature_names` strictly and reject a pandas Index.
fn = list(X.columns)
fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(20, 4))
# Pair each of the first five fitted trees with its own subplot.
for index, (estimator, ax) in enumerate(zip(RF.estimators_[:5], axes)):
    tree.plot_tree(
        estimator,
        feature_names=fn,
        class_names=["Setosa", "Versicolor", "Virginica"],
        filled=True,  # colour nodes by majority class
        impurity=False,  # hide the impurity value in each node
        ax=ax,
    )
    ax.set_title('Estimator: ' + str(index), fontsize=11)
fig.savefig('rf_5trees.png')