In [None]:
# Libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix


from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.inspection import permutation_importance

In [None]:
# Load the Iris dataset as pandas objects
data = load_iris(as_frame=True)
df = data.frame

# Feature matrix and label vector used by the rest of the notebook
X = data.data
y = data.target

# Quick inspection: feature names, class names, and how the labels are stored
for item in (
    data.feature_names,
    data.target_names,
    type(data.target),
    data.target,
    np.unique(data.target),
):
    print(item)

In [None]:
# Show scatter plot matrix of the features, coloured by class label.
# Plot X rather than df: df also carries the `target` column, which would
# otherwise show up as an extra row/column of the grid even though it is the
# label being used for the colours, not a measurement.
scatter_matrix(X, figsize=(10, 10), diagonal='kde', c=y, cmap='viridis')
plt.show()

In [None]:
# Box plot of every column of df except the last one
feature_cols = df.iloc[:, :-1]
feature_cols.plot.box(figsize=(10, 4), ylabel='cm')
plt.show()

In [None]:
# Descriptive statistics for every column of df, shown as the cell output
summary_stats = df.describe()
summary_stats


We observe that the classes are quite well separated, that there are few outliers (only in sepal width), and that the dataset is perfectly balanced (50 samples per class). The variables have similar scales; moreover, petal length and petal width show greater variability, suggesting they may be more informative for classification.

No data normalization will be performed, as the techniques used (decision trees and random forests) do not require it.


In [None]:
# Hold out 20% of the samples as a test set; the seed is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# Compare decision trees of increasing depth (1..8).
# For each depth we record:
#   - cv_accuracy: mean 5-fold cross-validated accuracy computed on the
#     TRAINING data only. This is the score to use when choosing max_depth;
#     choosing it from test accuracy would leak the test set into model
#     selection and make the reported test score optimistic.
#   - accuracy / confusion_matrix / classification_report: test-set metrics,
#     exactly as before.
results = []
for depth in range(1, 9):
    modelo = DecisionTreeClassifier(max_depth=depth, random_state=42)
    # Cross-validate before fitting on the full training split.
    cv_scores = cross_val_score(modelo, X_train, y_train, cv=5)
    modelo.fit(X_train, y_train)
    y_pred = modelo.predict(X_test)
    results.append({
        "max_depth": depth,
        "cv_accuracy": cv_scores.mean(),
        "accuracy": accuracy_score(y_test, y_pred),
        "confusion_matrix": confusion_matrix(y_test, y_pred),
        "classification_report": classification_report(
            y_test, y_pred, output_dict=True, zero_division=0
        ),
    })

# Compact comparison table as the cell output; the full per-depth details
# (confusion matrices and classification reports) remain available in `results`.
pd.DataFrame(results, columns=["max_depth", "cv_accuracy", "accuracy"]).set_index("max_depth")


In [None]:
# Fit a decision tree limited to depth 3 and predict the test split
modelo = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_train, y_train)
y_pred = modelo.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Draw the confusion matrix as an annotated heatmap with class names on both axes
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=data.target_names,
    yticklabels=data.target_names,
    ax=ax,
)
ax.set_xlabel("Predicted Class")
ax.set_ylabel("True Class")
ax.set_title("Decision Tree Confusion Matrix (max_depth=3)")
plt.show()

In [None]:
# Visualize the fitted tree.
# feature_names / class_names are passed as plain Python lists of str:
# data.target_names is a NumPy array, and some scikit-learn releases validate
# these parameters strictly and reject non-list inputs.
plt.figure(figsize=(12, 8))
plot_tree(
    modelo,
    feature_names=list(data.feature_names),
    class_names=[str(name) for name in data.target_names],
    filled=True,
    rounded=True,
    fontsize=10,
)
plt.show()


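The decision tree shows that the Iris classes can be separated clearly using the petal and sepal measurements.
From a maximum depth of 3 onward, the model reaches 100% accuracy on the test set, so deeper trees bring no further improvement here.
Note that the test set contains only 30 samples (20% of 150), so a perfect score should be read as evidence of good separability rather than as a guarantee of error-free classification on new data.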

Now we continue our analysis with Random Forest.

In [None]:
# Build a 100-tree random forest (fixed seed), fit it on the training split,
# then predict the test split.
rf_modelo = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_modelo.predict(X_test)

In [None]:
# Confusion matrix of the random forest predictions on the test split
cm = confusion_matrix(y_test, y_pred_rf)

# Annotated heatmap with class names on both axes
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=data.target_names,
    yticklabels=data.target_names,
    ax=ax,
)
ax.set_xlabel("Predicted Class")
ax.set_ylabel("True Class")
ax.set_title("Random Forest Confusion Matrix")
plt.show()

In [None]:
# Evaluate the random forest on the test split
accuracy = accuracy_score(y_test, y_pred_rf)
report = classification_report(y_test, y_pred_rf, output_dict=True, zero_division=0)
print("Random Forest Accuracy:", accuracy)

# Show the report as a table (one row per class / average) instead of printing
# the nested dict, which comes out as a single hard-to-read line.
# `report` itself is still the dict returned by classification_report.
pd.DataFrame(report).T

Like the decision tree, the random forest achieves perfect results on the test set.

In [None]:
# Feature importances reported by the fitted forest (`feature_importances_`),
# labelled with the feature names and ordered from largest to smallest
importancias = (
    pd.Series(rf_modelo.feature_importances_, index=data.feature_names)
    .sort_values(ascending=False)
)

print(importancias)

In [None]:
# Permutation importance of the forest on the test split
# (10 repeats per feature, fixed seed)
result = permutation_importance(
    rf_modelo,
    X_test,
    y_test,
    n_repeats=10,
    random_state=42,
)

# Mean importance per feature, labelled and ordered from largest to smallest
perm_importancias = (
    pd.Series(result.importances_mean, index=data.feature_names)
    .sort_values(ascending=False)
)
print(perm_importancias)

Both the forest's built-in feature importances and the permutation importances indicate that petal length and petal width are the most relevant
variables for Iris classification, while the sepal dimensions contribute little to the model.