In [None]:
# Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report

In [None]:
# Load the Iris dataset as pandas objects:
#   df -> the complete frame returned by the loader
#   X  -> the feature frame
#   y  -> the target series
data = load_iris(as_frame=True)
df = data.frame
X, y = data.data, data.target

# Quick look at the feature names, class names, and the target's type,
# values and distinct labels (one item per line).
print(
    data.feature_names,
    data.target_names,
    type(y),
    y,
    np.unique(y),
    sep="\n",
)

In [None]:
# Show scatter plot matrix of the features, with points colored by class.
# Only the feature frame X is plotted: `df` additionally carries the integer
# `target` column, which would otherwise show up as an extra variable that is
# merely colored by itself (the other feature plots in this notebook also
# leave that column out).
scatter_matrix(X, figsize=(10, 10), diagonal='kde', c=y, cmap='viridis')
plt.show()

In [None]:
# Box plot of every column of df except the last one, with the y axis
# labelled in centimeters.
df.iloc[:, :-1].plot(kind='box', figsize=(10, 4), ylabel='cm')
plt.show()

In [None]:
# Summary statistics for every column of df; the quartiles requested here
# are the same ones describe() reports by default.
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# We generate a heatmap to visualize the pairwise correlations between the
# features (the last column of df is left out).
# vmin/vmax/center anchor the diverging colormap to the full [-1, 1] range,
# so the neutral color means "no correlation" and warm/cool colors can be
# read directly as positive/negative correlation.
sns.heatmap(
    df.iloc[:, :-1].corr(),
    annot=True,
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    center=0,
)
plt.xticks(rotation=60)
plt.show()


We observe that the different classes are quite well separated, there are not many outliers (only in sepal width), and the dataset is perfectly balanced. The variables have similar scales and, moreover, petal length and petal width show greater variability, suggesting they may be more informative for classification.

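No data normalization will be performed. The technique used (k-nearest neighbors) is distance-based and therefore sensitive to feature scale, but all four features are measured in centimeters and have similar scales, so no single feature dominates the distance computation.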


In [None]:
# Split the dataset: 25% of the samples are held out as test data.
# The rows are shuffled with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

In [None]:
# Build a 1-nearest-neighbor classifier, fit it on the training split
# (fit() returns the estimator itself), then predict the test split.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [None]:
# Evaluate the predictions on the test split: per-class report first,
# then the overall error rate (the complement of the accuracy).
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=data.target_names,
    )
)
error_rate = 1 - accuracy_score(y_true=y_test, y_pred=y_pred)
print("Error rate:", error_rate)

In [None]:
# Compute the confusion matrix of the test predictions and draw it with
# the class names as tick labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=data.target_names)
disp.plot(cmap="Blues")
# Title goes on the matrix axes created by disp.plot().
disp.ax_.set_title("Confusion Matrix (k=1)")
plt.show()

The model performs very well, achieving 97% accuracy and making only two errors, both between the Versicolor and Virginica classes.

In [None]:
# Additionally, we test other values of K (1 through 7) using a 5-fold
# cross-validated grid search fitted on the training split.

knn = KNeighborsClassifier()
param_grid = {"n_neighbors": list(range(1, 8))}
grid = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

# One row per candidate K: mean and standard deviation of the fold scores.
# We observe that the results are quite similar.
results = pd.DataFrame(grid.cv_results_)
print(results.loc[:, ["param_n_neighbors", "mean_test_score", "std_test_score"]])