In [57]:
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [58]:
df = pd.read_csv('iris.data', header=None, names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class'])
# Imprime los primeros 5 registros
print(df.head())

   sepal_length  sepal_width  petal_length  petal_width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


In [59]:
# Dimension de los datos
print(df.shape)

(150, 5)


In [60]:
# Muestra información general
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   class         150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None


In [61]:
# Muestra estadística básica
print(df.describe())

       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.054000      3.758667     1.198667
std        0.828066     0.433594      1.764420     0.763161
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000


In [62]:
# Muestra de distribución de clases
print(df['class'].value_counts())

class
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64


Algoritmos de clasificación

In [63]:
#Separar datos entre las variables de entrada y la variable de salida
X=df.drop('class', axis=1)
Y=df['class']

In [64]:
X_train, X_test, y_train, y_test= train_test_split(X, Y, test_size=.1, random_state=10) #esta es la semilla de aleatoriedad, se separa entrenamiento y test, se separa 90-10
X_train, X_val, y_train, y_val= train_test_split(X_train, y_train, test_size=.2, random_state=10) #esto separa entrenamiento y validacion, se separa 80-20

In [65]:
svc_model = SVC()
dt_model = DecisionTreeClassifier()
knn_model = KNeighborsClassifier()

In [66]:
svc_model.fit(X_train, y_train)
dt_model.fit(X_train, y_train)
knn_model.fit(X_train, y_train)

val_predict_svc = svc_model.predict(X_val)
val_predict_dt = dt_model.predict(X_val)
val_predict_knn = knn_model.predict(X_val)

In [67]:
test_predict_svc = svc_model.predict(X_test)
test_predict_dt = dt_model.predict(X_test)
test_predict_knn = knn_model.predict(X_test)

In [68]:
accuracy_score_svc = accuracy_score(y_true=y_val, y_pred=val_predict_svc)
accuracy_score_dt = accuracy_score(y_true=y_val, y_pred=val_predict_dt)
accuracy_score_knn = accuracy_score(y_true=y_val, y_pred=val_predict_knn)
[accuracy_score_svc, accuracy_score_dt, accuracy_score_knn]

[1.0, 0.8148148148148148, 0.9629629629629629]

In [69]:
accuracy_score_svc = accuracy_score(y_true=y_test, y_pred=test_predict_svc)
accuracy_score_dt = accuracy_score(y_true=y_test, y_pred=test_predict_dt)
accuracy_score_knn = accuracy_score(y_true=y_test, y_pred=test_predict_knn)
[accuracy_score_svc, accuracy_score_dt, accuracy_score_knn]

[0.9333333333333333, 0.8, 0.9333333333333333]

In [70]:
confusion_matrix(y_true=y_val, y_pred=val_predict_svc)

array([[ 8,  0,  0],
       [ 0, 13,  0],
       [ 0,  0,  6]], dtype=int64)

In [71]:
confusion_matrix(y_true=y_val, y_pred=val_predict_dt)

array([[8, 0, 0],
       [0, 9, 4],
       [0, 1, 5]], dtype=int64)

In [72]:
confusion_matrix(y_true=y_val, y_pred=val_predict_svc)

array([[ 8,  0,  0],
       [ 0, 13,  0],
       [ 0,  0,  6]], dtype=int64)

In [73]:
classes = dt_model.classes_
print(classes)

['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']
