## Ejercicio de PCA en la base de datos Iris

In [1]:
#librerias
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn import datasets as db
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA

from sklearn import datasets as db

In [2]:
# Helper that draws a confusion matrix

def plot_confusion_matrix(cm, labels):
    """Render a confusion matrix as an annotated Plotly heatmap and show it.

    Args:
        cm: Matrix of counts passed straight to ``px.imshow``; its vertical
            axis is titled "Actual" and its horizontal axis "Predicted".
        labels: Tick labels applied to both axes.

    Returns:
        None. The figure is displayed via ``show()`` and not returned.
    """
    axis_titles = {"x": "Predicted", "y": "Actual", "color": "Count"}
    heatmap = px.imshow(
        cm,
        labels=axis_titles,
        x=labels,
        y=labels,
        color_continuous_scale='Viridis',
        text_auto=True,  # print each cell's value on top of the heatmap
        title="Confusion Matrix",
    )
    # The counts are already written in the cells, so the color bar is hidden.
    heatmap.update_layout(coloraxis_showscale=False)
    heatmap.show()

In [3]:
# Load the iris dataset bundled with scikit-learn
iris=db.load_iris()

In [4]:
# Reshape the raw arrays into a labelled DataFrame: four measurement columns
# plus a categorical Species column built from the integer target codes.
df = pd.DataFrame(
    data=iris['data'],
    columns=['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width'],
).assign(
    Species=pd.Categorical.from_codes(iris.target, categories=iris.target_names)
)
print(df)


     Sepal.Length  Sepal.Width  Petal.Length  Petal.Width    Species
0             5.1          3.5           1.4          0.2     setosa
1             4.9          3.0           1.4          0.2     setosa
2             4.7          3.2           1.3          0.2     setosa
3             4.6          3.1           1.5          0.2     setosa
4             5.0          3.6           1.4          0.2     setosa
..            ...          ...           ...          ...        ...
145           6.7          3.0           5.2          2.3  virginica
146           6.3          2.5           5.0          1.9  virginica
147           6.5          3.0           5.2          2.0  virginica
148           6.2          3.4           5.4          2.3  virginica
149           5.9          3.0           5.1          1.8  virginica

[150 rows x 5 columns]


In [5]:

# Feature matrix and integer class labels, taken straight from the dataset
X, y = iris.data, iris.target

In [6]:
# Standardize every feature using statistics computed on the full dataset
# (this scaled copy feeds the exploratory PCA).
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [7]:
# Project the standardized features onto the first two principal components.
pca_2d = PCA(n_components=2)
X_pca_2d = pca_2d.fit_transform(X_scaled)

# Tabulate the projection next to its class label.
# Species stores the integer codes from `y`, not the species names.
pca_df_2d = pd.DataFrame(X_pca_2d, columns=['PC1', 'PC2']).assign(Species=y)
pca_df_2d.head()

Unnamed: 0,PC1,PC2,Species
0,-2.264703,0.480027,0
1,-2.080961,-0.674134,0
2,-2.364229,-0.341908,0
3,-2.299384,-0.597395,0
4,-2.389842,0.646835,0


In [8]:

# `pca_df_2d['Species']` holds integer class codes, which Plotly Express would
# color on a continuous scale with a color bar. Map the codes to species names
# for plotting only (the source frame is left untouched) so each class gets
# its own discrete color and legend entry.
fig_2d = px.scatter(
    pca_df_2d.assign(Species=iris.target_names[pca_df_2d['Species'].to_numpy()]),
    x='PC1',
    y='PC2',
    color='Species',
    template='plotly_white',
    title='PCA - 2 Components',
)
fig_2d.show()


In [9]:
# Explained-variance ratio of each of the two fitted components
explained_variance_2d = pca_2d.explained_variance_ratio_
print("Explained Variance Ratio (2D):", explained_variance_2d)

Explained Variance Ratio (2D): [0.72962445 0.22850762]


In [10]:
# Magnitude of each original feature's weight in every principal component.
# Taking the absolute value drops the sign, so only the size of the
# contribution is shown, not its direction.
components = pd.DataFrame(
    np.abs(pca_2d.components_),
    columns=iris.feature_names,
)
components

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.521066,0.269347,0.580413,0.564857
1,0.377418,0.923296,0.024492,0.066942


## Clasificación

In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible. Features are every column except the Species label.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Species'),
    df['Species'],
    test_size=0.2,
    random_state=7,
)


In [14]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so nothing from the test rows is used
# when fitting.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [15]:

# Fit a 2-component PCA on the scaled training split only, then project the
# scaled test split with that same fitted model.
pca = PCA(n_components=2)
X_train_pca_2d = pca.fit_transform(X_train_scaled)
X_test_pca_2d = pca.transform(X_test_scaled)

## Prueba SVM

In [16]:
# Train a support-vector classifier with default hyperparameters on the two
# PCA components, then predict the held-out split.
svm_pca_2d = SVC().fit(X_train_pca_2d, y_train)
svm_pca_2d_pred = svm_pca_2d.predict(X_test_pca_2d)

In [17]:

# Score the SVM predictions against the held-out labels.
# NOTE(review): for single-label multiclass targets, micro-averaged precision,
# recall and F1 all coincide with accuracy -- confirm "micro" is the intended
# average for these three values.
svm_pca_2d_accuracy = accuracy_score(y_true=y_test, y_pred=svm_pca_2d_pred)
svm_pca_2d_precision = precision_score(y_true=y_test, y_pred=svm_pca_2d_pred, average="micro")
svm_pca_2d_recall = recall_score(y_true=y_test, y_pred=svm_pca_2d_pred, average="micro")
svm_pca_2d_f1 = f1_score(y_true=y_test, y_pred=svm_pca_2d_pred, average="micro")

# Per-class breakdown, printed under its heading.
svm_pca_2d_report = classification_report(y_true=y_test, y_pred=svm_pca_2d_pred)
print("SVM PCA 2D Classification Report:", svm_pca_2d_report, sep="\n")


SVM PCA 2D Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         7
  versicolor       0.71      0.83      0.77        12
   virginica       0.78      0.64      0.70        11

    accuracy                           0.80        30
   macro avg       0.83      0.82      0.82        30
weighted avg       0.80      0.80      0.80        30



In [18]:
# Pin the class order explicitly: the same list sets the row/column order of
# the matrix and the axis tick labels, so the two cannot drift apart (for
# example if a class were absent from the test split, the matrix would
# otherwise shrink while the tick labels stayed at three).
species_labels = ['setosa', 'versicolor', 'virginica']
svm_pca_2d_cm = confusion_matrix(y_test, svm_pca_2d_pred, labels=species_labels)
plot_confusion_matrix(svm_pca_2d_cm, species_labels)


## Prueba Naive Bayes

In [19]:
# Train a Gaussian Naive Bayes classifier on the two PCA components, then
# predict the held-out split.
nb_pca_2d = GaussianNB().fit(X_train_pca_2d, y_train)
nb_pca_2d_pred = nb_pca_2d.predict(X_test_pca_2d)

In [21]:
# Score the Naive Bayes predictions against the held-out labels.
# NOTE(review): for single-label multiclass targets, micro-averaged precision,
# recall and F1 all coincide with accuracy -- confirm "micro" is the intended
# average for these three values.
nb_pca_2d_accuracy = accuracy_score(y_true=y_test, y_pred=nb_pca_2d_pred)
nb_pca_2d_precision = precision_score(y_true=y_test, y_pred=nb_pca_2d_pred, average="micro")
nb_pca_2d_recall = recall_score(y_true=y_test, y_pred=nb_pca_2d_pred, average="micro")
nb_pca_2d_f1 = f1_score(y_true=y_test, y_pred=nb_pca_2d_pred, average="micro")

# Per-class breakdown, printed under its heading.
nb_pca_2d_report = classification_report(y_true=y_test, y_pred=nb_pca_2d_pred)
print("Naive Bayes PCA 2D reporte de clasificación:", nb_pca_2d_report, sep="\n")


Naive Bayes PCA 2D reporte de clasificación:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         7
  versicolor       0.67      0.83      0.74        12
   virginica       0.75      0.55      0.63        11

    accuracy                           0.77        30
   macro avg       0.81      0.79      0.79        30
weighted avg       0.78      0.77      0.76        30



In [22]:
# Pin the class order explicitly: the same list sets the row/column order of
# the matrix and the axis tick labels, so the two cannot drift apart (for
# example if a class were absent from the test split, the matrix would
# otherwise shrink while the tick labels stayed at three).
species_labels = ['setosa', 'versicolor', 'virginica']
nb_pca_2d_cm = confusion_matrix(y_test, nb_pca_2d_pred, labels=species_labels)
plot_confusion_matrix(nb_pca_2d_cm, species_labels)
