<a href="https://colab.research.google.com/github/MHZur/Diplomado_Publico/blob/main/Tareas/Tarea_pca.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Se escoge el algoritmo de regresión logística con el dataset de Iris.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA

In [None]:
def plot_confusion_matrix(cm, labels, title="Confusion Matrix"):
    """Display a confusion matrix as an annotated Plotly heatmap.

    Args:
        cm: 2-D array of counts. Rows are drawn on the "Actual" axis and
            columns on the "Predicted" axis.
        labels: Class names used as tick labels on both axes. They must be
            in the same order as the rows/columns of ``cm``.
        title: Figure title. Defaults to "Confusion Matrix", so existing
            two-argument calls render exactly as before; pass a custom
            title to tell several runs apart.

    Returns:
        None. The figure is shown with ``fig.show()``.
    """
    axis_titles = dict(x="Predicted", y="Actual", color="Count")
    fig_cm = px.imshow(
        cm,
        labels=axis_titles,
        x=labels,
        y=labels,
        color_continuous_scale='Viridis',
        text_auto=True,
        title=title,
    )
    # Each cell is already annotated with its count, so the color bar is hidden.
    fig_cm.update_layout(coloraxis_showscale=False)
    fig_cm.show()

In [None]:
# Exercise with the Iris dataset.
# "iris.csv" is a relative path, so the file must be in the working directory.
df_iris = pd.read_csv(filepath_or_buffer="iris.csv")

In [None]:
# Preview the first five rows to check the columns that were loaded.
df_iris.head(n=5)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [None]:
# List the distinct class labels present in the target column.
df_iris.loc[:, "Species"].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [None]:
# Split the table into the feature matrix (every column except the last)
# and the target vector. The target is selected by name rather than by the
# positional index 4, matching how the train/test split selects it.
X = df_iris.iloc[:, :-1].values
y = df_iris["Species"].values

In [None]:
# Standardize the full feature matrix before PCA. This copy is used only for
# the exploratory 2-D projection below.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

## PCA reduciendo a dos componentes principales

In [None]:
# Project the standardized features onto the first two principal components.
pca_2d = PCA(n_components=2)
X_pca_2d = pca_2d.fit_transform(X_scaled)

# Wrap the projection in a frame and attach the class label of each row so
# the points can be colored by species.
pca_df_2d = pd.DataFrame(X_pca_2d, columns=['PC1', 'PC2']).assign(Species=y)
pca_df_2d.head()

Unnamed: 0,PC1,PC2,Species
0,-2.264703,0.480027,setosa
1,-2.080961,-0.674134,setosa
2,-2.364229,-0.341908,setosa
3,-2.299384,-0.597395,setosa
4,-2.389842,0.646835,setosa


In [None]:
# Scatter plot of the two principal components, one color per species.
fig_2d = px.scatter(
    pca_df_2d,
    x='PC1',
    y='PC2',
    color='Species',
    template='plotly_white',
    title='PCA with 2 Components',
)
fig_2d.show()

In [None]:
# Share of the total variance captured by each of the two principal components.
explained_variance_2d = pca_2d.explained_variance_ratio_
print("Explained Variance Ratio (2D):", explained_variance_2d)

# The first component explains about 73% of the total variance; the two
# components together explain about 95.8%.

Explained Variance Ratio (2D): [0.72962445 0.22850762]


In [None]:
# Component loadings: one row per principal component, one column per
# original feature (every column of df_iris except the last).
components = pd.DataFrame(
    data=pca_2d.components_,
    columns=df_iris.columns[:-1],
)
components

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
0,0.521066,-0.269347,0.580413,0.564857
1,0.377418,0.923296,0.024492,0.066942


### Dividimos los conjuntos para el entrenamiento y la prueba

In [None]:
# Hold out 20% of the rows for testing. random_state pins the shuffle, so
# changing it yields different training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    df_iris.iloc[:, :-1],
    df_iris['Species'],
    test_size=0.2,
    random_state=7,
)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so the test data does not influence them.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Learn the 2-component projection from the standardized training split only,
# then apply that same projection to the standardized test split.
pca = PCA(n_components=2)
X_train_pca_2d = pca.fit_transform(X_train_scaled)
X_test_pca_2d = pca.transform(X_test_scaled)

### Aplicamos el algoritmo de regresión logística

In [None]:
# Fit a logistic regression with default settings on the 2-D PCA features,
# then predict the species of the held-out rows.
lr_pca_2d = LogisticRegression().fit(X_train_pca_2d, y_train)
lr_pca_2d_pred = lr_pca_2d.predict(X_test_pca_2d)

In [None]:
# Overall accuracy on the test split.
lr_pca_2d_accuracy = accuracy_score(y_test, lr_pca_2d_pred)

# Micro-averaged precision, recall and F1.
# NOTE(review): for single-label multiclass data these micro averages are
# expected to coincide with accuracy; confirm whether macro was intended.
lr_pca_2d_precision, lr_pca_2d_recall, lr_pca_2d_f1 = (
    metric(y_test, lr_pca_2d_pred, average='micro')
    for metric in (precision_score, recall_score, f1_score)
)

# Per-class precision/recall/F1 table.
lr_pca_2d_report = classification_report(y_test, lr_pca_2d_pred)
print("Logistic Regression PCA 2D Classification Report:", lr_pca_2d_report, sep="\n")

Logistic Regression PCA 2D Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         7
  versicolor       0.67      0.83      0.74        12
   virginica       0.75      0.55      0.63        11

    accuracy                           0.77        30
   macro avg       0.81      0.79      0.79        30
weighted avg       0.78      0.77      0.76        30



### Ploteamos la matriz de confusión e imprimimos el accuracy.

In [None]:
# Take the class order from the fitted model and use it both to build the
# matrix and to label the axes. This keeps rows/columns and tick labels
# aligned even if a class is missing from the test split and the predictions,
# where confusion_matrix would otherwise infer a smaller label set.
class_labels = list(lr_pca_2d.classes_)
lr_pca_2d_cm = confusion_matrix(y_test, lr_pca_2d_pred, labels=class_labels)
plot_confusion_matrix(lr_pca_2d_cm, class_labels)
print(f"accuracy: {lr_pca_2d_accuracy}")

accuracy: 0.7666666666666667


### Repetimos el ejercicio con el mismo algoritmo, modificando el parámetro random_state para obtener conjuntos de entrenamiento y test diferentes y ver qué tanto afecta el resultado.

In [None]:
# Use random_state = 1 to draw a different train/test partition.
X_train, X_test, y_train, y_test = train_test_split(df_iris.iloc[:, :-1], df_iris['Species'], test_size=0.2, random_state = 1)

# Re-fit the scaler on the new training split and apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Re-fit the 2-component PCA on the new training split.
pca = PCA(n_components=2)
X_train_pca_2d = pca.fit_transform(X_train_scaled)
X_test_pca_2d = pca.transform(X_test_scaled)

# Train and predict with the same algorithm as before.
lr_pca_2d = LogisticRegression()
lr_pca_2d.fit(X_train_pca_2d, y_train)
lr_pca_2d_pred = lr_pca_2d.predict(X_test_pca_2d)

# Same metrics as in the first run, so the two runs can be compared directly.
lr_pca_2d_accuracy = accuracy_score(y_test, lr_pca_2d_pred)
lr_pca_2d_precision = precision_score(y_test, lr_pca_2d_pred, average = 'micro')
lr_pca_2d_recall = recall_score(y_test, lr_pca_2d_pred, average = 'micro')
lr_pca_2d_f1 = f1_score(y_test, lr_pca_2d_pred, average = 'micro')
lr_pca_2d_report = classification_report(y_test, lr_pca_2d_pred)
print("Logistic Regression PCA 2D Classification Report:")
print(lr_pca_2d_report)

# Take the class order from the fitted model and use it both to build the
# matrix and to label the axes, so they stay aligned even if a class is
# missing from this particular test split and its predictions.
class_labels = list(lr_pca_2d.classes_)
lr_pca_2d_cm = confusion_matrix(y_test, lr_pca_2d_pred, labels=class_labels)
plot_confusion_matrix(lr_pca_2d_cm, class_labels)
print(f"accuracy: {lr_pca_2d_accuracy}")

Logistic Regression PCA 2D Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        11
  versicolor       1.00      0.92      0.96        13
   virginica       0.86      1.00      0.92         6

    accuracy                           0.97        30
   macro avg       0.95      0.97      0.96        30
weighted avg       0.97      0.97      0.97        30



accuracy: 0.9666666666666667


### Observamos que, al cambiar los datos de entrenamiento y test, la exactitud (accuracy) "mejoró" 20 puntos porcentuales (de 0.77 a 0.97). ¿Cuántas pruebas son necesarias para determinar la exactitud de nuestro modelo?