In [1]:
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

Crear matriz de datos aleatorios

In [2]:
np.random.seed(2020)
mpoints= 200
X = (np.random.rand(4,4)@np.random.normal(0,1,(4,mpoints))).T

In [3]:
X.shape

(200, 4)

## PCA de scikit-learn¶

In [4]:
pca = PCA(n_components=2,svd_solver='full')
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
    svd_solver='full', tol=0.0, whiten=False)

In [5]:
print(pca.components_)

[[ 0.65015527  0.27107152  0.49533738  0.50838886]
 [ 0.69968486 -0.01071332 -0.67627568 -0.23016849]]


In [6]:
print(pca.explained_variance_ratio_)

[0.89539294 0.09574124]


In [7]:
pca.singular_values_

array([28.25029514,  9.23773881])

In [8]:
z = pca.transform(X)

In [9]:
print(X)

[[ 1.16943645e+00  6.93612251e-01  1.16206093e+00  1.22330901e+00]
 [-2.38518819e+00 -7.83305652e-01 -1.58735045e+00 -1.58210421e+00]
 [ 6.71946851e-02  2.36728879e-01  2.98910214e-01  3.59356547e-01]
 [-3.20021224e-01  3.26666032e-01  4.09490058e-01  4.57771538e-01]
 [-5.89968825e-02 -9.81388453e-02 -5.01972914e-01 -3.61647107e-01]
 [ 1.83231219e+00  9.96343701e-01  1.43597091e+00  1.69089333e+00]
 [-1.87517211e-01 -3.24099680e-01  8.73581530e-01  1.87182844e-02]
 [-9.79758499e-01 -4.42504754e-02  2.07982836e-01 -6.48471533e-02]
 [-1.74318839e+00 -1.71141928e-01  2.22934336e-01 -2.46941565e-01]
 [-9.27907256e-02 -1.48633722e-03 -3.18972245e-01 -1.34912070e-01]
 [-1.92119992e+00 -7.26004522e-02  8.86438658e-01  1.36515783e-01]
 [-9.21131502e-01 -5.44126825e-01 -1.36432080e+00 -1.11365928e+00]
 [-2.20565429e+00 -6.88745071e-01 -9.32320817e-01 -1.16205992e+00]
 [-3.35371736e+00 -1.13662258e+00 -1.55994361e+00 -1.95388418e+00]
 [ 1.92904341e+00  7.70015310e-01  8.07186159e-01  1.19902959e

In [10]:
print(z)

[[ 2.26407147e+00 -2.24976455e-01]
 [-3.23546391e+00 -1.91193043e-01]
 [ 5.56820488e-01 -2.08721128e-01]
 [ 4.34256842e-01 -5.78048250e-01]
 [-3.79254309e-01  4.14142188e-01]
 [ 3.15049801e+00 -5.72774310e-02]
 [ 3.50673046e-01 -6.91162889e-01]
 [-4.60727306e-01 -7.79117898e-01]
 [-1.07664067e+00 -1.28011775e+00]
 [-1.69109171e-01  2.13515516e-01]
 [-6.42060069e-01 -1.94269706e+00]
 [-1.87013807e+00  5.71972066e-01]
 [-2.55509982e+00 -6.06250515e-01]
 [-4.13636557e+00 -7.98035585e-01]
 [ 2.59051851e+00  5.51272033e-01]
 [ 8.92232154e-01 -5.30427166e-01]
 [ 4.22250272e+00 -2.29942473e-01]
 [ 1.48777347e+00 -5.28999999e-01]
 [-2.91600475e+00 -5.94835648e-01]
 [ 1.31645197e+00  5.82635583e-01]
 [ 1.34546712e+00 -5.77885131e-01]
 [ 3.40576717e-01  6.17385895e-01]
 [ 5.08475167e-01 -8.56334244e-01]
 [ 1.72720882e+00 -6.12035791e-01]
 [ 2.62155985e+00 -2.52261962e-01]
 [ 3.51935021e+00 -2.80825403e-01]
 [-2.27863200e-02 -1.63476822e-01]
 [-1.66461597e+00  1.03027342e+00]
 [-4.48587661e+00 -3

Función para calcular PCA usando SVD

In [13]:
def PCA_from_SVD(A,num_componentes):
    """
    Función para PCA a partir de la SVD 
    params: A			matriz de datos
            num_componentes 	número de componentes deseados

    return: valores_singulares	Los valores singulares de la descomposición SVD
	    componentes		Los coeficientes para calcular los componentes principales
	    Z			Los datos transformados (componentes principales)
	    varianza_explicada	La varianza explicada por cada componente principal
    """
    
    # Centrar los datos
    A = np.array(A) # convertir los datos a un numpy array por si vienen de un DataFrame
    A_centered = A - A.mean(axis=0)
    
    # Modificar esta línea de código, mandar a llamar la función creada por el equipo 
    # Calcular SVD
    U, S, Vt = np.linalg.svd(A_centered)
    
    # Los valores singulares
    valores_singulares = S
    
    # Los componentes (coeficientes)
    componentes = (np.transpose(Vt))*-1
    
    # Los datos transformados (componentes principales)
    Z = A_centered@np.transpose(Vt)
    
    # La varianza explicada
    varianza_explicada = S**2/np.sum(S**2)
    
    # regresar 4 objetos
    return valores_singulares[:(num_componentes)], componentes[:(num_componentes)], Z[:,:(num_componentes)], varianza_explicada[:(num_componentes)]


Probar función

In [15]:
valores_ingulares, coeficientes, Z , varianza_explicada = PCA_from_SVD(X,2)

In [16]:
valores_ingulares

array([28.25029514,  9.23773881])

In [17]:
coeficientes

array([[ 0.65015527,  0.69968486,  0.29080087, -0.05633901],
       [ 0.27107152, -0.01071332, -0.70683553, -0.65329089]])

In [18]:
Z

array([[-2.26407147e+00,  2.24976455e-01],
       [ 3.23546391e+00,  1.91193043e-01],
       [-5.56820488e-01,  2.08721128e-01],
       [-4.34256842e-01,  5.78048250e-01],
       [ 3.79254309e-01, -4.14142188e-01],
       [-3.15049801e+00,  5.72774310e-02],
       [-3.50673046e-01,  6.91162889e-01],
       [ 4.60727306e-01,  7.79117898e-01],
       [ 1.07664067e+00,  1.28011775e+00],
       [ 1.69109171e-01, -2.13515516e-01],
       [ 6.42060069e-01,  1.94269706e+00],
       [ 1.87013807e+00, -5.71972066e-01],
       [ 2.55509982e+00,  6.06250515e-01],
       [ 4.13636557e+00,  7.98035585e-01],
       [-2.59051851e+00, -5.51272033e-01],
       [-8.92232154e-01,  5.30427166e-01],
       [-4.22250272e+00,  2.29942473e-01],
       [-1.48777347e+00,  5.28999999e-01],
       [ 2.91600475e+00,  5.94835648e-01],
       [-1.31645197e+00, -5.82635583e-01],
       [-1.34546712e+00,  5.77885131e-01],
       [-3.40576717e-01, -6.17385895e-01],
       [-5.08475167e-01,  8.56334244e-01],
       [-1.

In [19]:
varianza_explicada

array([0.89539294, 0.09574124])