# Analisis de Componentes Principales

In [2]:
import numpy as np
import pandas as pd
import chart_studio.plotly as py
from plotly.graph_objs import *
import chart_studio.tools as tls

In [3]:
df = pd.read_csv('./datasets/iris/iris.csv')

In [4]:
tls.set_credentials_file(username='LautaroSaez', api_key='POWUfGAnu35xdKDFYiOm')

In [5]:
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [6]:
x = df.iloc[:,0:4].values
y = df.iloc[:,4].values

In [7]:
x[0]

array([5.1, 3.5, 1.4, 0.2])

In [8]:
traces = []
legend = {
    0: True,
    1: True,
    2: True,
    3: True
}

colors = {
    'setosa': 'rgb(255,127,20)',
    'versicolor': 'rgb(31, 220, 123)',
    'virginica': 'rgb(44, 50, 180)'
}

for col in range(4):
    for key in colors:
        traces.append(Histogram(x=x[y==key, col], opacity=.7, xaxis='x%s'%(col+1), marker=Marker(color=colors[key]), name=key, showlegend=legend[col]))
    legend = {
    0: False,
    1: False,
    2: False,
    3: False
    } 
data = Data(traces)

layout = Layout( barmode='overlay', 
                 xaxis=XAxis(domain=[0, 0.25], title='logitud sepalos [cm]'),
                 xaxis2=XAxis(domain=[0.3, .5], title='ancho de sepalos [cm]'),
                 xaxis3=XAxis(domain=[0.55, .75], title='logitud de petalos [cm]'),
                 xaxis4=XAxis(domain=[0.8, 1], title='ancho de petalos [cm]'),
                 yaxis=YAxis(title="Número de ejemplares"),
                 title='Distribuciones'
               )
fig = Figure(data=data, layout=layout)
py.iplot(fig)


plotly.graph_objs.Marker is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Marker
  - plotly.graph_objs.histogram.selected.Marker
  - etc.



plotly.graph_objs.Data is deprecated.
Please replace it with a list or tuple of instances of the following types
  - plotly.graph_objs.Scatter
  - plotly.graph_objs.Bar
  - plotly.graph_objs.Area
  - plotly.graph_objs.Histogram
  - etc.



plotly.graph_objs.XAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.XAxis
  - plotly.graph_objs.layout.scene.XAxis



plotly.graph_objs.YAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.YAxis
  - plotly.graph_objs.layout.scene.YAxis




In [9]:
from sklearn.preprocessing import StandardScaler

In [10]:
x_std = StandardScaler().fit_transform(x)

In [11]:
traces = []
legend = {
    0: True,
    1: True,
    2: True,
    3: True
}

colors = {
    'setosa': 'rgb(255,127,20)',
    'versicolor': 'rgb(31, 220, 123)',
    'virginica': 'rgb(44, 50, 180)'
}

for col in range(4):
    for key in colors:
        traces.append(Histogram(x=x_std[y==key, col], opacity=.7, xaxis='x%s'%(col+1), marker=Marker(color=colors[key]), name=key, showlegend=legend[col]))
    legend = {
    0: False,
    1: False,
    2: False,
    3: False
    } 
data = Data(traces)

layout = Layout( barmode='overlay', 
                 xaxis=XAxis(domain=[0, 0.25], title='logitud sepalos [cm]'),
                 xaxis2=XAxis(domain=[0.3, .5], title='ancho de sepalos [cm]'),
                 xaxis3=XAxis(domain=[0.55, .75], title='logitud de petalos [cm]'),
                 xaxis4=XAxis(domain=[0.8, 1], title='ancho de petalos [cm]'),
                 yaxis=YAxis(title="Número de ejemplares"),
                 title='Distribuciones'
               )
fig = Figure(data=data, layout=layout)
py.iplot(fig)


plotly.graph_objs.Marker is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Marker
  - plotly.graph_objs.histogram.selected.Marker
  - etc.



plotly.graph_objs.Data is deprecated.
Please replace it with a list or tuple of instances of the following types
  - plotly.graph_objs.Scatter
  - plotly.graph_objs.Bar
  - plotly.graph_objs.Area
  - plotly.graph_objs.Histogram
  - etc.



plotly.graph_objs.XAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.XAxis
  - plotly.graph_objs.layout.scene.XAxis



plotly.graph_objs.YAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.YAxis
  - plotly.graph_objs.layout.scene.YAxis




## Calculamos la descomposicion en autovalores y autovectores
### Definimos la matriz de covarianza como: 
#### $\sigma_{jk} = \frac{1}{(n-1)}\sum_{i=1}^{m} (x_{ij}-\overline{x_j})(x_{ik}-\overline{x_k})$
 
#### $\Sigma=\frac{1}{n-1}(X-\overline{x})^T(X-\overline{x})$

### a) Usando matriz de covarianza

In [12]:
mean_vector = np.mean(x_std, axis=0)
mean_vector

array([-4.73695157e-16, -7.81597009e-16, -4.26325641e-16, -4.73695157e-16])

In [13]:
cov_matriz = (x_std - mean_vector).T.dot((x_std - mean_vector)) / (x_std.shape[0] - 1)
cov_matriz

array([[ 1.00671141, -0.11835884,  0.87760447,  0.82343066],
       [-0.11835884,  1.00671141, -0.43131554, -0.36858315],
       [ 0.87760447, -0.43131554,  1.00671141,  0.96932762],
       [ 0.82343066, -0.36858315,  0.96932762,  1.00671141]])

In [14]:
eig_vals, eig_vec = np.linalg.eig(cov_matriz)
eig_vals, eig_vec

(array([2.93808505, 0.9201649 , 0.14774182, 0.02085386]),
 array([[ 0.52106591, -0.37741762, -0.71956635,  0.26128628],
        [-0.26934744, -0.92329566,  0.24438178, -0.12350962],
        [ 0.5804131 , -0.02449161,  0.14212637, -0.80144925],
        [ 0.56485654, -0.06694199,  0.63427274,  0.52359713]]))

### b) Correlacion

In [15]:
corr_matriz = np.corrcoef(x_std.T)
corr_matriz

array([[ 1.        , -0.11756978,  0.87175378,  0.81794113],
       [-0.11756978,  1.        , -0.4284401 , -0.36612593],
       [ 0.87175378, -0.4284401 ,  1.        ,  0.96286543],
       [ 0.81794113, -0.36612593,  0.96286543,  1.        ]])

In [21]:
eig_vals, eig_vect = np.linalg.eig(corr_matriz)
eig_vals, eig_vect

(array([2.91849782, 0.91403047, 0.14675688, 0.02071484]),
 array([[ 0.52106591, -0.37741762, -0.71956635,  0.26128628],
        [-0.26934744, -0.92329566,  0.24438178, -0.12350962],
        [ 0.5804131 , -0.02449161,  0.14212637, -0.80144925],
        [ 0.56485654, -0.06694199,  0.63427274,  0.52359713]]))

### c) singluar value Decomposition

In [20]:
u, s, v = np.linalg.svd(x_std.T)
u, s

(array([[-0.52106591, -0.37741762,  0.71956635,  0.26128628],
        [ 0.26934744, -0.92329566, -0.24438178, -0.12350962],
        [-0.5804131 , -0.02449161, -0.14212637, -0.80144925],
        [-0.56485654, -0.06694199, -0.63427274,  0.52359713]]),
 array([20.92306556, 11.7091661 ,  4.69185798,  1.76273239]))

## 2- Seleccionar las componentes principales

In [22]:
for ev in eig_vect:
    print('La longitud del autovector es: %s'%(np.linalg.norm(ev)))

La longitud del autovector es: 0.9999999999999996
La longitud del autovector es: 1.0
La longitud del autovector es: 0.9999999999999999
La longitud del autovector es: 1.0000000000000002


In [27]:
eig_pairs = [([(np.abs(eig_vals[i])), eig_vect[:,i]]) for i in range(len(eig_vect))]
eig_pairs

[[2.918497816531996,
  array([ 0.52106591, -0.26934744,  0.5804131 ,  0.56485654])],
 [0.9140304714680699,
  array([-0.37741762, -0.92329566, -0.02449161, -0.06694199])],
 [0.1467568755713154,
  array([-0.71956635,  0.24438178,  0.14212637,  0.63427274])],
 [0.020714836428619078,
  array([ 0.26128628, -0.12350962, -0.80144925,  0.52359713])]]

In [29]:
for ep in eig_pairs:
    print('autovalor: %s\tautovector: %s'%(ep[0], ep[1]))

autovalor: 2.918497816531996	autovector: [ 0.52106591 -0.26934744  0.5804131   0.56485654]
autovalor: 0.9140304714680699	autovector: [-0.37741762 -0.92329566 -0.02449161 -0.06694199]
autovalor: 0.1467568755713154	autovector: [-0.71956635  0.24438178  0.14212637  0.63427274]
autovalor: 0.020714836428619078	autovector: [ 0.26128628 -0.12350962 -0.80144925  0.52359713]


In [34]:
total_sum = sum(eig_vals)
var_exp = [i/total_sum*100 for i in eig_vals]
print('La varianza explicativa es %s'%(var_exp))
cum_var_sum = np.cumsum(var_exp)

La varianza explicativa es [72.9624454132999, 22.85076178670175, 3.6689218892828848, 0.517870910715477]


In [39]:
plot1 = Bar(x=['CP %s'%i for i in range(1,5)], y=var_exp, showlegend=False)
plot2 = Scatter(x=['CP %s'%i for i in range(1,5)], y=cum_var_sum, showlegend=True, name='Porcentaje de Varianza acumulada explicada'
        )
data = Data([plot1, plot2])

layout = Layout(xaxis= XAxis(title='Componentes principales'),
                yaxis= YAxis(title='Proncentaje de varianza explicada'),
               title='Porcentaje de variabilidad explicada por cada componente principal')

fig = Figure(data = data, layout = layout)

py.iplot(fig)




## 3- Construccion de la matriz de proyeccion

In [42]:
w = np.hstack([eig_pairs[0][1].reshape(4,1)
              ,eig_pairs[1][1].reshape(4,1)])
w

array([[ 0.52106591, -0.37741762],
       [-0.26934744, -0.92329566],
       [ 0.5804131 , -0.02449161],
       [ 0.56485654, -0.06694199]])

$Y = X \cdot W, x \in M(\mathbb R)_{150,4}, W \in M(\mathbb R)_{4,2}, Y \in M(\mathbb R)_{150,2}$

In [52]:
Y = x_std.dot(w)
results = []

for name in ['setosa', 'virginica', 'versicolor']:
    results.append( Scatter(x=Y[name==y, 0], y=Y[y==name,1], mode='markers', name=name, 
                            marker=Marker(size=12, line=Line(color='rgba(220, 220, 220, .15)', width=.5)),
                           opacity=.7) )
data = Data(results)
layout = Layout( showlegend=True, scene=Scene(xaxis=XAxis(title='Componente principal 1'),
                                          yaxis=YAxis(title='Componente principal 2')))

fig = Figure(data=data, layout=layout)
py.iplot(fig)

(150, 2)