# Análisis de Componentes Principales

In [29]:
import os

import numpy as np
import pandas as pd
import chart_studio.plotly as py
from plotly.graph_objs import *
import chart_studio.tools as tls

In [2]:
# Load the Iris measurements from the CSV stored next to the notebook.
df = pd.read_csv(filepath_or_buffer='./datasets/iris/iris.csv')

In [30]:
# Chart Studio credentials are read from the environment so the API key is
# never stored in (or committed with) the notebook. A KeyError here means the
# two variables below were not exported before the kernel was started.
# NOTE(review): the key that was previously hard-coded in this cell has been
# exposed and should be regenerated in the Chart Studio account settings.
tls.set_credentials_file(
    username=os.environ['CHART_STUDIO_USERNAME'],
    api_key=os.environ['CHART_STUDIO_API_KEY'],
)

In [3]:
# Preview the first five rows to check the column layout.
df.head(n=5)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
# Feature matrix: the first four columns (positions 0-3) as a NumPy array.
# to_numpy() is used instead of .values because it always returns an ndarray,
# whatever dtype pandas inferred for the columns.
x = df.iloc[:, :4].to_numpy()
# Label vector: the fifth column (position 4), which holds the species name.
y = df.iloc[:, 4].to_numpy()

In [5]:
# Show the first sample (one row of the feature matrix) as a sanity check.
x[0, :]

array([5.1, 3.5, 1.4, 0.2])

In [36]:
# Overlaid histograms of the four raw measurements, split by species.
# Each feature gets its own x-axis (x1..x4) laid out side by side; no trace
# sets a y-axis, so all of them are drawn against the single y-axis below.
colors = {
    'setosa': 'rgb(255,127,20)',
    'versicolor': 'rgb(31, 220, 123)',
    'virginica': 'rgb(44, 50, 180)'
}

traces = []
for col in range(4):
    for key in colors:
        traces.append(Histogram(
            x=x[y == key, col],
            opacity=.7,
            xaxis='x%s' % (col + 1),
            # A plain dict stands in for the deprecated graph_objs.Marker.
            marker=dict(color=colors[key]),
            name=key,
            # Only the first panel feeds the legend; otherwise every species
            # would be listed once per feature.
            showlegend=(col == 0),
        ))
# A plain list replaces the deprecated graph_objs.Data wrapper; the `data`
# name is kept because the original cell defined it.
data = traces

# Axis settings are plain dicts instead of the deprecated XAxis/YAxis classes.
# The four domains place the panels next to each other across the figure.
layout = Layout(
    barmode='overlay',
    xaxis=dict(domain=[0, 0.25], title='longitud sepalos [cm]'),
    xaxis2=dict(domain=[0.3, .5], title='ancho de sepalos [cm]'),
    xaxis3=dict(domain=[0.55, .75], title='longitud de petalos [cm]'),
    xaxis4=dict(domain=[0.8, 1], title='ancho de petalos [cm]'),
    yaxis=dict(title="Número de ejemplares"),
    title='Distribuciones'
)
fig = Figure(data=data, layout=layout)
py.iplot(fig)

In [37]:
from sklearn.preprocessing import StandardScaler

In [39]:
# Standardize each feature column (zero mean, unit variance) so that no
# measurement dominates the covariance structure because of its scale.
scaler = StandardScaler()
x_std = scaler.fit_transform(x)

In [41]:
# Overlaid histograms of the four standardized features, split by species.
# Each feature gets its own x-axis (x1..x4) laid out side by side; no trace
# sets a y-axis, so all of them are drawn against the single y-axis below.
colors = {
    'setosa': 'rgb(255,127,20)',
    'versicolor': 'rgb(31, 220, 123)',
    'virginica': 'rgb(44, 50, 180)'
}

traces = []
for col in range(4):
    for key in colors:
        traces.append(Histogram(
            x=x_std[y == key, col],
            opacity=.7,
            xaxis='x%s' % (col + 1),
            # A plain dict stands in for the deprecated graph_objs.Marker.
            marker=dict(color=colors[key]),
            name=key,
            # Only the first panel feeds the legend; otherwise every species
            # would be listed once per feature.
            showlegend=(col == 0),
        ))
# A plain list replaces the deprecated graph_objs.Data wrapper; the `data`
# name is kept because the original cell defined it.
data = traces

# Axis settings are plain dicts instead of the deprecated XAxis/YAxis classes.
# The titles no longer claim centimetres: x_std holds standardized values,
# which are unitless.
layout = Layout(
    barmode='overlay',
    xaxis=dict(domain=[0, 0.25], title='longitud sepalos (estandarizada)'),
    xaxis2=dict(domain=[0.3, .5], title='ancho de sepalos (estandarizado)'),
    xaxis3=dict(domain=[0.55, .75], title='longitud de petalos (estandarizada)'),
    xaxis4=dict(domain=[0.8, 1], title='ancho de petalos (estandarizado)'),
    yaxis=dict(title="Número de ejemplares"),
    title='Distribuciones'
)
fig = Figure(data=data, layout=layout)
py.iplot(fig)

## Calculamos la descomposición en autovalores y autovectores
### Definimos la matriz de covarianza como: 
#### $\sigma_{jk} = \frac{1}{n-1}\sum_{i=1}^{n} (x_{ij}-\overline{x}_j)(x_{ik}-\overline{x}_k)$
 
#### $\Sigma=\frac{1}{n-1}(X-\overline{x})^T(X-\overline{x})$

### a) Usando matriz de covarianza

In [44]:
# Per-feature mean of the standardized data (axis=0 averages over the rows).
# It is zero up to floating-point rounding, but is computed explicitly so the
# covariance cell mirrors the formula term by term.
mean_vector = x_std.mean(axis=0)
mean_vector

array([-4.73695157e-16, -7.81597009e-16, -4.26325641e-16, -4.73695157e-16])

In [45]:
# Sample covariance matrix: centered data transposed times itself, divided
# by the number of samples minus one.
centered = x_std - mean_vector
n_samples = x_std.shape[0]
cov_matriz = (centered.T @ centered) / (n_samples - 1)
cov_matriz

array([[ 1.00671141, -0.11835884,  0.87760447,  0.82343066],
       [-0.11835884,  1.00671141, -0.43131554, -0.36858315],
       [ 0.87760447, -0.43131554,  1.00671141,  0.96932762],
       [ 0.82343066, -0.36858315,  0.96932762,  1.00671141]])

In [49]:
# Eigendecomposition of the covariance matrix. np.linalg.eig returns the
# eigenvalues and a matrix whose column i is the eigenvector paired with
# eigenvalue i; the eigenvalues are not guaranteed to come back sorted.
eig_vals, eig_vec = np.linalg.eig(a=cov_matriz)
(eig_vals, eig_vec)

(array([2.93808505, 0.9201649 , 0.14774182, 0.02085386]),
 array([[ 0.52106591, -0.37741762, -0.71956635,  0.26128628],
        [-0.26934744, -0.92329566,  0.24438178, -0.12350962],
        [ 0.5804131 , -0.02449161,  0.14212637, -0.80144925],
        [ 0.56485654, -0.06694199,  0.63427274,  0.52359713]]))