# Project 1: Principal Component Analysis

### Principal component analysis (PCA) is a dimensionality reduction and machine learning method used to simplify a large data set into a smaller set while still maintaining significant patterns and trends.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs
sns.set()

In [4]:
## Now lets import our PCA model package

from sklearn.decomposition import PCA

In [6]:
## First lets define our data

X = np.random.rand(200,50)

In [8]:
## Now lets build a PCA model
## We will build a model similar to how we have build models previously

pcaModel = PCA()
pcaModel.fit(X)

In [10]:
## Now if you type out "pcaModel." you can view all of the availiable parameters that PCA allows you to use to analyze data!
## Lets try out explained_variance_ratio_

pcaModel.explained_variance_ratio_

array([0.04446915, 0.04016937, 0.03900766, 0.03756747, 0.03472532,
       0.03344243, 0.03301668, 0.03207938, 0.03128983, 0.03051777,
       0.02953652, 0.02796888, 0.0267023 , 0.02579987, 0.02561051,
       0.02438182, 0.02419168, 0.02372092, 0.02317847, 0.02193323,
       0.02046194, 0.0199585 , 0.0197713 , 0.01872547, 0.0182916 ,
       0.01774673, 0.01707451, 0.01620256, 0.01594319, 0.0152651 ,
       0.01493623, 0.01466842, 0.0145116 , 0.01349054, 0.01319654,
       0.0128143 , 0.01226401, 0.01201179, 0.01138654, 0.0108798 ,
       0.01080589, 0.00993654, 0.00963203, 0.00936096, 0.00838318,
       0.007707  , 0.00709357, 0.0068303 , 0.00596223, 0.00537838])

In [14]:
## Now lets get the cumulative sum of those explained_variance_ratio_'s
## Notice how it goes to 1.0

pcaModel.explained_variance_ratio_.cumsum()

array([0.04446915, 0.08463851, 0.12364617, 0.16121364, 0.19593896,
       0.22938139, 0.26239807, 0.29447745, 0.32576729, 0.35628506,
       0.38582158, 0.41379046, 0.44049276, 0.46629263, 0.49190314,
       0.51628496, 0.54047664, 0.56419755, 0.58737602, 0.60930925,
       0.62977119, 0.64972969, 0.66950099, 0.68822646, 0.70651806,
       0.7242648 , 0.74133931, 0.75754187, 0.77348506, 0.78875015,
       0.80368638, 0.8183548 , 0.8328664 , 0.84635694, 0.85955348,
       0.87236778, 0.88463179, 0.89664358, 0.90803012, 0.91890992,
       0.92971581, 0.93965235, 0.94928437, 0.95864534, 0.96702852,
       0.97473552, 0.98182909, 0.98865939, 0.99462162, 1.        ])

In [18]:
## Now lets say we only want variance ratios that are above 0.95 from the cumulative sum
## Lets get the number of components needed to get to 0.95 and store the in a variable
## The 0.95 is the energy_value that we define
## The energy value is what we define as users of the model to be an acceptable amount of error. So there is a 5% error from .95

num_comps = np.where(pcaModel.explained_variance_ratio_.cumsum()>0.95)[0][0]

In [20]:
## So it took us a total of 43 componants 

num_comps

43

In [22]:
## We can find the total number of componants we originally had below

pcaModel.n_components_

50

In [32]:
## Lets define our own PCA function
## energy_value is for energy and it will be a value from 0-1
## X is for the dataset we want the model applied to
## We will define a PCA model similar to what we had done in previous labs 

## lets get the number of componants needed similar to above but now use the energy_value variable we defined for the function
## We will then define a second PCA model with the number of componants defined

## We will then transform the new model and assign to X2 and return that variable

def myPCA(energy_value, X):
    m = PCA()
    m.fit(X)
    num_comps = np.where(m.explained_variance_ratio_.cumsum()>energy_value)[0][0]
    
    m = PCA(n_components=num_comps)
    m.fit(X)
    X2 = m.transform(X)

    return X2

In [34]:
## Now lets generate some data and try our new function

X = np.random.rand(200,50)
X2 = myPCA(0.90,X)

In [38]:
## Now it is a numpy array

type(X2)

numpy.ndarray

In [40]:
## Now lets view the shape of X vs X2

X.shape

(200, 50)

In [44]:
## Notice how we have 37 compared to 50. 
## To have 90% energy defined we only needed 37 of the variables compared to all 50 originally

X2.shape

(200, 37)