# Principal component analysis with scikit-learn

Use a principal component analysis on a dataset with 13 features

We'll use the "wine" dataset 

In [None]:
# import standard stuff
import matplotlib.pyplot as plt
import numpy as np

# Import train_test_split function
from sklearn.model_selection import train_test_split

#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

#Import scikit-learn MLP classifier
from sklearn.neural_network import MLPClassifier 

#Import datasets so we can access the "wine" dataset
from sklearn import datasets

# Load the wine dataset; as_frame=True makes the feature table a pandas DataFrame
wine_data = datasets.load_wine(as_frame=True)
df = wine_data.data

# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line after the summary)
df.info()
print("Shape is ",df.shape)

"wine" data are the results of a chemical analysis of wines grown in the same region in Italy. The analysis determined the quantities of __13 constituents__ found in each of __three__ types of wines. There are __178__ samples

In [None]:
# Draw one histogram per feature so the value ranges can be compared by eye
for feature_name, feature_values in df.items():
    feature_values.plot.hist()
    plt.xlabel(feature_name)
    plt.show()

The features have very different ranges of values. To make the task of the PCA easier, we can preprocess the data.

In [None]:
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()

# standard scaler transforms the features such that the mean is 0 and the variance is 1
X = std_scaler.fit_transform(df)

# Check the scaling feature by feature. Each row of X.T is one column of X,
# so zipping it with the column names pairs every feature with its label
# without maintaining a separate index counter by hand.
for col, values in zip(df.columns, X.T):
    plt.hist(values)
    print(f"mean, variance: {np.mean(values):.3} , {np.var(values):.3}")
    plt.xlabel(col)
    plt.show()

In [None]:
# import the PCA method from sklearn
# this will do all the work for us

from sklearn.decomposition import PCA
# n_components sets how many principal components the data is reduced to
pca = PCA(n_components=3)

# Two steps, chained: fit() finds the principal components of X and returns
# the fitted estimator, then transform() re-expresses X using only those
# three components
Xpca = pca.fit(X).transform(X)

# Shapes before and after the reduction
print(X.shape)
print(Xpca.shape)

In [None]:
# Show the three fitted principal components: the directions of maximum
# variance in the 13-dimensional feature space
principal_axes = pca.components_
print(principal_axes)

In [None]:
# Add up the per-component explained-variance ratios to see how much of the
# variation in the original data the kept components account for
explained_fraction = sum(pca.explained_variance_ratio_)
print(f"PCA explains {explained_fraction:.3} of the data variance")


How does the explained variance change with the number of components we allow?

In [None]:
# Candidate component counts: 1 up to the number of features in X.
# Starting at 1 skips the degenerate 0-component fit (which explains nothing),
# and the upper bound is the feature count because PCA cannot keep more
# components than there are features.
nums = np.arange(1, X.shape[1] + 1)
print(nums)

var_ratio = []
for num in nums:
    pca = PCA(n_components=num)

    # we don't need to do the transform step every time:
    # the explained variance is available right after fitting
    pca.fit(X)

    # record the total variance explained for each number of components
    var_ratio.append(np.sum(pca.explained_variance_ratio_))

# plot explained variance vs number of components
plt.figure(figsize=(4, 4))
plt.grid()
plt.plot(nums, var_ratio, marker='o')
plt.xlabel('n_components')
plt.ylabel('Explained variance ratio')
plt.title('n_components vs. Explained Variance Ratio')
plt.show()