In [18]:
# Import packages
import numpy as np
import pandas as pd
from sklearn import decomposition, datasets
from sklearn.preprocessing import StandardScaler
import seaborn as sns

# Load features

In [19]:
# Load the breast cancer dataset
dataset = datasets.load_diabetes()

# Load the features
X = dataset.data

In [20]:
# View the shape of the dataset
X.shape

(442, 10)

In [21]:
# View the data
X

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286377, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04687948,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452837, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00421986,  0.00306441]])

In [29]:
diabetes = pd.DataFrame(X)

In [30]:
diabetes.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641


In [33]:
diabetes.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-3.634285e-16,1.308343e-16,-8.045349e-16,1.281655e-16,-8.835316000000001e-17,1.327024e-16,-4.574646e-16,3.777301e-16,-3.830854e-16,-3.412882e-16
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905
min,-0.1072256,-0.04464164,-0.0902753,-0.1123996,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260974,-0.1377672
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665645,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324879,-0.03317903
50%,0.00538306,-0.04464164,-0.007283766,-0.005670611,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947634,-0.001077698
75%,0.03807591,0.05068012,0.03124802,0.03564384,0.02835801,0.02984439,0.0293115,0.03430886,0.03243323,0.02791705
max,0.1107267,0.05068012,0.1705552,0.1320442,0.1539137,0.198788,0.1811791,0.1852344,0.133599,0.1356118


# Standardize Features

In machine learning, we can handle various types of data, e.g. audio signals and pixel values for image data, and this data can include multiple dimensions. Feature standardization makes the values of each feature in the data have zero-mean (when subtracting the mean in the numerator) and unit-variance. This method is widely used for normalization in many machine learning algorithms (e.g., support vector machines, logistic regression, and artificial neural networks) The general method of calculation is to determine the distribution mean and standard deviation for each feature. Next we subtract the mean from each feature. Then we divide the values (mean is already subtracted) of each feature by its standard deviation.

In [34]:
# Create a scaler object
sc = StandardScaler()

# Fit the scaler to the features and transform
X_std = sc.fit_transform(X)

In [35]:
X_std

array([[ 0.80050009,  1.06548848,  1.29708846, ..., -0.05449919,
         0.41855058, -0.37098854],
       [-0.03956713, -0.93853666, -1.08218016, ..., -0.83030083,
        -1.43655059, -1.93847913],
       [ 1.79330681,  1.06548848,  0.93453324, ..., -0.05449919,
         0.06020733, -0.54515416],
       ...,
       [ 0.87686984,  1.06548848, -0.33441002, ..., -0.23293356,
        -0.98558469,  0.32567395],
       [-0.9560041 , -0.93853666,  0.82123474, ...,  0.55838411,
         0.93615545, -0.54515416],
       [-0.9560041 , -0.93853666, -1.53537419, ..., -0.83030083,
        -0.08871747,  0.06442552]])

In [39]:
diabetes = pd.DataFrame(X_std)

In [40]:
diabetes.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.8005,1.065488,1.297088,0.45984,-0.929746,-0.732065,-0.912451,-0.054499,0.418551,-0.370989
1,-0.039567,-0.938537,-1.08218,-0.553511,-0.177624,-0.402886,1.564414,-0.830301,-1.436551,-1.938479
2,1.793307,1.065488,0.934533,-0.119218,-0.958674,-0.718897,-0.680245,-0.054499,0.060207,-0.545154
3,-1.872441,-0.938537,-0.243771,-0.770658,0.256292,0.525397,-0.757647,0.721302,0.477072,-0.196823
4,0.113172,-0.938537,-0.764944,0.45984,0.082726,0.32789,0.171178,-0.054499,-0.672582,-0.980568


In [41]:
diabetes.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-9.544904e-18,-8.389468e-17,2.4113440000000002e-17,2.0596900000000002e-17,-5.927888e-17,-5.450642000000001e-17,5.325052000000001e-17,2.717786e-16,2.951385e-18,-2.0251520000000003e-17
std,1.001133,1.001133,1.001133,1.001133,1.001133,1.001133,1.001133,1.001133,1.001133,1.001133
min,-2.25429,-0.9385367,-1.897929,-2.363066,-2.665411,-2.430626,-2.150883,-1.606102,-2.651046,-2.89639
25%,-0.7841722,-0.9385367,-0.7196249,-0.7706577,-0.7200196,-0.6382488,-0.738296,-0.8303008,-0.6990157,-0.6975491
50%,0.1131724,-0.9385367,-0.1531324,-0.1192178,-0.090841,-0.08029125,-0.1384305,-0.05449919,-0.04094666,-0.02265729
75%,0.8005001,1.065488,0.6569519,0.7493688,0.5961931,0.6274425,0.616239,0.7213025,0.6818695,0.5869224
max,2.327895,1.065488,3.585718,2.776071,3.235851,4.179278,3.809072,3.894331,2.808758,2.851075


# Conduct PCA

In [42]:
# Create a pca object with the 2 components as a parameter
pca = decomposition.PCA(n_components=2)

# Fit the PCA and transform the data
X_std_pca = pca.fit_transform(X_std)

In [43]:
# View the new feature data's shape
X_std_pca.shape

(442, 2)

In [44]:
# View the new feature data
X_std_pca

array([[ 5.87207673e-01, -1.94682793e+00],
       [-2.83161209e+00,  1.37208454e+00],
       [ 2.72147566e-01, -1.63489803e+00],
       [ 4.93100479e-02,  3.82253328e-01],
       [-7.56450708e-01,  8.11967536e-01],
       [-3.96635524e+00, -3.81059274e-01],
       [-1.99378667e+00, -8.05538307e-01],
       [ 2.07586704e+00,  1.82792114e+00],
       [ 6.03032586e-01, -8.81252657e-01],
       [-2.12152622e-01, -4.92904308e-01],
       [-3.85617243e+00, -1.53208409e+00],
       [ 3.96922395e-01, -6.32550732e-01],
       [-1.59638555e+00,  9.81472208e-01],
       [ 3.18454823e-01, -6.51091168e-01],
       [-1.68546709e+00,  1.81794392e+00],
       [ 2.95359300e+00,  9.50059540e-01],
       [ 1.59486580e-01,  9.12321240e-01],
       [ 2.28821309e+00, -1.48600238e-01],
       [-1.15619944e+00, -4.28308767e-01],
       [-1.77157301e+00,  9.28845591e-01],
       [-2.00978128e+00, -2.90005193e-01],
       [-1.93149763e+00, -8.31330715e-01],
       [-1.67616004e+00,  7.19678421e-01],
       [ 3.

In [45]:
diabetes = pd.DataFrame(X_std_pca)

In [46]:
diabetes.head()

Unnamed: 0,0,1
0,0.587208,-1.946828
1,-2.831612,1.372085
2,0.272148,-1.634898
3,0.04931,0.382253
4,-0.756451,0.811968
