In [26]:
# import pandas
import numpy as np

In [27]:
# generate a dummy set
data = np.random.randint(10, 50, 100).reshape(20, 5)

In [28]:
data

array([[42, 17, 49, 28, 19],
       [20, 36, 49, 30, 49],
       [20, 19, 15, 44, 43],
       [34, 22, 29, 43, 24],
       [18, 40, 15, 20, 21],
       [12, 40, 11, 42, 47],
       [43, 49, 26, 31, 11],
       [17, 36, 39, 25, 30],
       [43, 35, 17, 36, 19],
       [32, 13, 21, 15, 38],
       [10, 49, 39, 21, 27],
       [42, 42, 37, 36, 23],
       [20, 24, 41, 47, 35],
       [44, 32, 31, 12, 34],
       [23, 36, 10, 34, 10],
       [46, 26, 11, 42, 38],
       [13, 41, 20, 48, 22],
       [13, 23, 26, 35, 48],
       [29, 11, 29, 11, 25],
       [46, 40, 29, 49, 17]])

In [29]:
# find means for the data/dummy set
data_mean = np.mean(data, axis=0)

In [30]:
data_mean

array([28.35, 31.55, 27.2 , 32.45, 29.  ])

In [31]:
# mean centering the data
data_meaned = data - data_mean

In [32]:
data_meaned

array([[ 13.65, -14.55,  21.8 ,  -4.45, -10.  ],
       [ -8.35,   4.45,  21.8 ,  -2.45,  20.  ],
       [ -8.35, -12.55, -12.2 ,  11.55,  14.  ],
       [  5.65,  -9.55,   1.8 ,  10.55,  -5.  ],
       [-10.35,   8.45, -12.2 , -12.45,  -8.  ],
       [-16.35,   8.45, -16.2 ,   9.55,  18.  ],
       [ 14.65,  17.45,  -1.2 ,  -1.45, -18.  ],
       [-11.35,   4.45,  11.8 ,  -7.45,   1.  ],
       [ 14.65,   3.45, -10.2 ,   3.55, -10.  ],
       [  3.65, -18.55,  -6.2 , -17.45,   9.  ],
       [-18.35,  17.45,  11.8 , -11.45,  -2.  ],
       [ 13.65,  10.45,   9.8 ,   3.55,  -6.  ],
       [ -8.35,  -7.55,  13.8 ,  14.55,   6.  ],
       [ 15.65,   0.45,   3.8 , -20.45,   5.  ],
       [ -5.35,   4.45, -17.2 ,   1.55, -19.  ],
       [ 17.65,  -5.55, -16.2 ,   9.55,   9.  ],
       [-15.35,   9.45,  -7.2 ,  15.55,  -7.  ],
       [-15.35,  -8.55,  -1.2 ,   2.55,  19.  ],
       [  0.65, -20.55,   1.8 , -21.45,  -4.  ],
       [ 17.65,   8.45,   1.8 ,  16.55, -12.  ]])

In [33]:
# find covariance matrix for the centralised mean
cov_matrix = np.cov(data_meaned, rowvar=False)

In [34]:
cov_matrix

array([[171.81842105, -17.36052632,   6.55789474,  -3.95526316,
        -63.36842105],
       [-17.36052632, 130.57631579,  -3.85263158,  23.10789474,
        -44.36842105],
       [  6.55789474,  -3.85263158, 150.27368421, -30.09473684,
          6.47368421],
       [ -3.95526316,  23.10789474, -30.09473684, 144.26052632,
          3.78947368],
       [-63.36842105, -44.36842105,   6.47368421,   3.78947368,
        143.57894737]])

In [35]:
# find eigen vectors and eigen values
eigen_values, eigen_vectors = np.linalg.eigh(cov_matrix)

In [36]:
eigen_values

array([ 58.52949807, 118.27651836, 148.42067005, 190.07890459,
       225.20230367])

In [37]:
eigen_vectors

array([[ 0.44965415,  0.23215721, -0.39674586,  0.22860361, -0.73092222],
       [ 0.56658624, -0.05335457,  0.658934  , -0.46142336, -0.17037542],
       [-0.12073317,  0.59225836,  0.55220742,  0.57417818, -0.00631795],
       [-0.20303391,  0.73576242, -0.1922776 , -0.61647882,  0.02034904],
       [ 0.64883889,  0.2261416 , -0.25786622,  0.15843658,  0.66050808]])

In [38]:
# sort the eigenvalues in desending order
sort_index = np.argsort(eigen_values)[::-1]

In [39]:
sort_eigenvalues = eigen_values[sort_index]

In [40]:
sort_eigenvalues

array([225.20230367, 190.07890459, 148.42067005, 118.27651836,
        58.52949807])

In [41]:
# similarly sort the eigenvectors
sort_eigenvectors = eigen_vectors[:, sort_index]

In [42]:
sort_eigenvectors

array([[-0.73092222,  0.22860361, -0.39674586,  0.23215721,  0.44965415],
       [-0.17037542, -0.46142336,  0.658934  , -0.05335457,  0.56658624],
       [-0.00631795,  0.57417818,  0.55220742,  0.59225836, -0.12073317],
       [ 0.02034904, -0.61647882, -0.1922776 ,  0.73576242, -0.20303391],
       [ 0.66050808,  0.15843658, -0.25786622,  0.2261416 ,  0.64883889]])

In [43]:
# select the first n eigen vectors, n is desired dimension of our final reduced data
n_components = 2 # can select any number of components

In [44]:
eigenvector_subset = sort_eigenvectors[:, 0:n_components]

In [45]:
eigenvector_subset

array([[-0.73092222,  0.22860361],
       [-0.17037542, -0.46142336],
       [-0.00631795,  0.57417818],
       [ 0.02034904, -0.61647882],
       [ 0.66050808,  0.15843658]])

In [46]:
# transform the data
data_reduce = np.dot(eigenvector_subset.transpose(), data_meaned.transpose()).transpose()

In [47]:
data_reduce

array([[-14.33149142,  23.51019843],
       [ 18.36760505,  13.23401495],
       [ 17.80063566,  -8.02516895],
       [ -5.60185563,  -0.56431025],
       [  0.66504145,  -6.86237987],
       [ 22.69673571, -19.97389719],
       [-25.5921316 ,  -7.34977283],
       [  7.97215241,   6.87852139],
       [-17.76420428,  -7.8723508 ],
       [  6.12125118,  18.01738654],
       [  8.81080708,   1.27039795],
       [-15.71023672,   0.78639195],
       [ 11.56147422,   1.47941776],
       [ -8.65320735,  18.95105786],
       [ -9.25718054, -17.11806518],
       [ -5.71393658,  -7.16737665],
       [  5.34796862, -22.69890082],
       [ 25.28549106,   1.18536461],
       [ -0.06377624,  23.2540876 ],
       [-21.94114208, -10.9346165 ]])