### Example: principal component analysis (PCA) on an ordinary tabular dataset (California housing)

In [41]:
# Libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

In [42]:
# Dataset
# Keep the raw sklearn Bunch under its own name so that `data` always refers to the feature DataFrame.
housing = fetch_california_housing()
data = pd.DataFrame(housing.data, columns = housing.feature_names)

# Scaling
# Standardize before the 'Index' key is added below, so only the real features are scaled.
scaler = StandardScaler()
scaled_data = pd.DataFrame(scaler.fit_transform(data), columns = [f'{col}_scaled' for col in data.columns])

# Indexing
# Both frames share the same row order, so a positional counter works as the join key.
data['Index'] = range(len(data))
scaled_data['Index'] = range(len(scaled_data))

# Merge
# `validate` makes the merge fail loudly if 'Index' is ever duplicated on either side.
analysis_data = pd.merge(data, scaled_data, on = 'Index', how = 'inner', validate = 'one_to_one')
analysis_data

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Index,MedInc_scaled,HouseAge_scaled,AveRooms_scaled,AveBedrms_scaled,Population_scaled,AveOccup_scaled,Latitude_scaled,Longitude_scaled
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,0,2.344766,0.982143,0.628559,-0.153758,-0.974429,-0.049597,1.052548,-1.327835
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,1,2.332238,-0.607019,0.327041,-0.263336,0.861439,-0.092512,1.043185,-1.322844
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,2,1.782699,1.856182,1.155620,-0.049016,-0.820777,-0.025843,1.038503,-1.332827
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3,0.932968,1.856182,0.156966,-0.049833,-0.766028,-0.050329,1.038503,-1.337818
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,4,-0.012881,1.856182,0.344711,-0.032906,-0.759847,-0.085616,1.038503,-1.337818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,20635,-1.216128,-0.289187,-0.155023,0.077354,-0.512592,-0.049110,1.801647,-0.758826
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,20636,-0.691593,-0.845393,0.276881,0.462365,-0.944405,0.005021,1.806329,-0.818722
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,20637,-1.142593,-0.924851,-0.090318,0.049414,-0.369537,-0.071735,1.778237,-0.823713
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,20638,-1.054583,-0.845393,-0.040211,0.158778,-0.604429,-0.091225,1.778237,-0.873626


In [155]:
# PCA fitting
# Select the standardized columns by their '_scaled' suffix (a suffix test avoids matching
# a column that merely contains the substring somewhere else in its name).
feature_names = [col for col in analysis_data.columns if col.endswith('_scaled')]
scaled_values = analysis_data[feature_names].values
pca = PCA()
pca.fit(scaled_values)

# Component labels are taken from the fitted model, so they stay correct even when the
# number of components differs from the number of features.
pc_names = [f'PC{i+1}' for i in range(pca.n_components_)]

# Principal component score
PC_score_data = pd.DataFrame(pca.transform(scaled_values), columns = pc_names)
# Carry over the 'Index' key from "analysis_data" so the scores can later be merged back on it.
PC_score_data['Index'] = analysis_data['Index'].values

# Contribution ratio
Contribution_data = pd.DataFrame(pca.explained_variance_ratio_, index = pc_names, columns = ['Contribution_ratio'])
Contribution_data['Cumulative_Contribution_ratio'] = np.cumsum(Contribution_data['Contribution_ratio'].values)
Contribution_data = Contribution_data.reset_index(names = 'Principal_component')

# Eigenvector
# pca.components_ has one row per component; transpose so rows are features and columns are components.
Eigenvector_data = pd.DataFrame(pca.components_.T, columns = pc_names, index = feature_names)
Eigenvector_data = Eigenvector_data.reset_index(names = 'feature_name')

# Loading (Based on scaling of features)
# Each eigenvector column is multiplied by the square root of its component's explained variance.
Loading_data = pd.DataFrame(pca.components_.T * np.sqrt(pca.explained_variance_), index = feature_names, columns = pc_names)
Loading_data = Loading_data.reset_index(names = 'feature_name')

In [157]:
# Principal component scores: one row per sample, one column per component, plus the 'Index' key.
PC_score_data

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,Index
0,1.882704,-0.503362,-0.314134,-2.555093,0.128040,0.485177,0.091226,-0.137812,0
1,1.371120,-0.121406,1.905258,-1.872984,-0.267430,0.605123,0.263086,-0.182221,1
2,2.086868,-0.501136,-0.937245,-2.100133,0.370032,1.158676,-0.295159,0.169800,2
3,1.575801,-1.239495,-1.025957,-1.286721,0.318956,0.977902,0.091039,-0.080535,3
4,1.591206,-1.345264,-1.249171,-0.450778,0.293612,0.865664,-0.159976,0.227928,4
...,...,...,...,...,...,...,...,...,...
20635,1.386779,-1.296137,0.054262,1.063026,-0.163868,-0.738947,-0.470517,-0.386192,20635
20636,1.875219,-0.671533,0.155700,0.644612,-0.208383,-1.262269,-0.321725,-0.401329,20636
20637,1.402357,-1.096426,0.567135,1.090397,-0.326347,-1.052189,-0.420289,-0.290166,20637
20638,1.542943,-1.059408,0.365472,0.968799,-0.323242,-1.121581,-0.349867,-0.304425,20638


In [159]:
# Explained-variance ratio of each principal component and its cumulative sum.
Contribution_data

Unnamed: 0,Principal_component,Contribution_ratio,Cumulative_Contribution_ratio
0,PC1,0.253369,0.253369
1,PC2,0.235162,0.488531
2,PC3,0.158886,0.647417
3,PC4,0.12888,0.776297
4,PC5,0.125382,0.901679
5,PC6,0.082423,0.984102
6,PC7,0.0102,0.994302
7,PC8,0.005698,1.0


In [121]:
# Eigenvectors: rows are the scaled features, columns are the principal components.
Eigenvector_data

Unnamed: 0,feature_name,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8
0,MedInc_scaled,0.077462,0.229879,0.218222,-0.885294,-0.002603,0.149818,0.169238,-0.242271
1,HouseAge_scaled,0.007318,-0.260786,-0.627551,-0.109898,0.222936,0.683931,-0.081035,-0.044846
2,AveRooms_scaled,0.439142,0.546687,-0.063999,-0.046402,0.029852,0.095054,-0.530662,0.458814
3,AveBedrms_scaled,0.399996,0.50185,-0.191848,0.338144,0.050922,0.086854,0.531566,-0.379875
4,Population_scaled,-0.171783,0.100626,0.650375,0.283207,0.008078,0.675226,-0.031676,-0.01317
5,AveOccup_scaled,-0.014644,0.002621,0.157727,0.004918,0.972669,-0.16941,0.003246,0.008959
6,Latitude_scaled,0.568364,-0.374011,0.1926,0.062214,-0.022761,-0.042069,-0.431797,-0.554415
7,Longitude_scaled,-0.536974,0.422387,-0.180824,0.059846,0.011807,-0.082551,-0.461698,-0.526165


In [160]:
# Loadings: eigenvectors multiplied by the square root of each component's explained variance.
Loading_data

Unnamed: 0,feature_name,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8
0,MedInc_scaled,0.110286,0.31531,0.246036,-0.89895,-0.002607,0.121659,0.048346,-0.051728
1,HouseAge_scaled,0.01042,-0.357704,-0.707535,-0.111593,0.223282,0.555381,-0.023149,-0.009575
2,AveRooms_scaled,0.625227,0.749856,-0.072156,-0.047117,0.029898,0.077188,-0.151593,0.097962
3,AveBedrms_scaled,0.569491,0.688356,-0.2163,0.34336,0.051001,0.070529,0.151851,-0.081108
4,Population_scaled,-0.244575,0.138022,0.733268,0.287575,0.00809,0.548311,-0.009049,-0.002812
5,AveOccup_scaled,-0.02085,0.003595,0.17783,0.004994,0.974178,-0.137568,0.000927,0.001913
6,Latitude_scaled,0.809206,-0.513007,0.217147,0.063174,-0.022796,-0.034162,-0.12335,-0.118374
7,Longitude_scaled,-0.764514,0.579362,-0.203871,0.06077,0.011825,-0.067035,-0.131892,-0.112342
