In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
df = pd.read_csv('../data/cleaned_data.csv')

In [3]:
df.shape

(3900, 103)

In [4]:
# Applying PCA
pca = PCA(n_components=0.95)  # Retain 95% of the variance
principal_components = pca.fit_transform(df)

In [5]:
principal_components

array([[ 1.76192657e+00,  7.03783877e-01,  9.23447540e-01, ...,
         3.30350096e-01, -1.33935131e-01, -8.96089803e-02],
       [ 1.76132760e+00,  7.08913671e-01,  9.00501195e-01, ...,
        -1.78174063e-01, -3.71986956e-01,  9.28746809e-02],
       [ 2.76021407e+00, -1.31587203e+00,  8.57212157e-01, ...,
         1.80048728e-03, -4.85890855e-02, -3.25838320e-02],
       ...,
       [-1.25060294e+00,  6.81703537e-01, -7.54516338e-01, ...,
        -3.47091479e-02,  3.02589593e-02,  4.99312083e-04],
       [ 2.76196642e+00, -1.29906225e+00, -8.01694865e-01, ...,
         3.79329935e-02,  2.46448758e-02,  2.76227538e-02],
       [-1.25002529e+00, -3.53103828e-01, -7.56242695e-01, ...,
        -7.67291516e-03, -7.11619730e-03,  1.75402364e-02]])

In [6]:
# Extracting PCA Loadings
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

# Create a DataFrame for loadings
df_loadings = pd.DataFrame(loadings, index=df.columns)

# Calculate the contribution of each feature to each principal component
contributions = (df_loadings ** 2) / np.sum(df_loadings ** 2, axis=0)
contributions

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,74
Size,7.629023e-05,9.963908e-01,0.000410,0.000289,3.145447e-04,0.000502,2.912706e-05,7.835328e-06,0.000018,0.000186,...,2.052495e-05,4.493253e-06,5.149150e-08,7.738071e-06,2.966949e-07,1.132600e-07,0.000026,1.850718e-06,3.590041e-07,0.000008
Frequency of Purchases,9.993744e-01,7.176369e-05,0.000020,0.000002,9.589926e-07,0.000111,8.729353e-07,2.151835e-07,0.000032,0.000001,...,1.395930e-06,3.876193e-07,9.507600e-08,2.861695e-09,5.479542e-07,1.097012e-09,0.000001,9.042280e-08,9.761328e-07,0.000002
Age,1.628889e-06,7.630045e-09,0.000011,0.000180,6.571559e-04,0.000032,2.003309e-04,8.446813e-04,0.000017,0.000661,...,8.354199e-06,1.008907e-05,1.436648e-04,2.820534e-04,1.795552e-05,4.054785e-05,0.000447,4.359957e-09,1.210338e-04,0.000087
Purchase Amount (USD),7.567833e-06,6.655030e-05,0.000076,0.000122,1.460038e-04,0.000509,3.503663e-04,6.232066e-04,0.000059,0.000171,...,2.759135e-04,1.405057e-07,2.903643e-04,2.138674e-04,1.804018e-07,1.222247e-04,0.000094,1.349161e-05,1.127888e-05,0.000065
Review Rating,5.749895e-08,9.827054e-06,0.000006,0.000489,5.374017e-04,0.000453,5.296666e-04,3.435937e-05,0.000838,0.000284,...,9.293030e-06,1.821882e-04,1.274409e-04,1.559671e-04,2.939116e-05,7.422118e-05,0.000010,3.579081e-10,3.067535e-04,0.000037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Preferred Payment Method_Cash,3.567019e-05,7.178871e-07,0.000036,0.000029,3.955285e-04,0.000925,5.948405e-02,1.712036e-01,0.038744,0.004214,...,7.739918e-05,1.943363e-04,8.654893e-05,2.334302e-04,1.760586e-03,5.673422e-07,0.000122,6.814129e-05,5.578639e-04,0.000243
Preferred Payment Method_Credit Card,7.036182e-06,2.716559e-04,0.000086,0.000766,1.094742e-04,0.000426,2.272419e-01,9.107671e-03,0.000005,0.084445,...,1.859866e-05,1.224608e-04,8.887441e-05,6.680667e-04,1.651212e-03,1.435915e-05,0.000105,3.032573e-04,2.204374e-04,0.000025
Preferred Payment Method_Debit Card,6.580666e-06,2.814063e-06,0.000636,0.000293,1.236785e-03,0.002247,2.681816e-02,1.029629e-03,0.178601,0.059793,...,4.809093e-07,2.678593e-04,5.759130e-04,1.941224e-04,6.816454e-04,1.025278e-04,0.000244,3.281405e-04,6.336801e-04,0.000041
Preferred Payment Method_PayPal,4.250100e-06,2.587400e-05,0.000098,0.000853,2.229835e-05,0.005765,4.694000e-03,3.828643e-01,0.079038,0.013780,...,1.257929e-06,1.906819e-04,2.111507e-04,1.628036e-04,3.356466e-04,4.445165e-05,0.000078,6.808283e-05,7.021585e-04,0.000172


In [7]:
def contribution_of_variables(df_loadings, components):
    fig, axes = plt.subplots(1, len(components), figsize=(18, 6), sharey=True)

    for i, component in enumerate(components):

        df_loadings[component].sort_values(ascending=False).plot(kind='bar', ax=axes[i], color='blue')
        axes[i].set_title(f'PCA Loadings for {component}')
        axes[i].set_xticklabels(df_loadings[component].sort_values(ascending=False).index, rotation=90)
        axes[i].set_xlabel('Variables')
        # axes[i].tick_params(axis='x', rotation=45)
        if i == 0:
            axes[i].set_ylabel('Contribution (squared loading)')

    plt.tight_layout()
    plt.show()

# contribution_of_variables(contributions, ['PC1', 'PC2', 'PC3'])

In [8]:
df_pca = pd.DataFrame(data=principal_components)
df_pca

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,74
0,1.761927,0.703784,0.923448,-0.703802,-0.482955,-0.512190,0.457955,0.456606,-0.054016,-0.488691,...,-0.011698,0.096767,0.054474,-0.034366,-0.081087,-0.461966,0.526520,0.330350,-0.133935,-0.089609
1,1.761328,0.708914,0.900501,-0.725175,-0.480360,-0.477539,0.108898,-0.092907,0.381250,-0.209057,...,-0.079424,0.027069,-0.017905,0.007131,-0.037944,0.055127,-0.035796,-0.178174,-0.371987,0.092875
2,2.760214,-1.315872,0.857212,-0.625507,0.759861,-0.022085,-0.435657,-0.275199,0.934109,0.175561,...,-0.014432,-0.012036,-0.009020,-0.017052,-0.029465,-0.064026,-0.029271,0.001800,-0.048589,-0.032584
3,2.767982,-0.330652,0.917078,0.663350,0.684889,-0.365868,-0.111870,0.304376,-0.151679,0.958880,...,-0.002107,0.001513,0.005047,-0.020453,-0.049834,-0.036849,-0.021017,-0.027048,0.020223,-0.016308
4,-2.245072,-0.366937,0.912211,-0.537935,0.769784,-0.105426,0.122677,0.452266,1.215111,0.599940,...,0.005958,0.029496,-0.024035,-0.029361,0.024316,-0.045236,0.082083,-0.102743,-0.085502,0.022335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3895,2.727080,0.711040,-0.760302,-0.328841,-0.283160,0.921516,-0.163052,-0.249112,0.438930,-0.050999,...,0.000862,0.013599,-0.015568,-0.236284,0.030486,-0.003600,-0.100905,-0.001599,-0.041616,-0.004419
3896,0.749744,0.677484,-0.746270,0.378996,0.811301,-0.134953,0.201183,-0.402300,-0.207162,0.139522,...,-0.069091,0.028216,-0.012260,-0.009505,-0.004656,0.028375,-0.005112,-0.024464,-0.042763,0.052954
3897,-1.250603,0.681704,-0.754516,0.298185,0.770582,-0.161454,0.556479,-0.010511,-0.513466,-0.531917,...,-0.027894,-0.011734,0.009725,0.007925,0.009096,-0.003295,-0.033713,-0.034709,0.030259,0.000499
3898,2.761966,-1.299062,-0.801695,0.904845,-0.310175,0.615629,-0.208204,0.093046,-0.122576,0.186783,...,-0.068000,-0.016146,-0.017891,0.027768,-0.035709,0.027272,-0.011983,0.037933,0.024645,0.027623


In [9]:
df_pca.to_csv('../data/pca_one_hot_encode_data_1.csv', index=False)