In [1]:
# Import Statements
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Normalization Cheat Sheet
def min_max(x):
    """Rescale ``x`` linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    x : array-like exposing ``.min()`` / ``.max()`` and element-wise arithmetic
        (e.g. a NumPy array or a pandas Series/DataFrame).

    Returns
    -------
    Same kind of object as ``x`` holding ``(x - min) / (max - min)``.

    Notes
    -----
    When ``x.max() == x.min()`` the denominator is zero, so a constant input
    yields NaN/inf values rather than a meaningful scaling.
    """
    return (x - x.min()) / (x.max() - x.min())


def z_score(x):
    """Standardize ``x`` by subtracting its mean and dividing by its standard deviation.

    Parameters
    ----------
    x : array-like exposing ``.mean()`` / ``.std()`` and element-wise arithmetic
        (e.g. a NumPy array or a pandas Series/DataFrame).

    Returns
    -------
    Same kind of object as ``x`` holding ``(x - mean) / std``.

    Notes
    -----
    The standard deviation is whatever ``x.std()`` returns: NumPy defaults to
    the population estimate (``ddof=0``) while pandas defaults to the sample
    estimate (``ddof=1``). A zero standard deviation results in NaN/inf values.
    """
    return (x - x.mean()) / x.std()

In [2]:
# Capture the notebook's current working directory using the IPython %pwd magic.
# The data-loading cell concatenates this value with a relative "/Data/..." path.
working_dir = %pwd

## Reading the Pre-Processed Data File

In [3]:
# Load the pre-processed Spotify dataset (comma-separated) from the Data
# folder located under the working directory
spotify = pd.read_csv(f"{working_dir}/Data/spotify-pre_processed.csv", sep=",")

In [4]:
# Keep only the numeric / encoded columns (text attributes are dropped)
# so the frame can be fed to Principal Component Analysis
spotify_numerical = spotify[
    [
        'acousticness', 'danceability', 'duration_ms', 'energy',
        'instrumentalness', 'key', 'liveness', 'popularity',
        'speechiness', 'tempo', 'valence',
        'explicit_0', 'explicit_1',
        'mode_0', 'mode_1',
    ]
]

In [5]:
from sklearn.decomposition import PCA

# Function: principleComponentAnalysis
# Accepts:  dataset (and an optional cumulative-variance threshold)
# Returns:  top ratios and corresponding feature
#           names for Principal Component Analysis
def principleComponentAnalysis(data, variance_threshold=.95):
    """Select the leading principal components that reach a variance threshold.

    PCA is fitted once with every component retained, then the smallest
    number of leading components whose cumulative explained-variance ratio
    is at least ``variance_threshold`` is kept. If the threshold is never
    reached (e.g. because of floating-point rounding), all components are kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric feature table; ``data.columns`` supplies the feature names.
    variance_threshold : float, default 0.95
        Cumulative explained-variance ratio the kept components must reach.

    Returns
    -------
    pca_ratios : numpy.ndarray
        Explained-variance ratio of each kept component, in component order.
    pca_names : list
        For each kept component, the name of the column with the largest
        absolute loading on that component (a column may appear more than once).

    Raises
    ------
    ValueError
        If ``data`` has no columns.
    """
    if data.shape[1] == 0:
        raise ValueError("data must contain at least one column")

    initial_feature_names = data.columns

    # A single exact fit replaces the previous refit-per-candidate loop: the
    # leading components do not depend on how many are requested, and the
    # full solver is deterministic (no randomized approximation).
    pca = PCA(svd_solver="full")
    pca.fit(data)
    all_ratios = pca.explained_variance_ratio_

    # The cumulative sum is non-decreasing, so searchsorted(side="left") gives
    # the first index whose cumulative ratio is >= the threshold.
    cumulative_variance = np.cumsum(all_ratios)
    n_selected = int(np.searchsorted(cumulative_variance, variance_threshold, side="left")) + 1
    # Cap at the number of available components when the threshold is never met.
    n_selected = min(n_selected, len(all_ratios))

    pca_ratios = all_ratios[:n_selected]

    # https://www.geeksforgeeks.org/how-to-get-column-names-in-pandas-dataframe/
    # Map each kept component to the column carrying its largest absolute loading.
    most_important = np.abs(pca.components_[:n_selected]).argmax(axis=1)
    pca_names = [initial_feature_names[i] for i in most_important]

    return pca_ratios, pca_names

In [6]:
# Run principleComponentAnalysis on the numeric Spotify columns and unpack
# the explained-variance ratios and the matching feature names
pca_ratios, pca_names = principleComponentAnalysis(spotify_numerical)

In [7]:
# Display, for each retained component, the name of the feature with the
# largest absolute loading (the same feature can appear more than once)
pca_names

['mode_0',
 'acousticness',
 'explicit_1',
 'instrumentalness',
 'key',
 'valence',
 'popularity',
 'liveness',
 'popularity']

In [8]:
# Display the explained-variance ratio of each retained component
pca_ratios

array([0.35019289, 0.19765849, 0.10340963, 0.08541465, 0.08234835,
       0.0591023 , 0.03579184, 0.02850354, 0.02039043])

In [9]:
# PCA Summary: how many ratios were returned and the total
# share of variance they add up to
pca_length = len(pca_ratios)
pca_sum = sum(pca_ratios)

print(f"{pca_length} Attributes Accounted For {pca_sum:.2f} Of The Variance")

9 Attributes Accounted For 0.96 Of The Variance
