## Import the libraries needed

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

## Loading our dataset

In [None]:
# Read the dataset from 'final.csv' (path is relative to the working directory).
df = pd.read_csv('final.csv')
# Bare expression: the notebook displays the number of rows that were loaded.
len(df)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

## Data pre-processing

Make key and mode into categorical values

In [None]:
# Convert the 'key' column to pandas' categorical dtype.
df['key'] = df['key'].astype('category')

In [None]:
# Convert the 'mode' column to pandas' categorical dtype.
df['mode'] = df['mode'].astype('category')

Shuffle the dataframe

In [None]:
# frac=1 samples every row, i.e. returns the whole frame in shuffled order;
# random_state=42 makes the shuffle reproducible between runs.
df = df.sample(frac=1, random_state=42)

Remove duplicates from the dataset

In [None]:
# Drop rows that are exact duplicates of an earlier row (all columns compared).
df = df.drop_duplicates()

# Bare expression: row count after de-duplication.
len(df)

Drop extra columns (by keeping the columns we want)

In [None]:
# Restrict the frame to the listed columns, in this order; every other
# column is dropped. 'preference' is kept as the last column.
# preference col can be used for other analysis
df = df.loc[:, [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'preference',
]]
# Preview the reduced frame.
df.head()

Split the data into features and a target variable, although in this case we will not be using the target variable.

In [None]:
# Feature matrix: every column except the last one (the target).
# The slice starts at column 0 so that the first feature column is included
# and the columns of X line up one-to-one with df.columns[:-1].
# NOTE(review): the resulting array dtype depends on how pandas combines the
# categorical and float columns; scale_features only acts on numeric arrays.
X = df.iloc[:, :-1].values
# Target vector: the last column of the frame.
y = df.iloc[:, -1].values

Define a scaling function for numpy arrays. This function makes sure that our numeric values have a mean of 0 and a standard deviation of 1.

In [None]:
def scale_features(X):
    """Standardise the columns of a 2-D NumPy array in place.

    Every column is shifted to zero mean and divided by its population
    standard deviation (NumPy's default, ``ddof=0``). A constant column has
    a standard deviation of 0; it is only centred (becoming all zeros)
    instead of being divided by zero, which would fill it with NaN.

    All columns of an ndarray share the array's dtype, so an array whose
    dtype is not numeric (e.g. ``object``) is left completely unchanged.
    For integer arrays the scaled values are cast back to the integer dtype
    on assignment; pass a float array to keep the fractional part.

    Args:
        X: 2-D ``numpy.ndarray``. Modified in place.

    Returns:
        None.
    """
    for i in range(X.shape[1]):
        col = X[:, i]
        if not np.issubdtype(col.dtype, np.number):
            continue  # non-numeric data cannot be standardised
        col = col.astype(float)
        centered = col - col.mean()
        std = col.std()
        # Guard against zero variance: dividing by 0 would produce NaN/inf.
        X[:, i] = centered / std if std > 0 else centered

Apply the scaling function to X

In [None]:
# scale_features works in place and returns nothing, so X itself is modified.
scale_features(X)

# Bare expression: show the first five rows of X after the scaling call.
X[:5]

Perform PCA with 2 components

In [None]:
# Fit a 2-component PCA on X and, in the same step, project X onto those
# two components; principal_components has one row per row of X.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X)

Create a scatter plot of the data projected onto the first two principal components

In [None]:
# Scatter the observations in the plane of the first two principal components.
pc1_scores = principal_components[:, 0]
pc2_scores = principal_components[:, 1]
plt.scatter(pc1_scores, pc2_scores)
plt.title("PCA of the data")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.show()

Get the principal axes in feature space

In [None]:
# pca.components_ has shape (n_components, n_features); transposing gives one
# row per input feature holding its weights on the two principal components.
feature_vectors = pca.components_.T

Set scaling factor for feature vectors

In [None]:
# Multiplier applied to the feature vectors (and to the arrow styling) when
# they are drawn; presumably chosen so the arrows are visible on the scatter.
scale_factor = 4

Plot the feature vectors

In [None]:
# Biplot: draw each feature's loading vector on top of the PCA scatter plot.
plt.scatter(principal_components[:, 0], principal_components[:, 1])

# feature_vectors has one row per column of X, in X's column order, and X is
# taken from the trailing feature columns of df (everything before the final
# target column, possibly without some leading columns). Taking the names
# from the tail keeps the labels aligned with the vectors in either case.
feature_names = df.columns[:-1][-len(feature_vectors):]

for name, (dx, dy) in zip(feature_names, feature_vectors * scale_factor):
    plt.arrow(0, 0, dx, dy,
              color='r', alpha=0.5, linewidth=2*scale_factor,
              head_width=0.1*scale_factor, head_length=0.1*scale_factor)
    # Place the label slightly beyond the arrow tip so it does not overlap it.
    plt.text(dx*1.15, dy*1.15, name, color='r', ha='center', va='center')
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title('Revised PCA')
plt.show()

In [None]:
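# plt.savefig('pca.png')
# NOTE: to save the plot, call savefig in the same cell as the plotting code,
# before plt.show(); after show() the current figure is typically empty.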