In [66]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from keras import layers
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn.decomposition import PCA

In [67]:
# Read the two source tables and tag every row with its origin in a new
# 'type' column: 0 for rows from top200.csv, 1 for rows from bottom200.csv.
top = pd.read_csv('top200.csv').assign(type=0)
bottom = pd.read_csv('bottom200.csv').assign(type=1)

# Stack both tables under a fresh 0..n-1 index, one-hot encode 'genre', and
# remove the title/artist/rank/lyrics/id columns from the combined frame.
df = (
    pd.concat([top, bottom], ignore_index=True)
    .pipe(pd.get_dummies, columns=['genre'])
    .drop(columns=['title', 'artist', 'rank', 'lyrics', 'id'])
)

In [68]:
# Separate the label from the features. `pop` removes 'type' from df in place,
# so after this line df holds feature columns only.
Y = df.pop('type')
x = df.to_numpy()  # plain array of the remaining feature columns

# Rescale every feature column with a min-max scaler.
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)

# Wrap the scaled array in a DataFrame (columns become positional integers)
# and replace any remaining missing values with 0.
X = pd.DataFrame(x_scaled).fillna(0)

In [69]:
# A fractional n_components asks PCA to keep as many components as are needed
# to reach that share of explained variance; fit() returns the fitted estimator.
pca = PCA(n_components=0.95).fit(X)
print(f'Number of components that can retain 95% of the variance: {pca.n_components_}')

Number of components that can retain 95% of the variance: 14


In [70]:
# Build a summary with one row per retained principal component: its label,
# its share of explained variance, and the original feature that carries the
# largest absolute loading on that component.
n_pcs = pca.components_.shape[0]

# The feature names must line up one-to-one with the columns PCA was fitted on.
# This cell rebinds `df` to the summary table at the end, so re-running it (or
# running it out of order) would otherwise look names up in the wrong column
# list. Fail loudly instead of producing wrong labels.
initial_feature_names = df.columns
assert len(initial_feature_names) == pca.components_.shape[1], (
    f'df has {len(initial_feature_names)} columns but PCA was fitted on '
    f'{pca.components_.shape[1]} features; re-run the notebook from the data-loading cell'
)

# get the index of the most important feature on EACH component
# (row-wise argmax over the absolute loadings)
most_important = list(np.abs(pca.components_).argmax(axis=1))
most_important_names = [initial_feature_names[idx] for idx in most_important]

rows = [
    [f'PC{i}', f'{ratio*100:.1f}%', name]
    for i, (ratio, name) in enumerate(zip(pca.explained_variance_ratio_, most_important_names))
]
# NOTE: `df` now refers to the summary table, not the feature table.
df = pd.DataFrame(rows, columns=['Component', 'Variance ratio', 'Most important feature'])

In [71]:
# Display the per-component summary table that the previous cell bound to `df`.
df

Unnamed: 0,Component,Variance ratio,Most important feature
0,PC0,20.7%,genre_rap
1,PC1,13.6%,autumn
2,PC2,13.2%,spring
3,PC3,10.6%,mode
4,PC4,10.2%,autumn
5,PC5,7.5%,summer
6,PC6,4.6%,key
7,PC7,4.3%,key
8,PC8,3.8%,genre_rock
9,PC9,2.2%,valence
