<a href="https://colab.research.google.com/github/LNickelsburg/clusterify/blob/main/clusterify.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Attach Google Drive at the location the dataset CSV is read from further down.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

## prep the dataset

In [4]:
# Columns to load from the CSV: the identifying/label columns first,
# followed by the per-track columns that are later used as features.
columns = [
    'id',
    'users',
    'title',
    'artist',
] + [
    'release_date',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'explicit',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'popularity',
    'mode',
    'speechiness',
    'tempo',
    'time_signature',
    'valence',
]

In [5]:
# Read the CSV from the mounted Drive, keeping only the columns listed in `columns`.
dataset_path = '/content/drive/MyDrive/Colab Notebooks/spotify_app/dataset.csv'
dataset = pd.read_csv(dataset_path, usecols=columns)

In [51]:
# Clean the table in two passes:
#   1. drop columns that are completely empty (e.g. when one of the API calls
#      did not work right),
#   2. drop every row that is still missing any value.
dataset = (
    dataset
    .dropna(axis='columns', how='all')
    .dropna(axis='index', how='any')
)

In [9]:
# Turn release date into a year integer (first four characters of the date).
#
# This is done as a single vectorized assignment rather than a positional
# `enumerate` loop writing through `.loc[i, ...]`: rows were dropped above
# without resetting the index, so positions no longer match index labels and
# the loop would write years to the wrong rows and append new, mostly-NaN rows.
# Casting to str first also makes the cell safe to re-run after the column
# has already been converted to integers.
dataset['release_date'] = (
    dataset['release_date']
    .astype(str)
    .str[:4]
    .astype(int)
)

In [7]:
# Split the table into numeric features, user labels, and title/artist labels.

# Columns fed to scaling, clustering and PCA.
variables = [
    'release_date',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'explicit',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'popularity',
    'mode',
    'speechiness',
    'tempo',
    'time_signature',
    'valence',
]

features = dataset[variables]
users = dataset['users']
# Selecting several columns needs a list; dataset['title', 'artist'] would be
# looked up as one tuple-named column and raise KeyError.
titles = dataset[['title', 'artist']]

In [10]:
# Standardize every feature column; the fitted scaler is kept in `scaler`.
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

In [11]:
# Each `users` entry is a comma-separated string of names; split every entry,
# strip surrounding single quotes, and collect all names into one flat list.
flattened_users = [
    name.strip("'")
    for entry in users
    for name in entry.split(',')
]

# Sorted array of the distinct user names.
unique_users = np.unique(flattened_users)

In [None]:
# Assign each user a distinct colour, evenly spaced along the rainbow colormap.
color_positions = np.linspace(0, 1, len(unique_users))
colors = plt.cm.rainbow(color_positions)
user_colors = {name: rgba for name, rgba in zip(unique_users, colors)}
print(user_colors)

In [61]:
# Optional: override the automatically assigned colours for specific users.
user_colors.update({
    'Justin': 'blue',
    'Leah': 'coral',
})

## distribution visualization (just for funsies)

In [None]:
# Distribution of every standardized feature, one overlaid histogram per user.

feature_names = features.columns
num_features = scaled_features.shape[1]

ncols = 3
# Ceiling division so the grid has room for every feature for any `ncols`.
nrows = -(-num_features // ncols)

# squeeze=False keeps `axes` two-dimensional even for a 1x1 grid, so ravel() is always valid.
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 3 * nrows), squeeze=False)
axes = axes.ravel()

# Shared bin edges across all features so the panels are directly comparable.
min_feature_value = np.min(scaled_features)
max_feature_value = np.max(scaled_features)
num_bins = 50
bins = np.linspace(min_feature_value, max_feature_value, num_bins + 1)

# Build one row mask per user, once, outside the plotting loops.
# A `users` entry may list several comma-separated names, so membership is
# tested against the split names (same parsing as used to build unique_users);
# comparing the whole string to a single name would silently drop every
# track that is shared between users.
names_per_row = [[name.strip("'") for name in entry.split(',')] for entry in users]
user_masks = {
    user: np.array([user in names for names in names_per_row], dtype=bool)
    for user in unique_users
}

# Plot histograms for each feature
for i in range(num_features):
    ax = axes[i]
    for user in unique_users:
        # Values of feature i for the rows belonging to the current user
        user_features = scaled_features[user_masks[user], i]

        ax.hist(user_features, bins=bins, color=user_colors[user], alpha=0.5,
                edgecolor='black', label=user)

    ax.set_title(f'Distribution of {feature_names[i]}')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.legend(loc='upper right', fontsize='x-small')

# Hide any unused axes left over at the end of the grid
for unused_ax in axes[num_features:]:
    fig.delaxes(unused_ax)

# Adjust layout and show plot
plt.tight_layout()
plt.show()

# K-Means and PCA

In [14]:
# Apply K-Means clustering to the standardized features.
# n_clusters: number of clusters to look for (tune as needed).
# n_init: number of random restarts; the best run is kept.
# random_state: fixed seed so the cluster assignment is the same on every
#               Restart & Run All (K-Means initialisation is random otherwise).
kmeans = KMeans(n_clusters=2, n_init=1000, random_state=42)
kmeans.fit(scaled_features)
labels = kmeans.labels_


In [15]:
# Dimensionality reduction for visualization: project the standardized
# features onto their first three principal components.
# random_state pins the result on scikit-learn versions where the automatic
# solver choice falls back to randomized SVD; it is ignored by exact solvers.
pca = PCA(n_components=3, random_state=42)
reduced_features = pca.fit_transform(scaled_features)


In [None]:
# 3D scatter of the PCA projection, coloured by K-Means cluster label.
fig = plt.figure(figsize=(14, 7))
ax = fig.add_subplot(111, projection='3d')

# One coordinate array per principal component.
pc1, pc2, pc3 = reduced_features.T
ax.scatter(pc1, pc2, pc3, c=labels, alpha=0.7)

ax.set_title('3D PCA K-Means Cluster Visualization')
ax.set_xlabel('PCA Feature 1')
ax.set_ylabel('PCA Feature 2')
ax.set_zlabel('PCA Feature 3')

# Camera position for the 3D view.
ax.view_init(elev=0, azim=90)

plt.show()

In [None]:
# 3D scatter of the PCA projection, one colour per user.
fig = plt.figure(figsize=(14, 7))
ax = fig.add_subplot(111, projection='3d')

# A `users` entry may list several comma-separated names. Split each entry
# (same parsing as used to build unique_users) so that a track shared between
# users is drawn for each of them; comparing the whole string to one name
# would leave shared tracks out of the plot entirely.
names_per_row = [[name.strip("'") for name in entry.split(',')] for entry in users]

for user in unique_users:
    # Boolean row mask selecting the tracks that belong to the current user
    user_mask = np.array([user in names for names in names_per_row], dtype=bool)
    user_features = reduced_features[user_mask]

    ax.scatter(user_features[:, 0], user_features[:, 1], user_features[:, 2],
               color=user_colors[user], label=user, alpha=0.7)

ax.view_init(elev=30, azim=-45)
ax.set_xlabel('PCA Feature 1')
ax.set_ylabel('PCA Feature 2')
ax.set_zlabel('PCA Feature 3')
ax.set_title('3D PCA Visualization, By User')

# Legend placed outside the plot area; attached to this axes explicitly
# rather than relying on pyplot's "current axes" state.
ax.legend(title="User Names", bbox_to_anchor=(1.05, 1), loc='upper left')

plt.show()


# PCA: explained variance and feature loadings

In [None]:
# Names for the three principal components, shared by both tables below.
component_names = ['PC1', 'PC2', 'PC3']

# The projected data as a labelled table.
principal_components = pd.DataFrame(data=reduced_features, columns=component_names)

# Fraction of total variance captured by each component.
explained_variance = pca.explained_variance_ratio_
print(f"Explained variance by component: {explained_variance}")

# Loadings: component directions scaled by the square root of each component's
# variance, arranged with one row per original feature.
component_scale = np.sqrt(pca.explained_variance_)
loadings = pca.components_.T * component_scale
loading_matrix = pd.DataFrame(loadings, columns=component_names, index=features.columns)
print(loading_matrix)

In [None]:
# Rank features by the magnitude of their loading on each component
# (sign is discarded; largest absolute loading first).
absolute_loadings = loading_matrix.abs()
ranked_PC1 = absolute_loadings.sort_values(by='PC1', ascending=False)
ranked_PC2 = absolute_loadings.sort_values(by='PC2', ascending=False)
ranked_PC3 = absolute_loadings.sort_values(by='PC3', ascending=False)

# Heading / ranking pairs, printed in component order.
ranking_reports = [
    ("Features sorted by their impact on PC1:", ranked_PC1['PC1']),
    ("\nFeatures sorted by their impact on PC2:", ranked_PC2['PC2']),
    ("\nFeatures sorted by their impact on PC3:", ranked_PC3['PC3']),
]
for heading, ranking in ranking_reports:
    print(heading)
    print(ranking)