In [4]:
# Import required libraries
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import plotly.express as px

# Read the song table; the CSV is expected to sit next to this notebook.
df = pd.read_csv("scaled_dataset.csv")

# Audio-feature columns used for both the clustering and the 2-D projection.
features = [
    'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo',
]

X = df[features]

# Partition the songs into five groups; the fixed seed keeps runs repeatable.
kmeans = KMeans(n_clusters=5, random_state=42)
df['cluster_label'] = kmeans.fit_predict(X)

# Standardise the features, then keep the first two principal components.
# NOTE(review): the file name says the data is already scaled, yet it is
# standardised again here while KMeans above sees it as loaded - confirm intent.
pca_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2)),
])
song_embedding = pca_pipeline.fit_transform(X)

# Plotting frame: 2-D coordinates plus the metadata shown on hover.
# The cluster id is cast to str so plotly treats it as a discrete category.
projection = pd.DataFrame(song_embedding, columns=['x', 'y']).assign(
    title=df['name'],
    artist=df['artists'],
    year=df['year'],
    cluster=df['cluster_label'].astype(str),
)

# Interactive scatter, one colour per cluster.
fig = px.scatter(
    projection,
    x='x',
    y='y',
    color='cluster',
    hover_data=['title', 'artist', 'year'],
    title='🎧 PCA Projection of Songs Clustered by Audio Features',
    color_discrete_sequence=px.colors.qualitative.Set2,
)

fig.update_traces(marker={'size': 8, 'opacity': 0.7})
fig.update_layout(legend_title_text='Cluster')

# Write the figure to a standalone HTML file (it is not rendered inline).
fig.write_html("pca_cluster_plot.html")
print("✅ Plot saved! Open 'pca_cluster_plot.html' in your browser.")






✅ Plot saved! Open 'pca_cluster_plot.html' in your browser.


In [8]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.manifold import TSNE
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Load the per-genre table; the CSV is expected to sit next to this notebook.
df = pd.read_csv("data_by_genres.csv")

# Every numeric column is used as a feature (taken before 'cluster' is added,
# so the cluster label never feeds back into the feature matrix).
X = df.select_dtypes(include=[np.number])

# KMeans is distance-based, so fit it on standardised features. The t-SNE
# pipeline below standardises the same columns; clustering on the raw values
# would put the cluster colours and the embedding in different feature spaces.
X_scaled = StandardScaler().fit_transform(X)
kmeans = KMeans(n_clusters=5, random_state=42)
df['cluster'] = kmeans.fit_predict(X_scaled)

# t-SNE dimensionality reduction pipeline (standardise -> 2-D embedding).
# The iteration count is left at the library default of 1000: the keyword was
# renamed from `n_iter` to `max_iter` in newer scikit-learn releases, so
# omitting it gives the same behaviour on every version.
tsne_pipeline = make_pipeline(
    StandardScaler(),
    TSNE(n_components=2, random_state=42, perplexity=10, verbose=1),
)
tsne_result = tsne_pipeline.fit_transform(X)

# Prepare DataFrame for visualization
projection = pd.DataFrame(tsne_result, columns=['x', 'y'])
projection['genres'] = df['genres']
projection['cluster'] = df['cluster'].astype(str)  # str => discrete colour legend

# Plot with Plotly
fig = px.scatter(
    projection, x='x', y='y',
    color='cluster',
    hover_data=['genres'],
    title='t-SNE Visualization of Genres by Audio Features'
)

# Try showing the plot inline; on failure report why and fall back to HTML.
try:
    fig.show()
except Exception as e:
    print("⚠️ Could not display plot inline. Saving as HTML instead...")
    # Surface the cause so the fallback is not silent.
    print(f"   Reason: {type(e).__name__}: {e}")
    fig.write_html("tsne_genres_plot.html")
    print("✅ Plot saved as 'tsne_genres_plot.html'. Open in a browser.")






[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 2973 samples in 0.004s...
[t-SNE] Computed neighbors for 2973 samples in 0.170s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2973
[t-SNE] Computed conditional probabilities for sample 2000 / 2973
[t-SNE] Computed conditional probabilities for sample 2973 / 2973
[t-SNE] Mean sigma: 0.599325
[t-SNE] KL divergence after 250 iterations with early exaggeration: 84.300743
[t-SNE] KL divergence after 1000 iterations: 1.456463
⚠️ Could not display plot inline. Saving as HTML instead...
✅ Plot saved as 'tsne_genres_plot.html'. Open in a browser.
