# Genre Clustering

In [None]:
import pandas as pd

### Merge Spotify Features with the Lyrics Dataset and Clean Spotify Genres

In [None]:
import statistics as stats
# Load the emotion table and the combined lyrics/Spotify-features table.
song_emotions = pd.read_csv('../data/compressed/songs_emotions.csv')
features_lyrics = pd.read_csv('../data/compressed/lyrics_spotify_features.csv')

# Left join on (title, artist) keeps every row of the features table, with or
# without a matching emotion row; the raw lyrics column is dropped afterwards.
features_lyrics = pd.merge(
    features_lyrics,
    song_emotions,
    how='left',
    on=['title', 'artist'],
).drop(columns=['lyrics'])



# Canonical genre labels that raw Spotify genre tags are collapsed into.
new_genres = ["Alternative", "Blues", "Country", "Dance", "Disco", "EDM", "Electronic", "Folk", "Funk",
              "Hip Hop", "House", "Indie", "Jazz", "Metal", "Motown", "Pop", "R&B", "Rap", "Rock", "Soul"]

# Lower-cased lookup: the last word of a Spotify tag is matched against these keys.
ng_map = {g.lower(): g for g in new_genres}

# Genius tags with an explicit canonical spelling; any other tag falls back to `.title()`.
genius_map = {'rb': 'R&B',
              'rock': 'Rock',
              'pop': 'Pop',
              'rap': 'Rap',}

# Canonical genres that are folded into a broader canonical genre.
merge_map = {"EDM": "Electronic",
             "House": "Electronic",
             "Indie": "Alternative"}

# Genres detected by substring anywhere in the tag, because matching on the
# last word alone would miss them (e.g. "Hip Hop" is two words).
special_genres = ["Hip Hop", "R&B", "Motown"]


def _genius_fallback(genius_genre):
    """Return the canonical label for a Genius tag, or "" when no usable tag exists.

    Non-string values (None, or the float NaN pandas uses for missing cells)
    and empty strings all yield "".
    """
    if not isinstance(genius_genre, str) or not genius_genre:
        return ""
    return genius_map.get(genius_genre.lower(), genius_genre.title())


def get_main_genre(genre, genius_genre):
    """Pick a single canonical genre for a song.

    Parameters
    ----------
    genre : str or other
        Stringified list of Spotify genre tags, e.g. "['dance pop', 'pop']".
        Anything that is not a non-empty string is treated as missing.
    genius_genre : str or other
        Genius genre tag used as a fallback. Non-string or empty values are
        treated as missing.

    Returns
    -------
    str
        The most common canonical genre among the Spotify tags. If no tag maps
        to a canonical genre (or there are no tags), the Genius tag mapped
        through `genius_map` / `.title()`. If that is missing too, "".
    """
    if not genre or not isinstance(genre, str):
        return _genius_fallback(genius_genre)

    # Strip list punctuation and quotes, then split into individual tags.
    cleaned = genre.replace("[", "").replace("]", "").replace("'", "").replace('"', '')
    tags = [tag.strip() for tag in cleaned.split(',') if tag.strip() != '']

    genres_list = []
    for tag in tags:
        genre_clean = tag.replace("_", " ").strip()
        if genre_clean == "":
            continue
        lowered = genre_clean.lower()

        # Substring match for the special genres first; otherwise the last
        # word of the tag decides (e.g. "alternative rock" -> "Rock").
        base_genre = next((s for s in special_genres if s.lower() in lowered), None)
        if base_genre is None:
            base_genre = ng_map.get(lowered.split()[-1])

        # Fold sub-genres into their parent; None and unmerged genres pass through.
        base_genre = merge_map.get(base_genre, base_genre)

        if base_genre is not None:
            genres_list.append(base_genre)

    if genres_list:
        return stats.mode(genres_list)  # most common canonical genre for this song

    # No Spotify tag mapped. Previously this path left the result unbound when
    # the Genius tag was missing as well.
    return _genius_fallback(genius_genre)




# Collapse each song's Spotify tag list (with the Genius tag as fallback)
# into a single canonical genre.
features_lyrics['mapped_genres'] = features_lyrics.apply(
    lambda row: get_main_genre(row['spotify_genre_list'], row['genius_genre']),
    axis=1,
)

# Discard songs whose genre resolved to 'Misc', then drop the columns that are
# no longer needed once the canonical genre exists.
features_lyrics = features_lyrics[features_lyrics['mapped_genres'] != 'Misc'].drop(
    columns=['spotify_genre_list', 'genius_genre', 'top_emotion', 'topEmotionWord']
)

print(features_lyrics['mapped_genres'].value_counts())
features_lyrics.head()
features_lyrics.columns


## K-means Clustering Implementation

### Preparing Clustering

#### Converting emotion_scores into Dictionary

In [3]:
import ast
import numpy as np
def convert_dictionary(x):
    """Parse `x` with `ast.literal_eval` when it is a string; return anything else unchanged."""
    return ast.literal_eval(x) if isinstance(x, str) else x
# Turn the stringified emotion dicts back into real dict objects.
features_lyrics['emotions_scores'] = features_lyrics['emotions_scores'].apply(convert_dictionary)

# 'positive' score per song; rows without a dict, or without that key, get 0.
features_lyrics['positivity'] = features_lyrics['emotions_scores'].apply(
    lambda scores: scores['positive'] if isinstance(scores, dict) and 'positive' in scores else 0
)

# Sanity check: show any rows whose positivity is still missing.
missing_positivity = features_lyrics['positivity'].isna()
features_lyrics[missing_positivity]


Unnamed: 0,title,artist,year,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,emotions_scores,mapped_genres,positivity


#### Fetching Top Sentiment & Cleaning

#### Fetching Emotion Scores & Cleaning

In [None]:
# Expand each song's emotion-score dict into one column per emotion.
# Emotions missing from a song's dict are filled with 0.0.
emotion_df = pd.json_normalize(features_lyrics['emotions_scores']).fillna(0.0)

# Re-attach the song index so the concat below aligns row-for-row.
emotion_df.index = features_lyrics.index

# Drop emotion columns left over from an earlier run of this cell before
# concatenating; otherwise every re-run appends a duplicate set of columns.
# On a fresh kernel nothing is dropped (errors='ignore').
features_lyrics = pd.concat(
    [features_lyrics.drop(columns=emotion_df.columns, errors='ignore'), emotion_df],
    axis=1,
)

features_lyrics[emotion_df.columns]




### Spotify Feature Clusters For Each Genre, Top Sentiment and Top Emotion

In [5]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score



# Columns fed to K-means: Spotify audio descriptors followed by emotion-score columns.
spotify_features_list = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo',
    'anticipation', 'joy', 'surprise', 'trust', 'anger', 'disgust',
    'fear', 'sadness', 'positive',
]

clusters = 2  # upper bound on the number of clusters fitted per genre

for genre, songs_in_genre in features_lyrics.groupby('mapped_genres'):
    samples = len(songs_in_genre)
    if not samples:
        continue

    # Never ask for more clusters than there are songs in the genre.
    k = clusters if clusters <= samples else samples

    # Standardise within the genre so every feature is on the same scale.
    scaler = StandardScaler()
    spotify_features_scaled = scaler.fit_transform(songs_in_genre[spotify_features_list])

    kmeans = KMeans(n_clusters=k, random_state=42)
    label = kmeans.fit_predict(spotify_features_scaled)

    # K-means labels are 0-based; the stored names are 1-based: "<genre>_cluster_<n>".
    features_lyrics.loc[songs_in_genre.index, 'cluster'] = [
        f"{genre}_cluster_{cluster_id + 1}" for cluster_id in label
    ]


            



### Analyzing & Classifying The Clusters

In [6]:
# Distinct cluster labels, in order of first appearance.
cluster_labels = features_lyrics['cluster'].unique()
print("Number of Clusters", cluster_labels.size)
print(cluster_labels)

# Songs per cluster, largest first.
features_lyrics['cluster'].value_counts()


Number of Clusters 34
['Rap_cluster_2' 'Hip Hop_cluster_1' 'Hip Hop_cluster_2' 'Pop_cluster_1'
 'Electronic_cluster_1' 'Rock_cluster_1' 'Rock_cluster_2' 'Soul_cluster_2'
 'Metal_cluster_2' 'Country_cluster_2' 'Pop_cluster_2' 'Folk_cluster_2'
 'Funk_cluster_2' 'Soul_cluster_1' 'Metal_cluster_1' 'Disco_cluster_1'
 'Electronic_cluster_2' 'Rap_cluster_1' 'Blues_cluster_1' 'R&B_cluster_2'
 'R&B_cluster_1' 'Folk_cluster_1' 'Dance_cluster_2'
 'Alternative_cluster_2' 'Disco_cluster_2' 'Funk_cluster_1'
 'Jazz_cluster_1' 'Country_cluster_1' 'Motown_cluster_2'
 'Alternative_cluster_1' 'Jazz_cluster_2' 'Motown_cluster_1'
 'Blues_cluster_2' 'Dance_cluster_1']


cluster
Rock_cluster_2           8025
Pop_cluster_2            5033
Rap_cluster_2            2412
Rock_cluster_1           2340
Pop_cluster_1            2113
Metal_cluster_1          1287
Hip Hop_cluster_2        1098
Soul_cluster_1            598
Hip Hop_cluster_1         589
Country_cluster_1         580
R&B_cluster_2             501
Metal_cluster_2           388
Electronic_cluster_2      357
Folk_cluster_2            346
Folk_cluster_1            228
Country_cluster_2         195
Soul_cluster_2            158
Alternative_cluster_2     144
R&B_cluster_1             140
Jazz_cluster_2            136
Funk_cluster_2            135
Disco_cluster_1           124
Dance_cluster_2           106
Alternative_cluster_1     101
Blues_cluster_1            93
Jazz_cluster_1             85
Electronic_cluster_1       71
Funk_cluster_1             68
Rap_cluster_1              60
Motown_cluster_2           21
Disco_cluster_2            11
Blues_cluster_2            11
Motown_cluster_1            9
Da

In [7]:
# Mean of every clustering feature per cluster.
cluster_avgs = features_lyrics.groupby('cluster')[spotify_features_list].mean()

# Min-max scale each feature column across clusters to the 0-1 range.
feature_min = cluster_avgs.min()
feature_range = cluster_avgs.max() - feature_min
cluster_avgs = cluster_avgs.sub(feature_min, axis='columns').div(feature_range, axis='columns')
print(cluster_avgs)



In [8]:
# Songs per cluster, largest first.
cluster_sizes = features_lyrics['cluster'].value_counts()
print(cluster_sizes)
features_lyrics.head(10)

# Persist the clustered table (row index is not written).
features_lyrics.to_csv(
    '../data/processed/lyrics_features_clusters.csv',
    index=False,
)

In [None]:
# Number of songs for every (cluster, year) pair.
group = (
    features_lyrics
    .groupby(['cluster', 'year'])
    .size()
    .reset_index(name='count')
)

# Years as rows, clusters as columns; missing pairs are filled with 0.
table = group.pivot_table(values='count', index='year', columns='cluster', fill_value=0)

# Write the table as plain text.
with open('../data/processed/clusters.txt', 'w') as clusters_file:
    clusters_file.write(table.to_string())

### Visualization

### Mappings

In [None]:
# Songs per cluster, then a preview of the first ten rows.
cluster_sizes = features_lyrics['cluster'].value_counts()
print(cluster_sizes)
features_lyrics.head(10)

Unnamed: 0,title,artist,year,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,...,joy,positive,surprise,trust,anger,disgust,fear,negative,sadness,cluster,Unnamed: 22
0,Can I Live,JAY-Z,1996,0.628,0.692,-12.365,0.437,0.0823,0.0,0.161,...,9.0,29.0,8.0,17.0,12.0,9.0,12.0,22.0,10.0,Rap_cluster_2,Energetic Thrilling Rap
1,Money On My Mind,Lil Wayne,2005,0.535,0.772,-6.503,0.37,0.0127,0.0,0.11,...,40.0,44.0,34.0,36.0,39.0,13.0,12.0,22.0,10.0,Rap_cluster_2,Energetic Thrilling Rap
2,Mr. Carter,Lil Wayne,2008,0.485,0.71,-6.288,0.364,0.0444,0.0,0.35,...,4.0,16.0,11.0,11.0,11.0,12.0,19.0,32.0,22.0,Rap_cluster_2,Energetic Thrilling Rap
3,C.R.E.A.M.,Wu-Tang Clan,1994,0.479,0.549,-10.551,0.373,0.57,0.0239,0.127,...,25.0,37.0,21.0,20.0,27.0,9.0,14.0,26.0,9.0,Hip Hop_cluster_1,Speechy Smooth Hip Hop
4,Barry Bonds,Kanye West,2007,0.48,0.624,-6.131,0.382,0.0451,0.0,0.337,...,19.0,28.0,4.0,10.0,23.0,13.0,12.0,26.0,7.0,Rap_cluster_2,Energetic Thrilling Rap
5,Fireman,Lil Wayne,2005,0.682,0.723,-4.628,0.211,0.0109,0.0,0.406,...,9.0,11.0,7.0,19.0,7.0,0.0,20.0,10.0,5.0,Rap_cluster_2,Energetic Thrilling Rap
6,Juicy,The Notorious B.I.G.,1994,0.887,0.767,-4.311,0.295,0.38,0.0,0.0527,...,27.0,34.0,22.0,26.0,10.0,7.0,10.0,17.0,11.0,Hip Hop_cluster_1,Speechy Smooth Hip Hop
7,The What,The Notorious B.I.G.,1994,0.622,0.714,-8.244,0.411,0.191,0.0,0.106,...,6.0,18.0,4.0,9.0,21.0,17.0,16.0,27.0,12.0,Hip Hop_cluster_2,Danceable Menacing Hip Hop
8,We Gonna Make It,Jadakiss,2001,0.514,0.961,-3.726,0.347,0.178,2e-06,0.146,...,11.0,20.0,9.0,16.0,16.0,14.0,18.0,29.0,10.0,Rap_cluster_2,Energetic Thrilling Rap
9,Back That Azz Up,Juvenile,1998,0.874,0.714,-6.664,0.129,0.0155,0.0,0.0968,...,15.0,19.0,14.0,17.0,29.0,13.0,15.0,60.0,14.0,Rap_cluster_2,Energetic Thrilling Rap


In [9]:
# Number of songs for every (cluster, year) pair.
group = (
    features_lyrics
    .groupby(['cluster', 'year'])
    .size()
    .reset_index(name='count')
)

# Years as rows, clusters as columns; missing pairs are filled with 0.
table = group.pivot_table(values='count', index='year', columns='cluster', fill_value=0)

# Write the table as plain text.
with open('../data/processed/clusters.txt', 'w') as clusters_file:
    clusters_file.write(table.to_string())

### Visualization

### Mappings

In [10]:
# Headerless "<cluster id>: <display name>" file, split on the colon.
cluster_names_df = pd.read_csv(
    '../data/raw/cluster_names.csv',
    sep=':',
    header=None,
    names=['cluster', 'name'],
)

# Trim whitespace around both fields and remove commas from the display name.
cluster_names_df = cluster_names_df.assign(
    cluster=cluster_names_df['cluster'].str.strip(),
    name=cluster_names_df['name'].str.replace(',', '').str.strip(),
)

# cluster id -> display name
cluster_mapping = dict(zip(cluster_names_df['cluster'], cluster_names_df['name']))

# Clusters without an entry in the mapping get NaN as their display name.
features_lyrics['Cluster Name'] = features_lyrics['cluster'].map(cluster_mapping)

features_lyrics.head(10)

features_lyrics.to_csv(
    '../data/processed/lyrics_features_clusters.csv',
    index=False,
)

In [11]:
import seaborn as sns
import matplotlib.pyplot as plt

# Top 5 Clusters Overall -> I did a heatmap of all five clusters and their attributes

# Top 3 Clusters per Genre -> I did a multi-bar chart with three groups of bars representing the three clusters; within each group, a single bar represents one feature
                                # (you have to normalize the values, otherwise the bars will come out messed up; it also helps to set the lower range to 0.1 so the lower values are still visible)


### Highest Feature and Highest Emotion per Cluster

In [None]:
# Read the CSV file
df = pd.read_csv('../data/raw/cluster_avgs.csv')

# Define feature and emotion columns
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 
            'instrumentalness', 'liveness', 'valence', 'tempo']
emotions = ['anticipation', 'joy', 'surprise', 'trust', 'anger', 'disgust', 
            'fear', 'sadness', 'positive']

# Process each cluster
results = []
for _, row in df.iterrows():
    cluster_name = row['cluster']
    
    # Find highest feature
    feature_values = {feat: row[feat] for feat in features}
    highest_feature = max(feature_values, key=feature_values.get)
    highest_feature_value = feature_values[highest_feature]
    
    # Find highest emotion
    emotion_values = {emo: row[emo] for emo in emotions}
    highest_emotion = max(emotion_values, key=emotion_values.get)
    highest_emotion_value = emotion_values[highest_emotion]
    
    results.append({
        'cluster': cluster_name,
        'highest_feature': highest_feature,
        'feature_value': highest_feature_value,
        'highest_emotion': highest_emotion,
        'emotion_value': highest_emotion_value
    })

# Create results dataframe and display
#results_df = pd.DataFrame(results)
#print(results_df.to_string(index=False))

# Save to CSV
#results_df.to_csv('../data/raw/cluster_highest_values.csv', index=False)

### Top 5 Clusters Overall:

In [None]:
# The five most populated named clusters and the songs that belong to them.
top_5_cluster_names = features_lyrics['Cluster Name'].value_counts().index[:5].tolist()
in_top_5 = features_lyrics['Cluster Name'].isin(top_5_cluster_names)
top_5_df = features_lyrics[in_top_5]

# Mean feature value per cluster, min-max scaled per feature across the five clusters.
cluster_avgs = top_5_df.groupby('Cluster Name')[spotify_features_list].mean()
feature_min = cluster_avgs.min()
feature_range = cluster_avgs.max() - feature_min
cluster_avgs_norm = cluster_avgs.sub(feature_min, axis='columns').div(feature_range, axis='columns')

plt.figure(figsize=(14, 6))
sns.heatmap(
    cluster_avgs_norm,
    annot=True,
    fmt='.2f',
    cmap='YlOrRd',
    linewidths=0.5,
    cbar_kws={'label': 'Normalized Value'},
)
plt.title('Feature Profiles of Top 5 Clusters', fontsize=14, fontweight='bold')
plt.xlabel('Features', fontsize=12)
plt.ylabel('Cluster Name', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

### Top 3 Genres and their respective clusters:

In [None]:
from sklearn.preprocessing import MinMaxScaler

# The three genres with the most songs, most frequent first.
genre_counts = features_lyrics['mapped_genres'].value_counts()
top_3_genres = genre_counts.index[:3].tolist()
print("Top 3 Genres:", top_3_genres)

# Per-cluster averages, indexed by the generic cluster id.
cluster_audio_avgs = pd.read_csv('../data/raw/cluster_avgs.csv', index_col='cluster')

### (a) Rock

In [None]:
# Audio-feature columns shown in the per-genre bar charts.
audio_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']


def plot_top_clusters_for_genre(genre, top_n=2):
    """Plot the audio-feature averages of a genre's most populated clusters.

    Reads the notebook-level `features_lyrics`, `cluster_mapping`,
    `cluster_audio_avgs` and `audio_features`.

    Parameters
    ----------
    genre : str
        Value of `mapped_genres` to plot.
    top_n : int, default 2
        Number of most populated named clusters of the genre to compare.

    Returns
    -------
    pandas.DataFrame
        Long-format frame that was plotted, with columns 'cluster' (display
        name), 'Audio Feature' and 'Value'.
    """
    genre_df = features_lyrics[features_lyrics['mapped_genres'] == genre]
    top_cluster_names = genre_df['Cluster Name'].value_counts().head(top_n).index.tolist()
    print(f"Top {top_n} Clusters for {genre}: {top_cluster_names}")

    # The averages table is keyed by generic cluster id, so translate the
    # display names back to ids.
    reverse_cluster_mapping = {v: k for k, v in cluster_mapping.items()}
    top_cluster_ids = [reverse_cluster_mapping[name] for name in top_cluster_names]
    print(f"Generic cluster names: {top_cluster_ids}")

    selected_avgs = cluster_audio_avgs[cluster_audio_avgs.index.isin(top_cluster_ids)]
    selected_avgs = selected_avgs[audio_features]

    # Long format (one row per cluster/feature pair) for seaborn's grouped bars.
    melted = selected_avgs.reset_index()
    melted['cluster'] = melted['cluster'].map(cluster_mapping)
    melted = melted.melt(
        id_vars='cluster',
        var_name='Audio Feature',
        value_name='Value'
    )

    plt.figure(figsize=(14, 6))
    sns.barplot(data=melted, x='cluster', y='Value',
                hue='Audio Feature', palette='Set3')
    plt.xlabel('Cluster Name', fontsize=12)
    # NOTE(review): the label and y-limit assume cluster_avgs.csv already holds 0-1 normalised values — confirm.
    plt.ylabel('Normalized Value (0-1 scale)', fontsize=12)
    plt.title(f'{genre}: Audio Features Comparison for Top {top_n} Clusters', fontsize=14, fontweight='bold')
    plt.ylim(0, 1.1)
    plt.xticks(rotation=15, ha='right')
    plt.legend(title='Audio Feature', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
    return melted


genre = top_3_genres[0]
cluster_melted = plot_top_clusters_for_genre(genre)

### (b) Pop

In [None]:
# Second most common genre and its two most populated named clusters.
genre = top_3_genres[1]
genre_df = features_lyrics.loc[features_lyrics['mapped_genres'] == genre]
top_2_clusters_genre = genre_df['Cluster Name'].value_counts().index[:2].tolist()
print(f"Top 2 Clusters for {genre}: {top_2_clusters_genre}")

# The averages table is keyed by generic cluster id, so translate display names back to ids.
reverse_cluster_mapping = {name: cluster_id for cluster_id, name in cluster_mapping.items()}
top_2_clusters_generic = [reverse_cluster_mapping[name] for name in top_2_clusters_genre]
print(f"Generic cluster names: {top_2_clusters_generic}")

audio_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

# Keep only the selected clusters and the audio-feature columns.
is_top_cluster = cluster_audio_avgs.index.isin(top_2_clusters_generic)
cluster_audio_avgs_filtered = cluster_audio_avgs.loc[is_top_cluster, audio_features]

# Long format (one row per cluster/feature pair), labelled with display names.
cluster_melted = cluster_audio_avgs_filtered.reset_index()
cluster_melted['cluster'] = cluster_melted['cluster'].map(cluster_mapping)
cluster_melted = cluster_melted.melt(id_vars='cluster', var_name='Audio Feature', value_name='Value')

plt.figure(figsize=(14, 6))
sns.barplot(
    data=cluster_melted,
    x='cluster',
    y='Value',
    hue='Audio Feature',
    palette='Set3',
)
plt.xlabel('Cluster Name', fontsize=12)
plt.ylabel('Normalized Value (0-1 scale)', fontsize=12)
plt.title(f'{genre}: Audio Features Comparison for Top 2 Clusters', fontsize=14, fontweight='bold')
plt.ylim(0, 1.1)
plt.xticks(rotation=15, ha='right')
plt.legend(title='Audio Feature', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

### (c) Rap

In [None]:
# Third most common genre and its two most populated named clusters.
genre = top_3_genres[2]
genre_df = features_lyrics.loc[features_lyrics['mapped_genres'] == genre]
top_2_clusters_genre = genre_df['Cluster Name'].value_counts().index[:2].tolist()
print(f"Top 2 Clusters for {genre}: {top_2_clusters_genre}")

# The averages table is keyed by generic cluster id, so translate display names back to ids.
reverse_cluster_mapping = {name: cluster_id for cluster_id, name in cluster_mapping.items()}
top_2_clusters_generic = [reverse_cluster_mapping[name] for name in top_2_clusters_genre]
print(f"Generic cluster names: {top_2_clusters_generic}")

audio_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

# Keep only the selected clusters and the audio-feature columns.
is_top_cluster = cluster_audio_avgs.index.isin(top_2_clusters_generic)
cluster_audio_avgs_filtered = cluster_audio_avgs.loc[is_top_cluster, audio_features]

# Long format (one row per cluster/feature pair), labelled with display names.
cluster_melted = cluster_audio_avgs_filtered.reset_index()
cluster_melted['cluster'] = cluster_melted['cluster'].map(cluster_mapping)
cluster_melted = cluster_melted.melt(id_vars='cluster', var_name='Audio Feature', value_name='Value')

plt.figure(figsize=(14, 6))
sns.barplot(
    data=cluster_melted,
    x='cluster',
    y='Value',
    hue='Audio Feature',
    palette='Set3',
)
plt.xlabel('Cluster Name', fontsize=12)
plt.ylabel('Normalized Value (0-1 scale)', fontsize=12)
plt.title(f'{genre}: Audio Features Comparison for Top 2 Clusters', fontsize=14, fontweight='bold')
plt.ylim(0, 1.1)
plt.xticks(rotation=15, ha='right')
plt.legend(title='Audio Feature', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()