# Genre Clustering

In [1]:
import pandas as pd

### Merging Spotify Features with the Lyrics Dataset & Cleaning Spotify Genres

In [2]:
import statistics as stats
# Per-song emotion table; used only as the right-hand side of the merge below.
song_emotions = pd.read_csv('../data/compressed/songs_emotions.csv')

# Left-join the emotion table onto the lyrics/Spotify-feature table by
# (title, artist) so every row of the feature table is kept, then drop the
# 'lyrics' column.
features_lyrics = (
    pd.read_csv('../data/compressed/lyrics_spotify_features.csv')
    .merge(song_emotions, on=['title', 'artist'], how='left')
    .drop(columns=['lyrics'])
)



# Canonical genre labels that a raw Spotify genre tag can resolve to.
new_genres = ["Alternative", "Blues", "Country", "Dance", "Disco", "EDM", "Electronic", "Folk", "Funk",
              "Hip Hop", "House", "Indie", "Jazz", "Metal", "Motown", "Pop", "R&B", "Rap", "Rock", "Soul"]

# Lower-cased label -> canonical label, for case-insensitive lookup.
ng_map = {g.lower(): g for g in new_genres}

# Genius genre tag -> canonical label. Tags missing here are title-cased instead.
genius_map = {'rb': 'R&B',
              'rock': 'Rock',
              'pop': 'Pop',
              'rap': 'Rap'}

# Canonical labels that are folded into a broader canonical label.
merge_map = {"EDM": "Electronic",
             "House": "Electronic",
             "Indie": "Alternative"}

# Labels matched as a substring anywhere in a tag, because looking only at the
# tag's last word would miss them (e.g. "atl hip hop" ends in "hop").
special_genres = ["Hip Hop", "R&B", "Motown"]


def _genius_fallback(genius_genre):
    """Map a Genius genre tag to a canonical label, or "" when none is usable.

    Missing values read by pandas arrive as float NaN, which is truthy and has
    no ``.lower()``; anything that is not a non-empty string therefore yields "".
    """
    if not isinstance(genius_genre, str) or not genius_genre:
        return ""
    return genius_map.get(genius_genre.lower(), genius_genre.title())


def get_main_genre(genre, genius_genre):
    """Return a single canonical genre label for one song.

    Parameters
    ----------
    genre
        Stringified list of Spotify genre tags, e.g. "['dance pop', 'pop']".
        Any value that is not a non-empty string is treated as "no tags".
    genius_genre
        Genius genre tag, used only when no Spotify tag can be resolved.

    Returns
    -------
    str
        The most frequent canonical label among the song's Spotify tags
        (``statistics.mode``; on Python 3.8+ ties go to the label seen first).
        If no tag resolves, the mapped/title-cased Genius tag, or "" when that
        is missing too.
    """
    if not genre or not isinstance(genre, str):
        return _genius_fallback(genius_genre)

    # Strip list punctuation and quotes so the text can be split on commas.
    cleaned = genre.replace("[", "").replace("]", "").replace("'", "").replace('"', '')
    tags = [tag.strip() for tag in cleaned.split(',') if tag.strip()]

    genres_list = []
    for tag in tags:
        tag_clean = tag.replace("_", " ").strip()
        if not tag_clean:
            continue
        tag_lower = tag_clean.lower()

        # Multi-word / symbol labels are detected anywhere inside the tag.
        base_genre = next(
            (special for special in special_genres if special.lower() in tag_lower),
            None,
        )
        # Otherwise the last word of the tag decides ("dance pop" -> "Pop").
        if base_genre is None:
            base_genre = ng_map.get(tag_lower.split()[-1])

        # Fold sub-genres into their parent label; unknown tags stay None.
        base_genre = merge_map.get(base_genre, base_genre)
        if base_genre is not None:
            genres_list.append(base_genre)

    if genres_list:
        return stats.mode(genres_list)  # most common label for this song
    # No Spotify tag resolved: fall back to Genius. The previous code left the
    # result unassigned here when the Genius tag was empty.
    return _genius_fallback(genius_genre)




# Resolve one canonical genre per song from its Spotify tag list and Genius tag.
# Walking the two columns in lockstep avoids building a row Series per song,
# which is what DataFrame.apply(axis=1) does.
features_lyrics['mapped_genres'] = [
    get_main_genre(spotify_genres, genius_genre)
    for spotify_genres, genius_genre in zip(
        features_lyrics['spotify_genre_list'], features_lyrics['genius_genre']
    )
]

# Exclude songs whose resolved genre is 'Misc', then drop the columns that are
# no longer needed. drop() is chained (not inplace) so the result is a fresh
# frame rather than an in-place mutation of a boolean-filtered slice.
features_lyrics = (
    features_lyrics[features_lyrics['mapped_genres'] != 'Misc']
    .drop(columns=['spotify_genre_list', 'genius_genre', 'top_emotion', 'topEmotionWord'])
)

print(features_lyrics['mapped_genres'].value_counts())
features_lyrics.columns


mapped_genres
Rock           10365
Pop             7146
Rap             2472
Hip Hop         1687
Metal           1675
Country          775
Soul             756
R&B              641
Folk             574
Electronic       428
Alternative      245
Jazz             221
Funk             203
Disco            135
Dance            114
Blues            104
Motown            30
Name: count, dtype: int64


Index(['title', 'artist', 'year', 'danceability', 'energy', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'emotions_scores', 'mapped_genres'],
      dtype='object')

## K-means Clustering Implementation

### Preparing Clustering

#### Converting `emotions_scores` into Dictionaries

In [3]:
import ast
import numpy as np
def convert_dictionary(x):
    """Parse a string holding a Python literal; return any other value as-is.

    Strings are evaluated with ``ast.literal_eval``, so only literals (dicts,
    lists, numbers, ...) are accepted and a malformed string raises. Values
    that are not strings (e.g. an already-parsed dict or NaN) are returned
    unchanged.
    """
    return ast.literal_eval(x) if isinstance(x, str) else x
def _positive_score(scores):
    """Return the 'positive' entry of an emotion dict; 0 if absent or not a dict."""
    if not isinstance(scores, dict):
        return 0
    return scores.get('positive', 0)


# Turn the stringified emotion dicts into real dicts, then pull out the
# 'positive' score as its own column.
features_lyrics['emotions_scores'] = features_lyrics['emotions_scores'].apply(convert_dictionary)
features_lyrics['positivity'] = features_lyrics['emotions_scores'].apply(_positive_score)

# Sanity check: show any rows where positivity ended up missing.
missing_positivity = features_lyrics['positivity'].isna()
features_lyrics[missing_positivity]


Unnamed: 0,title,artist,year,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,emotions_scores,mapped_genres,positivity


#### Fetching Top Sentiment & Cleaning

#### Fetching Emotion Scores & Cleaning

In [4]:
# Expand each song's emotion dict into one numeric column per emotion;
# an emotion missing from a song's dict becomes 0.0.
emotion_df = pd.json_normalize(features_lyrics['emotions_scores']).fillna(0.0)

# concat(axis=1) aligns on index labels, so give the emotion rows the songs' index.
emotion_df.index = features_lyrics.index

# Drop emotion columns left behind by an earlier run of this cell before
# attaching the fresh ones; otherwise re-running the cell would leave
# duplicate column names in the frame.
features_lyrics = pd.concat(
    [features_lyrics.drop(columns=emotion_df.columns, errors='ignore'), emotion_df],
    axis=1,
)
features_lyrics[emotion_df.columns]




Unnamed: 0,anticipation,joy,positive,surprise,trust,anger,disgust,fear,negative,sadness
0,12.0,9.0,29.0,8.0,17.0,12.0,9.0,12.0,22.0,10.0
1,36.0,40.0,44.0,34.0,36.0,39.0,13.0,12.0,22.0,10.0
2,10.0,4.0,16.0,11.0,11.0,11.0,12.0,19.0,32.0,22.0
3,30.0,25.0,37.0,21.0,20.0,27.0,9.0,14.0,26.0,9.0
4,8.0,19.0,28.0,4.0,10.0,23.0,13.0,12.0,26.0,7.0
...,...,...,...,...,...,...,...,...,...,...
27615,6.0,5.0,6.0,2.0,5.0,2.0,1.0,2.0,7.0,4.0
27616,15.0,4.0,8.0,7.0,20.0,4.0,2.0,3.0,6.0,1.0
27617,16.0,12.0,24.0,5.0,19.0,20.0,14.0,23.0,35.0,22.0
27618,2.0,3.0,4.0,1.0,3.0,0.0,0.0,0.0,1.0,1.0


### Spotify Feature & Emotion Score Clusters For Each Genre

In [5]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score



# Columns fed to k-means: Spotify audio features followed by lyric emotion scores.
spotify_features_list = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo',
    'anticipation', 'joy', 'surprise', 'trust', 'anger', 'disgust', 'fear',
    'sadness', 'positive',
]

# Requested number of clusters per genre (capped by the genre's song count).
clusters = 2

for genre, songs_in_genre in features_lyrics.groupby('mapped_genres'):
    n_songs = len(songs_in_genre)
    if n_songs == 0:
        continue

    # k-means cannot produce more clusters than there are samples.
    n_clusters = min(clusters, n_songs)

    # Standardise within the genre so every feature contributes on the same scale.
    scaled = StandardScaler().fit_transform(songs_in_genre[spotify_features_list])
    labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(scaled)

    # Labels are 0-based; the stored names are 1-based and prefixed with the genre.
    features_lyrics.loc[songs_in_genre.index, 'cluster'] = [
        f"{genre}_cluster_{label + 1}" for label in labels
    ]


            



### Analyzing & Classifying The Clusters

In [6]:
# Distinct cluster labels, in order of first appearance in the frame.
cluster_labels = features_lyrics['cluster'].unique()
print("Number of Clusters", cluster_labels.size)
print(cluster_labels)
features_lyrics['cluster'].value_counts()


Number of Clusters 34
['Rap_cluster_2' 'Hip Hop_cluster_1' 'Hip Hop_cluster_2' 'Pop_cluster_1'
 'Electronic_cluster_1' 'Rock_cluster_1' 'Rock_cluster_2' 'Soul_cluster_2'
 'Metal_cluster_2' 'Country_cluster_2' 'Pop_cluster_2' 'Folk_cluster_2'
 'Funk_cluster_2' 'Soul_cluster_1' 'Metal_cluster_1' 'Disco_cluster_1'
 'Electronic_cluster_2' 'Rap_cluster_1' 'Blues_cluster_1' 'R&B_cluster_2'
 'R&B_cluster_1' 'Folk_cluster_1' 'Dance_cluster_2'
 'Alternative_cluster_2' 'Disco_cluster_2' 'Funk_cluster_1'
 'Jazz_cluster_1' 'Country_cluster_1' 'Motown_cluster_2'
 'Alternative_cluster_1' 'Jazz_cluster_2' 'Motown_cluster_1'
 'Blues_cluster_2' 'Dance_cluster_1']


cluster
Rock_cluster_2           8025
Pop_cluster_2            5033
Rap_cluster_2            2412
Rock_cluster_1           2340
Pop_cluster_1            2113
Metal_cluster_1          1287
Hip Hop_cluster_2        1098
Soul_cluster_1            598
Hip Hop_cluster_1         589
Country_cluster_1         580
R&B_cluster_2             501
Metal_cluster_2           388
Electronic_cluster_2      357
Folk_cluster_2            346
Folk_cluster_1            228
Country_cluster_2         195
Soul_cluster_2            158
Alternative_cluster_2     144
R&B_cluster_1             140
Jazz_cluster_2            136
Funk_cluster_2            135
Disco_cluster_1           124
Dance_cluster_2           106
Alternative_cluster_1     101
Blues_cluster_1            93
Jazz_cluster_1             85
Electronic_cluster_1       71
Funk_cluster_1             68
Rap_cluster_1              60
Motown_cluster_2           21
Disco_cluster_2            11
Blues_cluster_2            11
Motown_cluster_1            9
Da

In [7]:
# Mean of every clustering feature within each cluster.
cluster_avgs = features_lyrics.groupby('cluster')[spotify_features_list].mean()

# Min-max rescale each feature column across clusters so features measured on
# different scales can be compared side by side.
feature_min = cluster_avgs.min()
feature_range = cluster_avgs.max() - feature_min
cluster_avgs = (cluster_avgs - feature_min) / feature_range
print(cluster_avgs)



                       danceability    energy  loudness  speechiness  \
cluster                                                                
Alternative_cluster_1      0.322430  0.789353  0.921418     0.151191   
Alternative_cluster_2      0.490716  0.400137  0.532090     0.034308   
Blues_cluster_1            0.462519  0.360597  0.467027     0.079236   
Blues_cluster_2            0.846898  0.613475  0.752406     0.080288   
Country_cluster_1          0.505869  0.496462  0.661287     0.020998   
Country_cluster_2          0.563382  0.687330  0.848218     0.070585   
Dance_cluster_1            0.589776  0.973420  0.993931     0.878007   
Dance_cluster_2            0.728557  0.846945  0.934895     0.074911   
Disco_cluster_1            0.675651  0.455953  0.385876     0.038812   
Disco_cluster_2            0.925538  0.500966  0.343641     0.080288   
Electronic_cluster_1       0.968891  0.771503  0.820577     0.295793   
Electronic_cluster_2       0.722981  0.725407  0.880524     0.15

In [8]:
# Per-cluster song counts, largest first.
cluster_counts = features_lyrics['cluster'].value_counts()
print(cluster_counts)
# Not the cell's last expression, so this preview is evaluated but not rendered.
features_lyrics.head(10)

# Persist the frame, including the cluster assignments, without the index column.
features_lyrics.to_csv('../data/processed/lyrics_features_clusters.csv', index=False)

cluster
Rock_cluster_2           8025
Pop_cluster_2            5033
Rap_cluster_2            2412
Rock_cluster_1           2340
Pop_cluster_1            2113
Metal_cluster_1          1287
Hip Hop_cluster_2        1098
Soul_cluster_1            598
Hip Hop_cluster_1         589
Country_cluster_1         580
R&B_cluster_2             501
Metal_cluster_2           388
Electronic_cluster_2      357
Folk_cluster_2            346
Folk_cluster_1            228
Country_cluster_2         195
Soul_cluster_2            158
Alternative_cluster_2     144
R&B_cluster_1             140
Jazz_cluster_2            136
Funk_cluster_2            135
Disco_cluster_1           124
Dance_cluster_2           106
Alternative_cluster_1     101
Blues_cluster_1            93
Jazz_cluster_1             85
Electronic_cluster_1       71
Funk_cluster_1             68
Rap_cluster_1              60
Motown_cluster_2           21
Disco_cluster_2            11
Blues_cluster_2            11
Motown_cluster_1            9
Da

In [9]:
# Number of songs for every (cluster, year) pair, in long format.
group = (
    features_lyrics
    .groupby(['cluster', 'year'])
    .size()
    .reset_index(name='count')
)

# Wide format: one row per year, one column per cluster; absent pairs become 0.
table = group.pivot_table(values='count', index='year', columns='cluster', fill_value=0)

# Write the full table as plain text.
with open('../data/processed/clusters.txt', 'w') as out_file:
    out_file.write(table.to_string())

### Visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Top 5 Clusters Overall -> I did a heatmap of all five genres and their attributes

# Top 3 Clusters per Genre -> I did a multi-bar chart with three groups of bars, one group per cluster, where each bar within a group represents a feature
                                # (you have to normalize the values, otherwise the bars will come out messed up; it also helps to set the lower range to 0.1 so the lower values are still visible)
