# Genre Clustering

In [1]:
import pandas as pd

### Merging the Spotify Features & Lyrical Datasets and Cleaning Spotify Genres

In [2]:
import statistics as stats
# Load the Spotify features table (which also carries a 'lyrics' column) and the
# per-song emotions table.
features_lyrics = pd.read_csv('../data/compressed/lyrics_spotify_features.csv')
song_emotions = pd.read_csv('../data/compressed/songs_emotions.csv')

# Left-join the emotions onto the features by (title, artist) so every feature row
# is kept, then discard the 'lyrics' column.
features_lyrics = (
    features_lyrics
    .merge(song_emotions, on=['title', 'artist'], how='left')
    .drop(columns=['lyrics'])
)



# Canonical genre labels that raw Spotify genre strings are collapsed into.
new_genres = ["Alternative", "Blues", "Country", "Dance", "Disco", "EDM", "Electronic", "Folk", "Funk",
              "Hip Hop", "House", "Indie", "Jazz", "Metal", "Motown", "Pop", "R&B", "Rap", "Rock", "Soul"]

# Lower-cased canonical label -> canonical label; looked up with the last word of a
# Spotify genre string.
ng_map = {g.lower(): g for g in new_genres}

# Genius tags with an explicit canonical spelling; any other tag is title-cased.
genius_map = {'rb': 'R&B',
              'rock': 'Rock',
              'pop': 'Pop',
              'rap': 'Rap',}

# Canonical labels that are folded into a broader label.
merge_map = {"EDM": "Electronic",
             "House": "Electronic",
             "Indie": "Alternative"}

# Labels matched as a case-insensitive substring anywhere in a Spotify genre string,
# tried before the last-word lookup.
special_genres = ["Hip Hop", "R&B", "Motown"]


def _genius_fallback(genius_genre):
    """Return the label derived from a Genius tag, or "" when the tag is unusable.

    A tag is unusable when it is not a string (e.g. None or a NaN float) or is empty.
    """
    if not isinstance(genius_genre, str) or not genius_genre:
        return ""
    return genius_map.get(genius_genre.lower(), genius_genre.title())


def get_main_genre(genre, genius_genre):
    """Collapse a song's Spotify genre list into a single canonical genre label.

    Args:
        genre: the Spotify genre list as text, e.g. "['dance pop', 'pop']". Brackets
            and quote characters are stripped and the remainder is split on commas.
            Anything that is not a non-empty string is treated as missing.
        genius_genre: the Genius tag used as a fallback (e.g. 'rb', 'rock').

    Returns:
        The most frequent canonical label among the Spotify genres that could be
        mapped (ties are resolved by ``statistics.mode``; on Python 3.8+ that is the
        first label encountered). If no Spotify genre maps to a label, the label
        derived from ``genius_genre`` is returned, or "" when that is missing too.
    """
    if not genre or not isinstance(genre, str):
        return _genius_fallback(genius_genre)

    cleaned = genre.replace("[", "").replace("]", "").replace("'", "").replace('"', '')

    candidates = []
    for raw in cleaned.split(','):
        genre_clean = raw.replace("_", " ").strip()
        if not genre_clean:
            continue
        lowered = genre_clean.lower()

        # Substring labels take priority over the last-word lookup.
        base_genre = next((s for s in special_genres if s.lower() in lowered), None)
        if base_genre is None:
            base_genre = ng_map.get(lowered.split()[-1])

        if base_genre is not None:
            candidates.append(merge_map.get(base_genre, base_genre))

    if candidates:
        return stats.mode(candidates)  # most common label for this song

    # No Spotify genre could be mapped: fall back to the Genius tag, and always
    # return a string rather than leaving the result undefined.
    return _genius_fallback(genius_genre)




# Collapse each song's Spotify genre list (falling back to its Genius tag) into one label.
features_lyrics['mapped_genres'] = features_lyrics.apply(
    lambda x: get_main_genre(x['spotify_genre_list'], x['genius_genre']), axis=1
)

# Remove songs labelled 'Misc' and the columns that are no longer needed. The result is
# built as a new frame instead of calling drop(..., inplace=True) on the filtered
# slice, which avoids in-place mutation of a slice (a common source of
# SettingWithCopyWarning).
features_lyrics = (
    features_lyrics[features_lyrics['mapped_genres'] != 'Misc']
    .drop(columns=['spotify_genre_list', 'genius_genre', 'top_emotion', 'topEmotionWord'])
)

print(features_lyrics['mapped_genres'].value_counts())
features_lyrics.columns


mapped_genres
Rock           10365
Pop             7146
Rap             2472
Hip Hop         1687
Metal           1675
Country          775
Soul             756
R&B              641
Folk             574
Electronic       428
Alternative      245
Jazz             221
Funk             203
Disco            135
Dance            114
Blues            104
Motown            30
Name: count, dtype: int64


Index(['title', 'artist', 'year', 'danceability', 'energy', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'emotions_scores', 'mapped_genres'],
      dtype='object')

## K-means Clustering Implementation

### Preparing Clustering

#### Converting `emotions_scores` into Dictionaries

In [3]:
import ast
def convert_dictionary(x):
    """Parse a string holding a Python literal; pass any other value through.

    Strings are evaluated with ``ast.literal_eval``, which raises if the text is not
    a valid literal. Non-string inputs are returned unchanged.
    """
    return ast.literal_eval(x) if isinstance(x, str) else x
# Parse every string entry of 'emotions_scores' into a Python object; entries that
# are not strings are left as they are.
features_lyrics['emotions_scores'] = features_lyrics['emotions_scores'].apply(convert_dictionary)


#### Fetching Top Sentiment & Cleaning

In [4]:
   
def maxSent(dict):
    """Classify overall sentiment from an emotion-score mapping.

    Args:
        dict: mapping of emotion name -> score. Only the 'positive' and 'negative'
            entries are read; an entry that is missing or None counts as 0.
            (The parameter name shadows the builtin ``dict``; it is kept so the
            function's signature is unchanged.)

    Returns:
        'positive' if the positive score is larger, 'negative' if the negative score
        is larger, and 'neutral' when the two are equal.

    The mapping passed in is only read, never modified.
    """
    positive = dict.get('positive')
    negative = dict.get('negative')
    if positive is None:
        positive = 0
    if negative is None:
        negative = 0

    if positive > negative:
        return 'positive'
    if positive < negative:
        return 'negative'
    return 'neutral'


# Label every song with the sentiment returned by maxSent for its emotion scores.
features_lyrics['top_sentiment'] = features_lyrics['emotions_scores'].apply(maxSent)

# Show the emotion-score column for inspection.
features_lyrics['emotions_scores']



0        {'anticipation': 12, 'joy': 9, 'positive': 29,...
1        {'anger': 39, 'anticipation': 36, 'joy': 40, '...
2        {'anger': 11, 'fear': 19, 'negative': 32, 'sad...
3        {'anger': 27, 'disgust': 9, 'negative': 26, 'j...
4        {'anticipation': 8, 'joy': 19, 'positive': 28,...
                               ...                        
27615    {'anticipation': 6, 'negative': 7, 'sadness': ...
27616    {'surprise': 7, 'joy': 4, 'positive': 8, 'anti...
27617    {'positive': 24, 'trust': 19, 'anger': 20, 'di...
27618    {'positive': 4, 'trust': 3, 'anticipation': 2,...
27619    {'trust': 3, 'joy': 3, 'positive': 7, 'fear': ...
Name: emotions_scores, Length: 27571, dtype: object

#### Fetching Emotion Scores & Cleaning

In [5]:
# Expand each song's emotion-score dict into one column per emotion; an emotion
# absent from a song's dict is filled with 0.
emotion_df = pd.json_normalize(features_lyrics['emotions_scores'].tolist()).fillna(0.0)

# The normalised frame is built from a plain list, so its rows are numbered 0..n-1;
# restore the songs' own index labels before joining the columns back on.
emotion_df.index = features_lyrics.index

# Drop emotion columns left over from an earlier run of this cell so that re-running
# it replaces them instead of appending duplicate columns.
features_lyrics = pd.concat(
    [features_lyrics.drop(columns=emotion_df.columns, errors='ignore'), emotion_df],
    axis=1,
)
features_lyrics[emotion_df.columns]




Unnamed: 0,anticipation,joy,positive,surprise,trust,anger,disgust,fear,negative,sadness
0,12.0,9.0,29,8.0,17.0,12.0,9.0,12.0,22,10.0
1,36.0,40.0,44,34.0,36.0,39.0,13.0,12.0,22,10.0
2,10.0,4.0,16,11.0,11.0,11.0,12.0,19.0,32,22.0
3,30.0,25.0,37,21.0,20.0,27.0,9.0,14.0,26,9.0
4,8.0,19.0,28,4.0,10.0,23.0,13.0,12.0,26,7.0
...,...,...,...,...,...,...,...,...,...,...
27615,6.0,5.0,6,2.0,5.0,2.0,1.0,2.0,7,4.0
27616,15.0,4.0,8,7.0,20.0,4.0,2.0,3.0,6,1.0
27617,16.0,12.0,24,5.0,19.0,20.0,14.0,23.0,35,22.0
27618,2.0,3.0,4,1.0,3.0,0.0,0.0,0.0,1,1.0


### Spotify Feature & Emotion Score Clusters for Each Genre and Top Sentiment

In [6]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler



# Columns fed to K-means: the Spotify audio features followed by the per-emotion scores.
spotify_features_list = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'anticipation', 'joy', 'surprise', 'trust', 'anger', 'disgust', 'fear', 'sadness']

# Upper bound on the number of clusters fitted per (genre, sentiment) group.
clusters = 2

# Labels are collected here and written back once after the loop, so the frame being
# grouped is never modified while its groups are still being iterated.
cluster_labels = pd.Series(index=features_lyrics.index, dtype=object)

for genre, songs_in_genre in features_lyrics.groupby('mapped_genres'):  # for each genre

    for sentiment, sentiment_songs_group in songs_in_genre.groupby('top_sentiment'):  # for each sentiment group in each genre

        samples = len(sentiment_songs_group)
        if samples == 0:
            continue

        # A group with fewer songs than `clusters` cannot be split into that many clusters.
        k = min(clusters, samples)

        spotify_features = sentiment_songs_group[spotify_features_list]

        # Standardise within the group so every feature is on a comparable scale.
        scaler = StandardScaler()
        spotify_features_scaled = scaler.fit_transform(spotify_features)

        # n_init is set explicitly because its default differs between scikit-learn
        # releases; pinning it keeps the clustering the same across versions.
        kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
        label = kmeans.fit_predict(spotify_features_scaled)

        # K-means labels are 0-based; the cluster names are 1-based.
        cluster_names = [f"{genre}_{sentiment}_cluster_{i+1}" for i in label]

        cluster_labels.loc[sentiment_songs_group.index] = cluster_names

features_lyrics['cluster'] = cluster_labels


            



### Analyzing & Classifying The Clusters

In [7]:
# Distinct values of the 'cluster' column: how many there are, then the values themselves.
cluster_ids = features_lyrics['cluster'].unique()
print("Number of Clusters", cluster_ids.size)
print(cluster_ids)


Number of Clusters 100
['Rap_positive_cluster_2' 'Rap_negative_cluster_1'
 'Hip Hop_positive_cluster_2' 'Rap_positive_cluster_1'
 'Hip Hop_negative_cluster_1' 'Rap_negative_cluster_2'
 'Hip Hop_negative_cluster_2' 'Rap_neutral_cluster_1'
 'Pop_positive_cluster_1' 'Hip Hop_positive_cluster_1'
 'Electronic_negative_cluster_2' 'Rock_negative_cluster_1'
 'Rock_negative_cluster_2' 'Soul_negative_cluster_1'
 'Rap_neutral_cluster_2' 'Pop_neutral_cluster_1' 'Pop_negative_cluster_2'
 'Hip Hop_neutral_cluster_2' 'Hip Hop_neutral_cluster_1'
 'Metal_negative_cluster_1' 'Country_negative_cluster_1'
 'Rock_positive_cluster_2' 'Pop_positive_cluster_2'
 'Rock_positive_cluster_1' 'Rock_neutral_cluster_1'
 'Folk_positive_cluster_1' 'Funk_negative_cluster_1'
 'Soul_positive_cluster_1' 'Electronic_positive_cluster_2'
 'Metal_positive_cluster_2' 'Disco_positive_cluster_1'
 'Rock_neutral_cluster_2' 'Soul_neutral_cluster_2' 'Pop_neutral_cluster_2'
 'Pop_negative_cluster_1' 'Blues_positive_cluster_1'
 'Soul_p

In [None]:
# Mean of every clustering feature within each cluster.
cluster_avgs = features_lyrics.groupby('cluster')[spotify_features_list].mean()

# Min-max scale each feature column across clusters: the cluster with the lowest mean
# maps to 0 and the one with the highest mean maps to 1.
lowest, highest = cluster_avgs.min(), cluster_avgs.max()
cluster_avgs = (cluster_avgs - lowest) / (highest - lowest)
print(cluster_avgs)


                                danceability    energy  loudness  speechiness  \
cluster                                                                         
Alternative_negative_cluster_1      0.545746  0.477664  0.563485     0.108269   
Alternative_negative_cluster_2      0.524580  0.826497  0.896797     0.246434   
Alternative_neutral_cluster_1       0.504813  0.453410  0.621382     0.112179   
Alternative_neutral_cluster_2       0.110392  0.572776  0.822958     0.903667   
Alternative_positive_cluster_1      0.445161  0.652420  0.780220     0.196247   
...                                      ...       ...       ...          ...   
Soul_negative_cluster_2             0.604454  0.365637  0.473283     0.108178   
Soul_neutral_cluster_1              0.941780  0.848803  1.000000     0.318369   
Soul_neutral_cluster_2              0.613355  0.371104  0.513577     0.076748   
Soul_positive_cluster_1             0.653035  0.408807  0.518081     0.139801   
Soul_positive_cluster_2     

In [None]:
# Number of songs assigned to each cluster, followed by a preview of the first ten rows.
cluster_sizes = features_lyrics['cluster'].value_counts()
print(cluster_sizes)
features_lyrics.head(10)

cluster
Rock_positive_cluster_2          4108
Rock_negative_cluster_2          3352
Pop_positive_cluster_2           2326
Pop_positive_cluster_1           1895
Pop_negative_cluster_1           1825
                                 ... 
Disco_neutral_cluster_2             1
Motown_negative_cluster_2           1
Funk_neutral_cluster_2              1
Blues_neutral_cluster_2             1
Alternative_neutral_cluster_2       1
Name: count, Length: 100, dtype: int64


### Visualization

In [10]:
import seaborn as sns
import matplotlib.pyplot as plt
