# Genre Clustering

In [1]:
import pandas as pd

### Merge Spotify Features & Lyrical dataset

In [2]:
# Per-song emotion table, joined onto the Spotify feature/lyrics table below.
song_emotions = pd.read_csv('../data/compressed/songs_emotions.csv')

# Left join on (title, artist) keeps every row of the Spotify feature table;
# the 'lyrics' column is discarded right after the merge.
features_lyrics = (
    pd.read_csv('../data/compressed/lyrics_spotify_features.csv')
    .merge(song_emotions, on=['title', 'artist'], how='left')
    .drop(columns=['lyrics'])
)

# Count of missing values per column after the join.
missing_per_column = features_lyrics.isnull().sum()
print(missing_per_column)

features_lyrics.head()



title                    0
genius_genre             0
artist                   0
year                     0
danceability             0
energy                   0
loudness                 0
speechiness              0
acousticness             0
instrumentalness         0
liveness                 0
valence                  0
tempo                    0
spotify_genre_list    2529
top_emotion              0
topEmotionWord           0
emotions_scores          0
dtype: int64


Unnamed: 0,title,genius_genre,artist,year,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,spotify_genre_list,top_emotion,topEmotionWord,emotions_scores
0,Can I Live,rap,JAY-Z,1996,0.628,0.692,-12.365,0.437,0.0823,0.0,0.161,0.575,76.44,"['pop_rap', 'rap', 'east_coast_hip_hop', 'hip_...","[('positive', 0.20714285714285716)]",positive,"{'anticipation': 12, 'joy': 9, 'positive': 29,..."
1,Money On My Mind,rap,Lil Wayne,2005,0.535,0.772,-6.503,0.37,0.0127,0.0,0.11,0.661,152.173,"['trap', 'rap', 'pop_rap', 'hip_hop', 'new_orl...","[('positive', 0.15384615384615385)]",positive,"{'anger': 39, 'anticipation': 36, 'joy': 40, '..."
2,Mr. Carter,rap,Lil Wayne,2008,0.485,0.71,-6.288,0.364,0.0444,0.0,0.35,0.473,170.942,"['trap', 'east_coast_hip_hop', 'rap', 'pop_rap...","[('negative', 0.21621621621621623)]",negative,"{'anger': 11, 'fear': 19, 'negative': 32, 'sad..."
3,C.R.E.A.M.,rap,Wu-Tang Clan,1994,0.479,0.549,-10.551,0.373,0.57,0.0239,0.127,0.576,180.985,"['east_coast_hip_hop', 'gangster_rap', 'hardco...","[('positive', 0.16972477064220184)]",positive,"{'anger': 27, 'disgust': 9, 'negative': 26, 'j..."
4,Barry Bonds,rap,Kanye West,2007,0.48,0.624,-6.131,0.382,0.0451,0.0,0.337,0.704,165.057,"['chicago_rap', 'trap', 'rap', 'pop_rap', 'hip...","[('positive', 0.18666666666666668)]",positive,"{'anticipation': 8, 'joy': 19, 'positive': 28,..."


### Cleaning spotify genres

In [3]:
# Canonical genre labels that raw Spotify genre tags are collapsed into.
new_genres = ["Alternative", "Blues", "Country", "Dance", "Disco", "EDM", "Electronic", "Folk", "Funk",
              "Hip Hop", "House", "Indie", "Jazz", "Metal", "Motown", "Pop", "R&B", "Rap", "Rock", "Soul"]

# Lower-cased label -> canonical label; used to match the last word of a tag.
ng_map = {g.lower(): g for g in new_genres}

# Genius genre codes mapped to their display label. Codes missing from this
# table fall back to the title-cased code.
genius_map = {'rb': 'R&B',
              'rock': 'Rock',
              'pop': 'Pop',
              'rap': 'Rap',}

# Fine-grained labels that are folded into a broader label.
merge_map = {"EDM": "Electronic",
             "House": "Electronic",
             "Indie": "Alternative"}

# Labels matched as a substring anywhere in a tag, before the last-word lookup.
special_genres = ["Hip Hop", "R&B", "Motown"]


def _genius_fallback(genius_genre):
    """Return the display label for a Genius genre code, or "" if it is unusable.

    Non-string or empty values (None, NaN, "") yield "" instead of raising.
    """
    if not genius_genre or not isinstance(genius_genre, str):
        return ""
    return genius_map.get(genius_genre.lower(), genius_genre.title())


def get_main_genre(genre, genius_genre):
    """Collapse a stringified list of Spotify genre tags into one main genre.

    Parameters
    ----------
    genre : str or any
        Text such as "['pop_rap', 'rap']". Anything that is not a non-empty
        string (None, NaN, "") is treated as missing.
    genius_genre : str or any
        Genius genre code used as a fallback when `genre` is missing or none
        of its tags map to a known label.

    Returns
    -------
    str
        The most frequent mapped label among the tags. Ties go to the label
        that appears first in the tag list, so the result is deterministic.
        When no tag maps, the Genius fallback label is returned, or "" if the
        fallback is unusable as well.
    """
    if not genre or not isinstance(genre, str):
        return _genius_fallback(genius_genre)

    # Strip list/quote punctuation, then split into individual tags.
    cleaned = genre.replace("[", "").replace("]", "").replace("'", "").replace('"', '')
    tags = [tag.strip() for tag in cleaned.split(',') if tag.strip() != '']

    genres_list = []
    for tag in tags:
        tag_clean = tag.replace("_", " ").strip()
        if tag_clean == "":
            continue
        tag_lower = tag_clean.lower()

        # Special labels match anywhere in the tag; the first match wins.
        base_genre = next(
            (special for special in special_genres if special.lower() in tag_lower),
            None,
        )
        if base_genre is None:
            # Otherwise only the last word of the tag decides ("pop rap" -> Rap).
            base_genre = ng_map.get(tag_lower.split()[-1])
        if base_genre is None:
            continue

        genres_list.append(merge_map.get(base_genre, base_genre))

    if not genres_list:
        # Previously this path could leave the result unbound and raise
        # UnboundLocalError when the Genius genre was empty.
        return _genius_fallback(genius_genre)

    # dict.fromkeys keeps first-seen order, so max() resolves ties the same way
    # on every run (iterating a set of strings is hash-order dependent).
    return max(dict.fromkeys(genres_list), key=genres_list.count)




# Map every song to one main genre. Only two columns are needed, so zip over
# them instead of building a full row object per song with apply(axis=1).
features_lyrics['mapped_genres'] = [
    get_main_genre(spotify_genres, genius_genre)
    for spotify_genres, genius_genre in zip(
        features_lyrics['spotify_genre_list'], features_lyrics['genius_genre']
    )
]

# Remove songs mapped to 'Misc' and the columns no longer needed downstream.
# A non-inplace drop returns a new frame instead of mutating the filtered
# slice, which avoids the SettingWithCopyWarning pattern.
features_lyrics = (
    features_lyrics.loc[features_lyrics['mapped_genres'] != 'Misc']
    .drop(columns=['spotify_genre_list', 'genius_genre', 'top_emotion', 'topEmotionWord'])
)

print(features_lyrics['mapped_genres'].value_counts())

# Check that no feature value went missing (displayed as the cell result).
features_lyrics['danceability'].isna().sum()


mapped_genres
Rock           11045
Pop             6892
Rap             2314
Hip Hop         1962
Metal           1604
Country          796
Soul             752
Electronic       420
R&B              419
Alternative      395
Folk             360
Jazz             186
Motown           121
Funk             104
Blues             70
Dance             70
Disco             61
Name: count, dtype: int64


np.int64(0)

### K-means Clustering Implementation

### Preparing Clustering

#### Converting `emotions_scores` into a Dictionary

In [4]:
import ast
def convert_dictionary(x):
    """Parse `x` as a Python literal when it is a string; otherwise return it unchanged.

    Strings are evaluated with `ast.literal_eval`, so malformed text raises
    whatever that function raises.
    """
    return ast.literal_eval(x) if isinstance(x, str) else x
# Turn the stringified emotion dictionaries into real dict objects.
features_lyrics['emotions_scores'] = features_lyrics['emotions_scores'].apply(convert_dictionary)


#### Fetching Top Sentiment & Cleaning

In [5]:
   
def maxSent(dict):
    """Return the dominant polarity of an emotion-score mapping.

    Side effect: a missing (or None) 'positive' / 'negative' entry is written
    back into the mapping as 0, so the caller's dict is modified in place.

    Returns 'positive' or 'negative' for whichever score is larger, and
    'neutral' when the two scores are equal.
    """
    # Fill absent polarity scores in place before comparing.
    for polarity in ('positive', 'negative'):
        if dict.get(polarity) is None:
            dict[polarity] = 0

    pos_score = dict.get('positive')
    neg_score = dict.get('negative')

    if pos_score > neg_score:
        return 'positive'
    if pos_score < neg_score:
        return 'negative'
    return 'neutral'


# Label every song with its dominant polarity (positive / negative / neutral).
features_lyrics['top_sentiment'] = features_lyrics['emotions_scores'].apply(maxSent)

# Show the emotion dictionaries as the cell result.
features_lyrics['emotions_scores']



0        {'anticipation': 12, 'joy': 9, 'positive': 29,...
1        {'anger': 39, 'anticipation': 36, 'joy': 40, '...
2        {'anger': 11, 'fear': 19, 'negative': 32, 'sad...
3        {'anger': 27, 'disgust': 9, 'negative': 26, 'j...
4        {'anticipation': 8, 'joy': 19, 'positive': 28,...
                               ...                        
27615    {'anticipation': 6, 'negative': 7, 'sadness': ...
27616    {'surprise': 7, 'joy': 4, 'positive': 8, 'anti...
27617    {'positive': 24, 'trust': 19, 'anger': 20, 'di...
27618    {'positive': 4, 'trust': 3, 'anticipation': 2,...
27619    {'trust': 3, 'joy': 3, 'positive': 7, 'fear': ...
Name: emotions_scores, Length: 27571, dtype: object

#### Fetching Emotion Scores & Cleaning

In [6]:
# Expand each song's emotion-score dict into one numeric column per emotion.
# The Series is handed over as a plain list of dicts, the documented input form.
emotion_df = pd.json_normalize(features_lyrics['emotions_scores'].tolist())

# An emotion absent from a song's dict counts as a score of 0.
emotion_df = emotion_df.fillna(0.0)

# Re-use the song index so the new columns align row-for-row in the concat.
emotion_df.index = features_lyrics.index

# Drop emotion columns left over from an earlier run of this cell, so that
# re-executing it does not append duplicate columns.
features_lyrics = pd.concat(
    [features_lyrics.drop(columns=emotion_df.columns, errors='ignore'), emotion_df],
    axis=1,
)

features_lyrics[emotion_df.columns]




Unnamed: 0,anticipation,joy,positive,surprise,trust,anger,disgust,fear,negative,sadness
0,12.0,9.0,29,8.0,17.0,12.0,9.0,12.0,22,10.0
1,36.0,40.0,44,34.0,36.0,39.0,13.0,12.0,22,10.0
2,10.0,4.0,16,11.0,11.0,11.0,12.0,19.0,32,22.0
3,30.0,25.0,37,21.0,20.0,27.0,9.0,14.0,26,9.0
4,8.0,19.0,28,4.0,10.0,23.0,13.0,12.0,26,7.0
...,...,...,...,...,...,...,...,...,...,...
27615,6.0,5.0,6,2.0,5.0,2.0,1.0,2.0,7,4.0
27616,15.0,4.0,8,7.0,20.0,4.0,2.0,3.0,6,1.0
27617,16.0,12.0,24,5.0,19.0,20.0,14.0,23.0,35,22.0
27618,2.0,3.0,4,1.0,3.0,0.0,0.0,0.0,1,1.0


### Spotify Feature Clusters For Each Genre, Top Sentiment and Top Emotion

In [7]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler



# Columns fed to K-means: Spotify audio features followed by emotion scores.
spotify_features_list = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo',
    'anticipation', 'joy', 'surprise', 'trust', 'anger', 'disgust', 'fear', 'sadness',
]

# Upper bound on the number of clusters fitted within one group.
clusters = 2

# One independent K-means fit per (genre, sentiment) group.
grouped_songs = features_lyrics.groupby(['mapped_genres', 'top_sentiment'])
for (genre, sentiment), sentiment_songs_group in grouped_songs:
    samples = len(sentiment_songs_group)
    if samples == 0:
        continue

    # A group cannot be split into more clusters than it has songs.
    k = min(clusters, samples)

    # Standardise the features within the group before clustering.
    spotify_features = sentiment_songs_group[spotify_features_list]
    scaler = StandardScaler()
    spotify_features_scaled = scaler.fit_transform(spotify_features)

    kmeans = KMeans(n_clusters=k, random_state=42)
    label = kmeans.fit_predict(spotify_features_scaled)

    # Cluster ids are written 1-based inside the label text.
    cluster_names = [f"{genre}_{sentiment}_cluster_{i+1}" for i in label]
    features_lyrics.loc[sentiment_songs_group.index, 'cluster'] = cluster_names


            



### Analyzing & Classifying The Clusters

In [8]:
# Distinct cluster labels assigned by the clustering cell.
unique_clusters = features_lyrics['cluster'].unique()
print("Number of Clusters", unique_clusters.size)
print(unique_clusters)

Number of Clusters 101
['Hip Hop_positive_cluster_2' 'Rap_positive_cluster_1'
 'Rap_negative_cluster_2' 'Hip Hop_positive_cluster_1'
 'Rap_positive_cluster_2' 'Hip Hop_negative_cluster_1'
 'Rap_negative_cluster_1' 'Hip Hop_negative_cluster_2'
 'Rap_neutral_cluster_1' 'Pop_positive_cluster_2'
 'Rock_negative_cluster_2' 'Rock_negative_cluster_1'
 'Hip Hop_neutral_cluster_1' 'Rap_neutral_cluster_2'
 'Pop_neutral_cluster_2' 'Pop_negative_cluster_2'
 'Hip Hop_neutral_cluster_2' 'Metal_negative_cluster_1'
 'Country_negative_cluster_1' 'Rock_positive_cluster_2'
 'Pop_positive_cluster_1' 'Rock_positive_cluster_1'
 'Alternative_positive_cluster_2' 'Rock_neutral_cluster_2'
 'Alternative_negative_cluster_1' 'Soul_negative_cluster_1'
 'Soul_positive_cluster_1' 'Electronic_positive_cluster_1'
 'Metal_positive_cluster_1' 'Rock_neutral_cluster_1'
 'Pop_neutral_cluster_1' 'Pop_negative_cluster_1'
 'Electronic_negative_cluster_1' 'Electronic_negative_cluster_2'
 'Blues_positive_cluster_1' 'Soul_positiv

In [None]:
# Mean of every clustering feature within each cluster label.
cluster_avgs = features_lyrics.groupby('cluster')[spotify_features_list].mean()

# Min-max rescale each feature column across clusters (0 = lowest cluster mean,
# 1 = highest) so features on different scales can be compared.
feature_min = cluster_avgs.min()
feature_span = cluster_avgs.max() - feature_min
cluster_avgs = (cluster_avgs - feature_min) / feature_span

print(cluster_avgs)

                          title                artist  year  danceability  \
2557           The Boys of Fall         Kenny Chesney  2010         0.503   
2751               Need You Now                Lady A  2009         0.581   
3623               Ring of Fire           Johnny Cash  1963         0.659   
3625                San Quentin           Johnny Cash  1969         0.487   
3848      When I Call Your Name            Vince Gill  1990         0.506   
...                         ...                   ...   ...           ...   
27224         Anything She Says     Mitchell Tenpenny  2019         0.612   
27283                Hell Right         Blake Shelton  2019         0.551   
27355  Things a Man Oughta Know         Lainey Wilson  2019         0.659   
27559               Union Dixie  Tennessee Ernie Ford  1961         0.795   
27592        Slightly Hung Over         Blues Delight  2006         0.666   

       energy  loudness  speechiness  acousticness  instrumentalness  \
255

In [14]:
# Number of songs assigned to each cluster label.
cluster_sizes = features_lyrics['cluster'].value_counts()
print(cluster_sizes)

# Preview the first 20 songs together with their cluster label.
features_lyrics.head(20)

cluster
Rock_positive_cluster_2     4439
Rock_negative_cluster_1     3616
Pop_positive_cluster_1      2858
Pop_negative_cluster_1      1723
Pop_positive_cluster_2      1255
                            ... 
Funk_negative_cluster_1        1
Dance_negative_cluster_1       1
Motown_neutral_cluster_2       1
Disco_neutral_cluster_1        1
Blues_neutral_cluster_2        1
Name: count, Length: 101, dtype: int64


Unnamed: 0,title,artist,year,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,...,joy,positive,surprise,trust,anger,disgust,fear,negative,sadness,cluster
0,Can I Live,JAY-Z,1996,0.628,0.692,-12.365,0.437,0.0823,0.0,0.161,...,9.0,29,8.0,17.0,12.0,9.0,12.0,22,10.0,Hip Hop_positive_cluster_2
1,Money On My Mind,Lil Wayne,2005,0.535,0.772,-6.503,0.37,0.0127,0.0,0.11,...,40.0,44,34.0,36.0,39.0,13.0,12.0,22,10.0,Rap_positive_cluster_1
2,Mr. Carter,Lil Wayne,2008,0.485,0.71,-6.288,0.364,0.0444,0.0,0.35,...,4.0,16,11.0,11.0,11.0,12.0,19.0,32,22.0,Rap_negative_cluster_2
3,C.R.E.A.M.,Wu-Tang Clan,1994,0.479,0.549,-10.551,0.373,0.57,0.0239,0.127,...,25.0,37,21.0,20.0,27.0,9.0,14.0,26,9.0,Hip Hop_positive_cluster_1
4,Barry Bonds,Kanye West,2007,0.48,0.624,-6.131,0.382,0.0451,0.0,0.337,...,19.0,28,4.0,10.0,23.0,13.0,12.0,26,7.0,Rap_positive_cluster_1
5,Fireman,Lil Wayne,2005,0.682,0.723,-4.628,0.211,0.0109,0.0,0.406,...,9.0,11,7.0,19.0,7.0,0.0,20.0,10,5.0,Rap_positive_cluster_2
6,Juicy,The Notorious B.I.G.,1994,0.887,0.767,-4.311,0.295,0.38,0.0,0.0527,...,27.0,34,22.0,26.0,10.0,7.0,10.0,17,11.0,Hip Hop_positive_cluster_1
7,The What,The Notorious B.I.G.,1994,0.622,0.714,-8.244,0.411,0.191,0.0,0.106,...,6.0,18,4.0,9.0,21.0,17.0,16.0,27,12.0,Hip Hop_negative_cluster_1
8,We Gonna Make It,Jadakiss,2001,0.514,0.961,-3.726,0.347,0.178,2e-06,0.146,...,11.0,20,9.0,16.0,16.0,14.0,18.0,29,10.0,Rap_negative_cluster_1
9,Back That Azz Up,Juvenile,1998,0.874,0.714,-6.664,0.129,0.0155,0.0,0.0968,...,15.0,19,14.0,17.0,29.0,13.0,15.0,60,14.0,Rap_negative_cluster_1


### Visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
