# Genre Clustering

In [2]:
import pandas as pd

### Merge Spotify Features & Lyrical dataset

In [3]:
features_lyrics = pd.read_csv('../data/compressed/lyrics_spotify_features.csv')

song_emotions = pd.read_csv('../data/compressed/songs_emotions.csv')

features_lyrics = features_lyrics.merge(song_emotions, on= ['title', 'artist'], how='left')
features_lyrics.drop(columns=['lyrics'], inplace=True)

print(features_lyrics.isnull().sum())

features_lyrics.head()



title                 0
genius_genre          0
artist                0
year                  0
danceability          0
energy                0
loudness              0
speechiness           0
acousticness          0
instrumentalness      0
liveness              0
valence               0
tempo                 0
spotify_genre_list    0
party_potential       0
chill_score           0
top_emotion           0
topEmotionWord        0
emotions_scores       0
dtype: int64


Unnamed: 0,title,genius_genre,artist,year,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,spotify_genre_list,party_potential,chill_score,top_emotion,topEmotionWord,emotions_scores
0,Can I Live,rap,JAY-Z,1996,0.628,0.692,-12.365,0.437,0.0823,0.0,0.161,0.575,76.44,"['pop_rap', 'rap', 'east_coast_hip_hop', 'hip_...",0.249881,0.010773,"[('positive', 0.20714285714285716)]",positive,"{'anticipation': 12, 'joy': 9, 'positive': 29,..."
1,Money On My Mind,rap,Lil Wayne,2005,0.535,0.772,-6.503,0.37,0.0127,0.0,0.11,0.661,152.173,"['trap', 'rap', 'pop_rap', 'hip_hop', 'new_orl...",0.273006,0.000982,"[('positive', 0.15384615384615385)]",positive,"{'anger': 39, 'anticipation': 36, 'joy': 40, '..."
2,Mr. Carter,rap,Lil Wayne,2008,0.485,0.71,-6.288,0.364,0.0444,0.0,0.35,0.473,170.942,"['trap', 'east_coast_hip_hop', 'rap', 'pop_rap...",0.162878,0.006786,"[('negative', 0.21621621621621623)]",negative,"{'anger': 11, 'fear': 19, 'negative': 32, 'sad..."
3,C.R.E.A.M.,rap,Wu-Tang Clan,1994,0.479,0.549,-10.551,0.373,0.57,0.0239,0.127,0.576,180.985,"['east_coast_hip_hop', 'gangster_rap', 'hardco...",0.151471,0.108998,"[('positive', 0.16972477064220184)]",positive,"{'anger': 27, 'disgust': 9, 'negative': 26, 'j..."
4,Barry Bonds,rap,Kanye West,2007,0.48,0.624,-6.131,0.382,0.0451,0.0,0.337,0.704,165.057,"['chicago_rap', 'trap', 'rap', 'pop_rap', 'hip...",0.210862,0.005019,"[('positive', 0.18666666666666668)]",positive,"{'anticipation': 8, 'joy': 19, 'positive': 28,..."


### Cleaning spotify genres

In [4]:
new_genres =  ["Alternative", "Blues", "Country", "Dance", "Disco", "EDM", "Electronic", "Folk", "Funk",
                  "Hip Hop", "House", "Indie", "Jazz", "Metal", "Motown", "Pop", "R&B", "Rap", "Rock", "Soul"]

ng_map = {g.lower(): g for g in new_genres}

genius_map = {'rb': 'R&B',
              'rock': 'Rock',
              'pop': 'Pop',
              'rap': 'Rap',}


merge_map = {"EDM": "Electronic",
             "House": "Electronic",
             "Indie": "Alternative"}

special_genres = ["Hip Hop", "R&B", "Motown"]

def get_main_genre(genre, genius_genre):
    if not genre or not isinstance(genre, str):
        
    
        if genius_genre:
            return genius_map.get(genius_genre.lower(), genius_genre.title())
        else:
            return ""
    cleaned = genre.replace("[", "").replace("]", "").replace("'", "").replace('"', '')

    genres = [genre.strip() for genre in cleaned.split(',') if genre.strip() != '']
  

    genres_list = []

    for genre in genres:
        genre_clean = genre.replace("_", " ").strip()
        if genre_clean == "":
            continue
        
        base_genre = None
        for special in special_genres:
            if special.lower() in genre_clean.lower():
                base_genre = special
                break
            
        if base_genre is None:
            last = genre_clean.split()[-1].lower()
            base_genre = ng_map.get(last, None)
            
                
        if base_genre in merge_map:
            base_genre = merge_map[base_genre]


        if base_genre is not None:
            genres_list.append(base_genre)

        
    if genres_list:
        final_genres = max(set(genres_list), key=genres_list.count)
    else:
        if genius_genre:
            final_genres = genius_map.get(genius_genre.lower(), genius_genre.title())
     

    return final_genres




features_lyrics['mapped_genres'] = features_lyrics.apply(lambda x: get_main_genre(x['spotify_genre_list'], x['genius_genre']), axis=1)
features_lyrics = features_lyrics[features_lyrics['mapped_genres'] != 'Misc']

features_lyrics.drop(columns=['spotify_genre_list', 'genius_genre', 'top_emotion', 'topEmotionWord'], inplace=True)

print(features_lyrics['mapped_genres'].value_counts())
features_lyrics.head()


mapped_genres
Rock           11242
Pop             6745
Rap             2228
Hip Hop         1904
Metal           1615
Country          798
Soul             647
R&B              453
Electronic       422
Folk             358
Funk             290
Jazz             254
Alternative      247
Dance            208
Blues             87
Disco             39
Motown            34
Name: count, dtype: int64


Unnamed: 0,title,artist,year,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,party_potential,chill_score,emotions_scores,mapped_genres
0,Can I Live,JAY-Z,1996,0.628,0.692,-12.365,0.437,0.0823,0.0,0.161,0.575,76.44,0.249881,0.010773,"{'anticipation': 12, 'joy': 9, 'positive': 29,...",Hip Hop
1,Money On My Mind,Lil Wayne,2005,0.535,0.772,-6.503,0.37,0.0127,0.0,0.11,0.661,152.173,0.273006,0.000982,"{'anger': 39, 'anticipation': 36, 'joy': 40, '...",Rap
2,Mr. Carter,Lil Wayne,2008,0.485,0.71,-6.288,0.364,0.0444,0.0,0.35,0.473,170.942,0.162878,0.006786,"{'anger': 11, 'fear': 19, 'negative': 32, 'sad...",Rap
3,C.R.E.A.M.,Wu-Tang Clan,1994,0.479,0.549,-10.551,0.373,0.57,0.0239,0.127,0.576,180.985,0.151471,0.108998,"{'anger': 27, 'disgust': 9, 'negative': 26, 'j...",Hip Hop
4,Barry Bonds,Kanye West,2007,0.48,0.624,-6.131,0.382,0.0451,0.0,0.337,0.704,165.057,0.210862,0.005019,"{'anticipation': 8, 'joy': 19, 'positive': 28,...",Rap


### K-means Clustering Implementation

### Preparing Clustering

In [None]:
import ast
features_lyrics['emotions_scores'] = features_lyrics['emotions_scores'].apply(lambda x : ast.literal_eval(x))


ValueError: malformed node or string: {'anticipation': 12, 'joy': 9, 'positive': 29, 'surprise': 8, 'trust': 17, 'anger': 12, 'disgust': 9, 'fear': 12, 'negative': 22, 'sadness': 10}

In [None]:

     
def maxSent(dict):

    if(dict.get('positive')==None)
        

    if dict.get('positive')>dict.get('negative'):
        return 'positive'
    elif dict.get('positive')<dict.get('negative'):
        return 'negative'
    else:
        return 'neutral'
    

features_lyrics['top_sentiment'] = features_lyrics['emotions_scores'].apply(lambda x : maxSent(x))

TypeError: '>' not supported between instances of 'int' and 'NoneType'

### Spotify Feature Clusters For Each Genre, Top Sentiment and Top Emotion

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


genre_groups = features_lyrics.groupby('mapped_genres')
sentiment_groups = features_lyrics.groupby('top_sentiment')
emotion_groups = features_lyrics.groupby('top_emotion')

spotify_features_list = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']


for genre, songs_in_genre, in genre_groups: # for each genre

    for sentiment, sentiment_songs_group in sentiment_groups: # for each sentiment group in each genre

        for emotion, group in emotion_groups: # for each emotion group in each sentiment group in each genre

            spotify_features = group[spotify_features_list]

            scaler = StandardScaler()
            spotify_features_scaled = scaler.fit_transform(spotify_features)

            kmeans = KMeans(n_clusters = 3, random_state = 42)
            label = kmeans.fit_predict(spotify_features_scaled)
            

            cluster_names = [f"{genre}_{sentiment}_{emotion}_cluster_{i+1}" for i in label]

            features_lyrics.loc[group.index, 'cluster'] = cluster_names



### Analyzing & Classifying The Clusters

In [None]:
features_lyrics['clusters'].value_counts()

cluster_avgs = features_lyrics.groupby('clusters')[spotify_features_list].mean()
cluster_avgs = (cluster_avgs - cluster_avgs.min()) / (cluster_avgs.max() - cluster_avgs.min())
print(cluster_avgs)

### Visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
