# Data Analysis

In [1]:
import pandas as pd
import numpy as np
import csv
import ast

In [2]:
# Load the CSV file
# Forward slashes are accepted by Python on Windows as well as on POSIX
# systems, so the paths no longer depend on the Windows-only "\\" separator.
all_songs = pd.read_csv("output_analysis/all_songs.csv")

# Load umbrella genre map
# Each CSV row is read as: column 0 = umbrella genre, column 1 = its subgenre
# labels joined by "; ". The rows are inverted into {subgenre: umbrella genre};
# a subgenre listed more than once keeps the umbrella of its last row.
# NOTE(review): the name `map` shadows the builtin; it is kept because the
# cells that add the umbrella genre look the dictionary up under this name.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used - confirm it matches the file.
map = {}

with open('output_analysis/genre_map_2.csv', newline='') as csvfile:
    for row in csv.reader(csvfile, delimiter=','):
        map.update(dict.fromkeys(row[1].split("; "), row[0]))
len(map)

256

In [3]:
# Parse unconventional genres

# `features` is parsed from its Python-literal text form. Only strings are
# parsed, so re-running this cell does not fail on already-converted values.
all_songs['features'] = all_songs['features'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

# Undo the HTML-escaped ampersand in these genre labels (exact matches only).
for escaped_genre in (
    'alternative r&amp;b',
    'chinese r&amp;b',
    'j-r&amp;b',
    'indie r&amp;b',
    'r&amp;b',
):
    all_songs.loc[all_songs['genre'] == escaped_genre, 'genre'] = escaped_genre.replace('&amp;', '&')

# Drop this genre entirely. `.copy()` makes the result an independent frame, so
# adding columns to it afterwards does not raise SettingWithCopyWarning.
all_songs = all_songs[all_songs['genre'] != 'rock en español'].copy()

In [4]:
# Add Umbrella Genre
# Fail early and name every genre without an umbrella entry, instead of
# stopping at the first failed dictionary lookup.
unmapped_genres = [genre for genre in all_songs['genre'].unique() if genre not in map]
if unmapped_genres:
    raise KeyError(f"genres missing from the umbrella genre map: {unmapped_genres}")

all_songs['umbrella_genre'] = all_songs['genre'].apply(lambda subgenre: map[subgenre])

# Reformat Dataframe
# The original `genre` column holds the fine-grained label, so it is renamed to
# `subgenre`. A non-in-place rename keeps the step a plain reassignment.
all_songs = all_songs.rename(columns={'genre': 'subgenre'})
# NOTE(review): despite its name, `features` holds the umbrella genre of every
# row, and no cell visible in this notebook reads it - candidate for removal.
features = all_songs['subgenre'].apply(lambda subgenre: map[subgenre])

In [5]:
import re
# Collect the song titles listed in output.csv.
# Every line must contain a double-quoted label; the text after the first
# " - " is taken as the title, and its last 5 characters are dropped (the
# closing quote plus 4 more characters - presumably a file extension such as
# ".mp3"; confirm against the file).
# NOTE(review): a title that itself contains " - " is cut at that separator, so
# such songs never match `all_songs['name']`. The cluster reader parses labels
# the same way, so change both together if this is to be lifted.
output_songs = []

# Forward slashes keep the path valid on both Windows and POSIX systems.
with open("output_analysis/output.csv", encoding="utf-8") as file:
    for line in file:
        quoted_label = re.search(r'".*"', line).group()
        name = quoted_label.split(" - ")[1][:-5]
        output_songs.append(name)

In [6]:
# Keep only the songs whose title appears in output.csv and whose subgenre is
# an actual label.
# NOTE(review): the match is on title alone, so same-titled songs by different
# artists are all kept.
in_output = all_songs['name'].isin(output_songs)
has_genre = all_songs['subgenre'] != "No Labeled Genre"

# `.copy()` detaches the selection from `all_songs`; the column assignments made
# on `filtered_df` afterwards therefore do not raise SettingWithCopyWarning.
filtered_df = all_songs[in_output & has_genre].copy()

In [21]:
# Positions inside each song's `features` list that are left out of the export.
# NOTE(review): the meaning of positions 0, 3 and 6 is not visible in this
# notebook - document which features they are.
SKIPPED_FEATURE_POSITIONS = {0, 3, 6}

# Write one line per song: the kept feature values, each followed by a space,
# then the quoted label "<artist> - <name>".
# The `with` block closes the file even if a row raises part-way through.
with open("output_analysis/output-spotify.csv", "w", encoding='utf-8') as file:
    for index, row in filtered_df.iterrows():
        for position, data in enumerate(row['features']):
            if position in SKIPPED_FEATURE_POSITIONS:
                continue
            file.write(str(data) + " ")
        file.write(f"\"{row['artist']} - {row['name']}\"\n")

In [7]:
import os

def read_all_clusters(path, column_name, spotify=False, df=None):
    """Write cluster ids read from a folder of cluster files into a DataFrame column.

    Every entry of ``path`` is visited in sorted name order and numbered from 0,
    including entries that contain no "ID" lines. For each line starting with
    "ID", the song title is extracted from the trailing "<artist> - <title>"
    label, and every row whose ``name`` equals that title gets the entry's
    number in ``column_name``. Rows that never match keep their existing value.

    Args:
        path: Folder holding the cluster files. A trailing separator is optional.
        column_name: Column of the frame that receives the cluster number.
        spotify: Selects the line layout. False: the label starts at the 66th
            space-separated token and its last 5 characters are dropped. True:
            the label starts at the 9th token and only the last character (the
            newline) is dropped.
        df: Frame to update in place. Defaults to the notebook-level
            ``filtered_df``, which is what existing two- and three-argument
            calls operate on.

    Raises:
        IndexError: If an "ID" line's label contains no " - " separator.

    Notes:
        Sorting the listing makes the numbering reproducible, because
        ``os.listdir`` returns entries in arbitrary order.
        NOTE(review): rows are matched on title only, so same-titled songs by
        different artists all receive the number of the last matching line.
    """
    target = filtered_df if df is None else df

    # Token index where the label starts, and the (negative) slice end that
    # strips the label's trailing characters.
    if spotify:
        label_start, trailing = 8, -1
    else:
        label_start, trailing = 65, -5

    for cluster_id, file_name in enumerate(sorted(os.listdir(path))):
        # os.path.join works with or without a trailing separator on `path`;
        # the `with` block closes each file as soon as it has been read.
        with open(os.path.join(path, file_name), "r", encoding='utf-8') as cluster_file:
            for line in cluster_file:
                if not line.startswith("ID"):
                    continue
                label = " ".join(line.split(" ")[label_start:])
                # Split on the first " - " only, so a title that itself
                # contains " - " is kept whole.
                name = label.split(" - ", 1)[1][:trailing]
                target.loc[target['name'] == name, column_name] = cluster_id

In [8]:
# Fill the CNN-Cluster-Cos-30 / -252 columns from the matching
# elki-FINAL-CNN-*-COS folders.
# Forward slashes keep the paths valid on both Windows and POSIX systems.
for n_clusters in (30, 252):
    column_name = f'CNN-Cluster-Cos-{n_clusters}'
    filtered_df[column_name] = 0  # rows never matched by a cluster file keep 0
    read_all_clusters(f"output_analysis/elki-FINAL-CNN-{n_clusters}-COS/", column_name)

In [9]:
# Fill the Linear-Cluster-Cos-30 / -252 columns from the matching
# elki-FINAL-Linear-*-COS folders.
# Forward slashes keep the paths valid on both Windows and POSIX systems.
for n_clusters in (30, 252):
    column_name = f'Linear-Cluster-Cos-{n_clusters}'
    filtered_df[column_name] = 0  # rows never matched by a cluster file keep 0
    read_all_clusters(f"output_analysis/elki-FINAL-Linear-{n_clusters}-COS/", column_name)

In [10]:
# Fill the Classification-30 / -252 columns from the matching
# elki-TEST-Classification-With-Reconstruction-* folders.
# Forward slashes keep the paths valid on both Windows and POSIX systems.
for n_clusters in (30, 252):
    column_name = f'Classification-{n_clusters}'
    filtered_df[column_name] = 0  # rows never matched by a cluster file keep 0
    read_all_clusters(
        f"output_analysis/elki-TEST-Classification-With-Reconstruction-{n_clusters}/",
        column_name,
    )

In [11]:
# Fill the Linear-mtg-only-FlashAttention-CNN-2048-30 / -252 columns from the
# folders of the same name.
# Forward slashes keep the paths valid on both Windows and POSIX systems.
for n_clusters in (30, 252):
    column_name = f'Linear-mtg-only-FlashAttention-CNN-2048-{n_clusters}'
    filtered_df[column_name] = 0  # rows never matched by a cluster file keep 0
    read_all_clusters(
        f"output_analysis/elki-TEST-Linear-mtg-only-FlashAttention-CNN-2048-{n_clusters}/",
        column_name,
    )

In [12]:
# Fill the Linear-mtg-only-FlashAttention-Triplet-loss-30 / -252 columns from
# the folders of the same name.
# Forward slashes keep the paths valid on both Windows and POSIX systems.
for n_clusters in (30, 252):
    column_name = f'Linear-mtg-only-FlashAttention-Triplet-loss-{n_clusters}'
    filtered_df[column_name] = 0  # rows never matched by a cluster file keep 0
    read_all_clusters(
        f"output_analysis/elki-TEST-Linear-mtg-only-FlashAttention-Triplet-loss-{n_clusters}/",
        column_name,
    )

In [13]:
# Fill the Linear-mtg-only-30 / -252 columns from the matching
# elki-TEST-Linear-mtg-only-* folders.
# Forward slashes keep the paths valid on both Windows and POSIX systems.
for n_clusters in (30, 252):
    column_name = f'Linear-mtg-only-{n_clusters}'
    filtered_df[column_name] = 0  # rows never matched by a cluster file keep 0
    read_all_clusters(f"output_analysis/elki-TEST-Linear-mtg-only-{n_clusters}/", column_name)

In [14]:
# Fill the Triplet-Fixed-30 / -252 columns from the
# ...-Triplet-Quartic-Push-Quadratic-Pull-FIX-* folders.
# Forward slashes keep the paths valid on both Windows and POSIX systems.
for n_clusters in (30, 252):
    column_name = f'Triplet-Fixed-{n_clusters}'
    filtered_df[column_name] = 0  # rows never matched by a cluster file keep 0
    read_all_clusters(
        "output_analysis/elki-TEST-Linear-mtg-only-FlashAttention-"
        f"Triplet-Quartic-Push-Quadratic-Pull-FIX-{n_clusters}/",
        column_name,
    )

In [15]:
# Fill the Linear-Masks-Cluster-Cos-30 / -252 columns from the matching
# elki-TEST-Linear-*-COS-MASKS folders.
# Forward slashes keep the paths valid on both Windows and POSIX systems.
for n_clusters in (30, 252):
    column_name = f'Linear-Masks-Cluster-Cos-{n_clusters}'
    filtered_df[column_name] = 0  # rows never matched by a cluster file keep 0
    read_all_clusters(f"output_analysis/elki-TEST-Linear-{n_clusters}-COS-MASKS/", column_name)

In [16]:
# Fill the Spotify-Cluster-Euc-30 / -252 columns from the matching
# elki-FINAL-Spotify-*-EUC folders. `spotify=True` selects the shorter line
# layout that read_all_clusters uses for these files.
# Forward slashes keep the paths valid on both Windows and POSIX systems.
for n_clusters in (30, 252):
    column_name = f'Spotify-Cluster-Euc-{n_clusters}'
    filtered_df[column_name] = 0  # rows never matched by a cluster file keep 0
    read_all_clusters(
        f"output_analysis/elki-FINAL-Spotify-{n_clusters}-EUC/",
        column_name,
        spotify=True,
    )

In [17]:
# Random label columns with 30 and 252 possible values (0..29 and 0..251).
# They are reported next to the clustering columns by the statistics cell.
# A fixed seed makes the reported numbers for these columns reproducible
# across kernel restarts.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

random_30 = "Random-30"
filtered_df[random_30] = rng.integers(low=0, high=30, size=len(filtered_df))

random_252 = "Random-252"
filtered_df[random_252] = rng.integers(low=0, high=252, size=len(filtered_df))

In [20]:
# Show the assembled frame: songs plus one column per clustering / random labelling.
filtered_df

Unnamed: 0,name,artist,id,subgenre,features,umbrella_genre,CNN-Cluster-Cos-30,CNN-Cluster-Cos-252,Linear-Cluster-Cos-30,Linear-Cluster-Cos-252,...,Linear-Bidirectional-Cos-30,Linear-Bidirectional-Cos-252,Linear-Masks-Cluster-Cos-30,Linear-Masks-Cluster-Cos-252,Linear1024-Cluster-Cos-30,Linear1024-Cluster-Cos-252,Spotify-Cluster-Euc-30,Spotify-Cluster-Euc-252,Random-30,Random-252
3,Everybody Wants To Rule The World,Tears For Fears,4RvWPyQ5RL0ao9LPZeSouE,new wave,"[112, 80, 65, -12, 10, 54, 251, 35, 5, 85]",Dance Pop & Retro Pop,5,162,27,180,...,8,151,20,136,3,113,22,56,18,80
5,Take on Me,a-ha,2WfaOiMkCvy7F5fcp2zZ8L,new wave,"[84, 90, 57, -8, 9, 88, 225, 2, 5, 84]",Dance Pop & Retro Pop,20,101,19,108,...,18,154,29,110,22,194,12,57,12,13
6,The Less I Know The Better,Tame Impala,6K4t31amVTZDgR3sKmwUJJ,indie,"[117, 74, 64, -4, 17, 79, 216, 1, 3, 84]",Alternative & Indie Rock,10,63,15,151,...,18,53,29,94,15,73,12,216,12,207
8,Losing My Religion,R.E.M.,31AOj9sFz2gM0O3hMARRBx,jangle pop,"[126, 86, 67, -5, 10, 80, 268, 18, 3, 83]","Global Pop, Art, Indie & Bedroom Pop",20,36,3,135,...,11,191,17,109,10,155,12,216,18,4
9,Highway to Hell,AC/DC,2zYzyRzz6pRmhPzyfMEC8s,rock,"[116, 91, 57, -5, 16, 42, 208, 6, 13, 83]",Classic Rock & Mainstream Rock,23,162,7,180,...,10,148,20,136,2,102,15,48,9,214
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4267,Ličiš Mi Na Nju,Acko,0JmCo343iWfVPVEhIt8RxH,shibuya-kei,"[131, 44, 47, -13, 14, 79, 179, 99, 5, 0]","Global Pop, Art, Indie & Bedroom Pop",23,130,4,145,...,16,52,18,63,17,201,24,191,20,25
4270,Life,strxwberrymilk,6fN2Mub6flbFQKm6z8Zith,breakcore,"[150, 72, 76, -6, 10, 60, 141, 0, 32, 0]","Breakbeat, Jungle & DnB",16,187,15,58,...,4,142,23,49,19,181,29,195,5,19
4289,Intro,Clean Tears,3DGzfsPHVHoJoeVuTlYgED,vocaloid,"[128, 90, 84, -6, 34, 53, 146, 3, 10, 0]",Electronic Mainstream,15,163,28,188,...,28,202,6,168,16,179,25,251,20,204
4317,Get Down,The Noisy Freaks,62DOE0zotqhiuIa9L5m9BV,electro swing,"[107, 87, 68, -5, 35, 84, 188, 3, 24, 0]","Synthwave, Retro & Alt-Electro",29,132,12,199,...,27,215,24,164,22,194,25,251,27,13


In [18]:
def umbrella_genre_cohesion(df, cluster_column_name):
    """Average, over clusters, of how concentrated each cluster's umbrella genres are.

    For every group of rows sharing a value in ``cluster_column_name``, the
    Shannon entropy (base 2) of the group's ``umbrella_genre`` distribution is
    divided by log2(number of distinct genres in the group) and subtracted
    from 1. A group with a single genre scores 1; a group whose genres are
    evenly spread scores 0.

    Args:
        df: Frame with an ``umbrella_genre`` column and the cluster column.
        cluster_column_name: Name of the column holding the cluster labels.

    Returns:
        The unweighted mean of the per-cluster scores.
    """
    def _cluster_score(members):
        shares = members['umbrella_genre'].value_counts(normalize=True)
        observed = -(shares * np.log2(shares)).sum()
        # With one genre log2(1) would be 0; dividing by 1 keeps the score at 1.
        ceiling = 1 if len(shares) <= 1 else np.log2(len(shares))
        return 1 - (observed / ceiling)

    return np.mean([_cluster_score(members) for _, members in df.groupby(cluster_column_name)])

In [19]:
def sub_genre_cohesion(df, cluster_column_name):
    """Average, over clusters, of how concentrated each cluster's subgenres are.

    For every group of rows sharing a value in ``cluster_column_name``, the
    Shannon entropy (base 2) of the group's ``subgenre`` distribution is
    divided by log2(number of distinct subgenres in the group) and subtracted
    from 1. A group with a single subgenre scores 1; a group whose subgenres
    are evenly spread scores 0.

    Args:
        df: Frame with a ``subgenre`` column and the cluster column.
        cluster_column_name: Name of the column holding the cluster labels.

    Returns:
        The unweighted mean of the per-cluster scores.
    """
    def _cluster_score(members):
        shares = members['subgenre'].value_counts(normalize=True)
        observed = -(shares * np.log2(shares)).sum()
        # With one subgenre log2(1) would be 0; dividing by 1 keeps the score at 1.
        ceiling = 1 if len(shares) <= 1 else np.log2(len(shares))
        return 1 - (observed / ceiling)

    return np.mean([_cluster_score(members) for _, members in df.groupby(cluster_column_name)])

In [20]:
def subgenre_partitioning_score(df, cluster_column_name):
    """Score how cleanly clusters separate the subgenres inside each umbrella genre.

    The frame is restricted to one umbrella genre at a time. Within that
    subset, every cluster gets ``1 - entropy / log2(n)`` for its ``subgenre``
    distribution, where n is the number of distinct subgenres in the cluster
    (a single-subgenre cluster scores 1). Cluster scores are averaged per
    umbrella genre, and those averages are averaged again.

    Args:
        df: Frame with ``umbrella_genre`` and ``subgenre`` columns and the
            cluster column.
        cluster_column_name: Name of the column holding the cluster labels.

    Returns:
        The unweighted mean of the per-umbrella-genre averages.
    """
    def _cluster_score(members):
        shares = members['subgenre'].value_counts(normalize=True)
        observed = -(shares * np.log2(shares)).sum()
        # With one subgenre log2(1) would be 0; dividing by 1 keeps the score at 1.
        ceiling = 1 if len(shares) <= 1 else np.log2(len(shares))
        return 1 - (observed / ceiling)

    per_genre_averages = []
    for umbrella in df['umbrella_genre'].unique():
        subset = df[df['umbrella_genre'] == umbrella]
        within = [_cluster_score(members) for _, members in subset.groupby(cluster_column_name)]
        # An umbrella genre that produced no groups contributes nothing.
        if within:
            per_genre_averages.append(np.mean(within))

    return np.mean(per_genre_averages)

In [21]:
def run_statistics(column_name):
    """Print one report line for a clustering column of ``filtered_df``.

    The line holds the column name, its umbrella_genre_cohesion score and its
    sub_genre_cohesion score (printed under the label "Subgenre Purity Score"),
    both with three decimals. subgenre_partitioning_score is not reported.

    Args:
        column_name: Name of the cluster-label column to evaluate.
    """
    umbrella_score = umbrella_genre_cohesion(filtered_df, column_name)
    subgenre_score = sub_genre_cohesion(filtered_df, column_name)

    report_line = f"{column_name} \t\tUmbrella Genre Cohesion: {umbrella_score:.3f}\t\t Subgenre Purity Score: {subgenre_score:.3f}"
    print(report_line)

In [22]:
# Report the scores of every compared labelling: first the -30 columns, then,
# after a blank separator, the -252 columns.
columns_30 = (
    'CNN-Cluster-Cos-30',
    'Linear-Cluster-Cos-30',
    "Linear-mtg-only-30",
    "Classification-30",
    "Triplet-Fixed-30",
    'Spotify-Cluster-Euc-30',
    'Random-30',
)
columns_252 = (
    'CNN-Cluster-Cos-252',
    'Linear-Cluster-Cos-252',
    "Linear-mtg-only-252",
    "Classification-252",
    "Triplet-Fixed-252",
    'Spotify-Cluster-Euc-252',
    'Random-252',
)

for reported_column in columns_30:
    run_statistics(reported_column)
print("\n")
for reported_column in columns_252:
    run_statistics(reported_column)

CNN-Cluster-Cos-30 		Umbrella Genre Cohesion: 0.144		 Subgenre Purity Score: 0.105
Linear-Cluster-Cos-30 		Umbrella Genre Cohesion: 0.155		 Subgenre Purity Score: 0.115
Linear-mtg-only-30 		Umbrella Genre Cohesion: 0.170		 Subgenre Purity Score: 0.124
Classification-30 		Umbrella Genre Cohesion: 0.185		 Subgenre Purity Score: 0.135
Triplet-Fixed-30 		Umbrella Genre Cohesion: 0.148		 Subgenre Purity Score: 0.110
Spotify-Cluster-Euc-30 		Umbrella Genre Cohesion: 0.179		 Subgenre Purity Score: 0.141
Random-30 		Umbrella Genre Cohesion: 0.115		 Subgenre Purity Score: 0.072


CNN-Cluster-Cos-252 		Umbrella Genre Cohesion: 0.244		 Subgenre Purity Score: 0.215
Linear-Cluster-Cos-252 		Umbrella Genre Cohesion: 0.276		 Subgenre Purity Score: 0.236
Linear-mtg-only-252 		Umbrella Genre Cohesion: 0.259		 Subgenre Purity Score: 0.219
Classification-252 		Umbrella Genre Cohesion: 0.153		 Subgenre Purity Score: 0.109
Triplet-Fixed-252 		Umbrella Genre Cohesion: 0.161		 Subgenre Purity Score: 0.134
Sp

In [31]:
run_statistics("Triplet-Fixed-30")

# Count songs per (subgenre, Linear-mtg-only-252 cluster) pair, then pivot the
# cluster ids into columns; pairs that never occur are filled with 0.
pair_sizes = filtered_df.groupby(['subgenre', 'Linear-mtg-only-252']).size()
genre_counts = pair_sizes.unstack(fill_value=0)
genre_counts

Linear-mtg-only-252,1,2,3,4,5,6,7,8,9,10,...,227,228,231,234,240,242,243,245,251,252
subgenre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
acid jazz,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
acid techno,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
acoustic rock,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
afro house,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
album rock,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vietnam indie,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
vinahouse,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
visual kei,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
vocaloid,0,0,3,0,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Show every row whose subgenre is exactly "zouk".
filtered_df.loc[filtered_df["subgenre"] == "zouk"]

Unnamed: 0,name,artist,id,subgenre,features,umbrella_genre,CNN-Cluster-Cos-30,CNN-Cluster-Cos-252,Linear-Cluster-Cos-30,Linear-Cluster-Cos-252,Linear1024-Cluster-Cos-30,Linear1024-Cluster-Cos-252,Spotify-Cluster-Euc-30,Spotify-Cluster-Euc-252,Random-30,Random-252
3229,Believing and Being (feat. ina),hitogoto,3O5xf8zvs03YRSStNRhgfu,zouk,"[173, 57, 61, -9, 13, 55, 117, 9, 20, 32]","Latin, Bossa & Reggae",4,67,13,140,6,121,22,250,26,234
