# Data Exploration

In [1]:
import os, sys

# Locate the repository root by walking up from the current working directory,
# so the notebook behaves the same no matter which sub-folder it is started from.
target_folder = 'MMSR25-26-Group-E'
current_path = os.getcwd()

while os.path.basename(current_path) != target_folder:
    # When the walk passes through 'RetrievalAlgorithm', register that directory
    # on sys.path (once) so that packages located inside it become importable.
    if os.path.basename(current_path) == 'RetrievalAlgorithm':
        if current_path not in sys.path:
            sys.path.append(current_path)
    parent = os.path.dirname(current_path)
    if parent == current_path:
        # dirname() of the filesystem root is the root itself, i.e. the target
        # folder is not an ancestor of the starting directory. Fail loudly here
        # instead of looping forever.
        raise FileNotFoundError(
            f"Could not find a parent directory named '{target_folder}' "
            f"above '{os.getcwd()}'."
        )
    current_path = parent

# Switch to the repository root only after it has been found, so a failed lookup
# leaves the working directory untouched.
os.chdir(current_path)

from src.utils.data_loading import load_all_tsv_files_from_path

## Load the Data

In [2]:
# Load the .tsv files from the 'Dataset' folder; the result is indexed by file name
# in the cells below. NOTE(review): 'Dataset' is a relative path, presumably resolved
# against the current working directory - confirm in load_all_tsv_files_from_path.
dataset_files_dict = load_all_tsv_files_from_path(path_to_dataset='Dataset')

Loading .tsv files: 100%|██████████| 9/9 [00:00<00:00, 441.04it/s]


In [3]:
# Print a 1-based numbered list of the names under which the files were loaded.
print('Loaded .tsv files:')
for position, tsv_name in enumerate(dataset_files_dict, start=1):
    print(f'{position}. {tsv_name}')

Loaded .tsv files:
1. id_gems_mmsr.tsv
2. id_genres_mmsr.tsv
3. id_information_mmsr.tsv
4. id_lyrics_bert_mmsr.tsv
5. id_metadata_mmsr.tsv
6. id_mfcc_bow_mmsr.tsv
7. id_total_listens.tsv
8. id_url_mmsr.tsv
9. id_vgg19_mmsr.tsv


## Exploratory Data Analysis (for each file)

__Total tracks/songs count:__ 4148

### `id_gems_mmsr.tsv`

Columns:
* `id`: the internal id of the track/song
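* `0`–`8`: nine numeric feature columns (presumably the GEMS emotion dimensions — TODO: document each one)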

In [4]:
# describe() summarises numeric columns only, so the `id` column is left out and
# the table shows statistics for the value columns.
dataset_files_dict['id_gems_mmsr.tsv'].describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0
mean,-0.183088,0.041282,-0.190686,-0.291289,-0.192677,-0.294513,0.150429,0.095949,0.526503
std,0.578547,0.3384,0.781575,0.686354,0.731775,0.664177,0.58793,0.726573,0.988329
min,-2.190505,-0.649236,-2.120379,-2.31632,-2.14632,-2.535721,-1.644288,-1.022236,-1.250182
25%,-0.515224,-0.178601,-0.724413,-0.75307,-0.711435,-0.748105,-0.269378,-0.420787,-0.245645
50%,-0.156742,-0.04386,-0.301585,-0.236021,-0.289462,-0.35259,0.194037,-0.145968,0.278756
75%,0.159952,0.168171,0.349182,0.217827,0.301234,0.126575,0.601988,0.43565,1.071066
max,1.805748,1.922737,2.860255,1.860175,2.582894,2.456159,1.958446,3.552761,4.310461


### `id_genres_mmsr.tsv`

This file contains the genre(s) associated with each track. The genres are stored as lists, which makes this a multi-label problem, i.e. one track can have multiple genres assigned to it.


| Column | Type        | Description                                                                      |
|--------|-------------|----------------------------------------------------------------------------------|
| `id`   | `str`       | (internal) id for each track                                                     |
| `genre`| `List[str]` | List of genres associated with the track (a track can belong to multiple genres) |

In [5]:
# Display the raw frame; pandas abbreviates the output to the first and last rows.
dataset_files_dict['id_genres_mmsr.tsv']

Unnamed: 0,id,genre
0,01rMxQv6vhyE1oQX,"['rock', 'pop punk']"
1,02ZnlCGZEbkfCDxo,"['pop', 'italian pop', 'latin', 'europop', 'am..."
2,04OjszRi9rC5BlHC,"['experimental', 'folk', 'lo fi', 'freak folk'..."
3,04iitW3ffa0mhpx3,"['pop', 'r b', 'hip hop', 'soul', 'rhythm and ..."
4,04xUDjAYC14jsHyH,"['punk', 'emo', 'post hardcore', 'post punk', ..."
...,...,...
4143,zxYtSeZzEVgPczJz,"['rock', 'grunge', 'alternative rock', 'countr..."
4144,zxlnGZoud2KCmSaw,"['rock', 'indie rock', 'singer songwriter']"
4145,zyzILCQvVeUFIINi,"['rock', 'pop', 'indie rock', 'folk rock', 'ne..."
4146,zzpkRCGA5ud8q4mv,"['soul', 'blues', 'r b', 'blues rock', 'southe..."


### `id_information_mmsr.tsv`

| Column       | Type | Description                                               |
|--------------|------|-----------------------------------------------------------|
| `id`         | `str`  | (internal) id for each track                              |
| `artist`     | `str`  | Name of the artist/band/singer associated with this song. |
| `song`       | `str`  | Name of the song.                                         |
| `album_name` | `str`  | Name of the album associated with this song.              |

In [6]:
# Display the track information table; pandas abbreviates the output to the first
# and last rows.
dataset_files_dict['id_information_mmsr.tsv']

Unnamed: 0,id,artist,song,album_name
0,01rMxQv6vhyE1oQX,Against the Current,Chasing Ghosts,In Our Bones
1,02ZnlCGZEbkfCDxo,Laura Pausini,Tra Te E Il Mare,The Best of Laura Pausini - E Ritorno Da Te
2,04OjszRi9rC5BlHC,Grizzly Bear,Knife,Yellow House
3,04iitW3ffa0mhpx3,Ne-Yo,Miss Independent,Year Of The Gentleman (Bonus Track Edition)
4,04xUDjAYC14jsHyH,Jawbreaker,Jinx Removing,24 Hour Revenge Therapy (Remastered)
...,...,...,...,...
4143,zxYtSeZzEVgPczJz,Meat Puppets,Climbing,II
4144,zxlnGZoud2KCmSaw,of Montreal,Gelid Ascent,Paralytic Stalks
4145,zyzILCQvVeUFIINi,Crowded House,When You Come,Temple Of Low Men
4146,zzpkRCGA5ud8q4mv,Otis Redding,Rock Me Baby,Otis Blue


In [7]:
# Every column in this file is a string, so describe() reports
# count/unique/top/freq instead of numeric statistics.
dataset_files_dict['id_information_mmsr.tsv'].describe()

Unnamed: 0,id,artist,song,album_name
count,4148,4148,4148,4148
unique,4148,2256,3997,3420
top,01rMxQv6vhyE1oQX,Eminem,Home,Unleashed
freq,1,16,5,6


### `id_lyrics_bert_mmsr.tsv`

| Column  | Type            | Description                                                         |
|---------|-----------------|---------------------------------------------------------------------|
| `id`    | `str`             | (internal) id for each track                                        |
| `0-767` | `numpy.float64` | The 768-dimensional embedding of the track's lyrics produced by BERT. |

In [8]:
# Per-dimension summary statistics of the embedding columns; pandas abbreviates the
# wide table with "..." between the first and last columns.
dataset_files_dict['id_lyrics_bert_mmsr.tsv'].describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
count,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,...,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0
mean,0.018854,0.029054,-0.004259,-0.026782,-0.010954,0.008515,-0.03056,-0.012941,-0.014804,-0.001822,...,-0.021756,-0.004786,0.022499,0.005203,-0.00773,-0.000393,-0.019284,0.007258,-0.01378,-0.010899
std,0.020318,0.025392,0.011788,0.020622,0.019137,0.015924,0.024441,0.016438,0.022237,0.016855,...,0.024287,0.0194,0.024676,0.016812,0.01227,0.024448,0.017031,0.017172,0.021638,0.012994
min,-0.092596,-0.059589,-0.056007,-0.112804,-0.084138,-0.064109,-0.135089,-0.079608,-0.121818,-0.073961,...,-0.105809,-0.089666,-0.089188,-0.063072,-0.063777,-0.114975,-0.092711,-0.080904,-0.095451,-0.062506
25%,0.005592,0.012449,-0.011908,-0.040958,-0.023045,-0.002131,-0.04681,-0.023429,-0.028466,-0.012938,...,-0.037891,-0.017427,0.00663,-0.005784,-0.01533,-0.015796,-0.029942,-0.003213,-0.027945,-0.019214
50%,0.01906,0.028274,-0.004785,-0.027586,-0.010887,0.008333,-0.030891,-0.012844,-0.014738,-0.001758,...,-0.022337,-0.005099,0.022902,0.005284,-0.007562,-0.001182,-0.018712,0.007345,-0.01419,-0.010914
75%,0.032206,0.045151,0.002892,-0.014019,0.001772,0.018938,-0.014327,-0.002418,-0.000909,0.00892,...,-0.005534,0.008199,0.039129,0.016002,0.000371,0.015526,-0.008425,0.018384,9.4e-05,-0.002701
max,0.107385,0.140573,0.053531,0.067351,0.075186,0.077539,0.076872,0.054285,0.085369,0.073629,...,0.073795,0.06401,0.118076,0.068651,0.054389,0.123002,0.044662,0.068755,0.072547,0.051369


### `id_metadata_mmsr.tsv`

This file contains metadata for the given tracks collected by the Spotify API. Note that some features like `popularity` are defined by Spotify and cannot be compared with other datasets/sources.

| Column         | Type                                  | Description                                                                                                              |
|----------------|---------------------------------------|--------------------------------------------------------------------------------------------------------------------------|
| `id`           | `str`                                 | (internal) id for each track                                                                                             |
| `spotify_id`   | `str`                                 | The Spotify id for each track (external).                                                                                |
| `popularity`   | `numpy.float64`                       | (Spotify) popularity score from (0-100).                                                                                 |
| `release`      | `numpy.int64`                         | The year this track was released.                                                                                        |
| `danceability` | `numpy.float64`                       | rhythmic stability, beat clarity, ratio from (0-1).                                                                      |
| `energy`       | `numpy.float64`                       | Measures intensity/power, ratio from (0-1).                                                                              |
| `key`          | `numpy.float64` but in reality `int`  | Musical key (0-11).                                                                                                      |
| `mode`         | `numpy.float64` but in reality `bool` | 1 = major (brighter, happier, or more open harmonic character) <br/> 0 = minor (darker, sadder, or more tense harmonic). |
| `valence`      | `numpy.float64`                       | musical positivity, ratio from (0-1).                                                                                    |
| `tempo`        | `numpy.float64`                       | (Estimated) tempo in beats per minute (BPM).                                                                             |
| `duration_ms`  | `numpy.int64`                         | Duration of the song measured in milliseconds.                                                                           |

In [9]:
# Summary statistics of the numeric metadata columns; the string columns `id` and
# `spotify_id` are not part of the describe() output.
dataset_files_dict['id_metadata_mmsr.tsv'].describe()

Unnamed: 0,popularity,release,danceability,energy,key,mode,valence,tempo,duration_ms
count,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0
mean,35.358004,2004.619335,0.516816,0.701718,5.235535,0.624397,0.449623,124.076839,242972.5
std,14.495564,12.841111,0.166797,0.230394,3.552591,0.484337,0.245873,28.446496,82553.51
min,0.0,1937.0,0.0647,0.00979,0.0,0.0,1e-05,48.359,31493.0
25%,25.0,1999.0,0.40675,0.544,2.0,0.0,0.248,101.99725,198156.8
50%,35.0,2009.0,0.519,0.755,5.0,1.0,0.427,122.0315,230467.0
75%,45.0,2014.0,0.638,0.898,8.0,1.0,0.639,142.4425,271080.0
max,88.0,2019.0,0.971,0.999,11.0,1.0,0.978,217.489,1512685.0


### `id_mfcc_bow_mmsr.tsv`

This file contains the Bag-of-Words vectors of MFCC audio features where each column represents the frequency with which a track’s MFCC frames fall into a specific timbre cluster.

| Column                  | Type            | Description                                                                                                                        |
|-------------------------|-----------------|------------------------------------------------------------------------------------------------------------------------------------|
| `id`                    | `str`           | (internal) id for each track                                                                                                       |
| `mfccB000` to `mfccB499` | `numpy.float64` | Contains the BoW histogram values of how frequently the track's MFCC audio frames fall into each of the 500 learned clusters |

In [10]:
# Summary statistics for each `mfccB*` column; pandas abbreviates the wide table
# with "..." between the first and last columns.
dataset_files_dict['id_mfcc_bow_mmsr.tsv'].describe()

Unnamed: 0,mfccB000,mfccB001,mfccB002,mfccB003,mfccB004,mfccB005,mfccB006,mfccB007,mfccB008,mfccB009,...,mfccB490,mfccB491,mfccB492,mfccB493,mfccB494,mfccB495,mfccB496,mfccB497,mfccB498,mfccB499
count,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,...,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0
mean,1.091092,0.006049,1.94514,0.044787,0.988864,0.019205,2.287352,2.03122,1.506329,1.125536,...,1.528315,1.70081,0.481419,0.850855,1.29965,0.652143,0.44422,1.292157,1.780631,2.032687
std,0.678043,0.065026,0.559459,0.203699,0.851181,0.178159,0.597987,0.539677,0.600723,0.636106,...,0.627317,0.708574,0.615657,0.715199,0.681564,0.689323,0.605046,0.623523,0.557868,0.613745
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.477121,0.0,1.70757,0.0,0.0,0.0,2.089905,1.755875,1.176091,0.69897,...,1.176091,1.322219,0.0,0.0,0.845098,0.0,0.0,0.90309,1.477121,1.748188
50%,1.176091,0.0,2.053078,0.0,0.954242,0.0,2.431364,2.11059,1.591065,1.176091,...,1.623249,1.845098,0.30103,0.845098,1.342423,0.477121,0.0,1.342423,1.851258,2.155336
75%,1.623249,0.0,2.322219,0.0,1.70757,0.0,2.678746,2.396199,1.934498,1.60206,...,1.986772,2.217484,0.845098,1.39794,1.812913,1.176091,0.778151,1.748188,2.158362,2.457882
max,2.995635,1.579784,3.167613,2.394452,3.278067,3.476976,3.306639,3.281261,3.12156,2.984077,...,3.252853,3.153205,2.882525,2.862131,3.12156,3.02735,2.766413,3.190051,3.25358,3.182415


### `id_total_listens.tsv`

This file contains the total number of times each track was listened to.

| Column          | Type          | Description                                    |
|-----------------|---------------|------------------------------------------------|
| `id`            | `str`         | (internal) id for each track                   |
| `total_listens` | `numpy.int64` | Total number of times this track was listened to. |

In [11]:
# Distribution summary of the single numeric column `total_listens`.
dataset_files_dict['id_total_listens.tsv'].describe()

Unnamed: 0,total_listens
count,4148.0
mean,5932.74783
std,9914.91215
min,55.0
25%,1175.0
50%,2847.5
75%,6670.0
max,156462.0


### `id_url_mmsr.tsv`

This file contains links to the YouTube video for each track.

| Column | Type  | Description                                         |
|--------|-------|-----------------------------------------------------|
| `id`   | `str` | (internal) id for each track                        |
| `url`  | `str` | Link to the YouTube video for the respective track. |

In [12]:
# Display the id -> URL table; pandas abbreviates the output to the first and last rows.
dataset_files_dict['id_url_mmsr.tsv']

Unnamed: 0,id,url
0,NDroPROgWm3jBxjH,https://www.youtube.com/watch?v=gPm2s6JORc4
1,y8wp2cUBzIEYsouc,https://www.youtube.com/watch?v=RYzQvj3icjs
2,pAzEb1oXeG9TYIvM,https://www.youtube.com/watch?v=juQ2rtxKzZk
3,XcDu72gipo0aCTDp,https://www.youtube.com/watch?v=o0JlWub9YGg
4,V9HVYlU2kekOQ4Bk,https://www.youtube.com/watch?v=99AlXMtTIBw
...,...,...
4143,tIpLPRZtXWBHINDG,https://www.youtube.com/watch?v=Io3PXcLPMHE
4144,xf9WfXxEpkwo13Sa,https://www.youtube.com/watch?v=df9_tLXj-jc
4145,UIRIKK5BttiXm5C1,https://www.youtube.com/watch?v=5-0kZ_jeIHI
4146,iuC7ksd9ZI6rBDQx,https://www.youtube.com/watch?v=broClUaE4rw


### `id_vgg19_mmsr.tsv`

This file contains the video embeddings for each track. These embeddings are the output of the `VGG-19` convolutional network model.

| Column                   | Type            | Description                                                            |
|--------------------------|-----------------|------------------------------------------------------------------------|
| `id`                     | `str`           | (internal) id for each track                                           |
| `max0000` to `max4095`   | `numpy.float64` | Contains the max activation for feature dimension i across all frames  |
| `mean0000` to `mean4095` | `numpy.float64` | Contains the mean activation for feature dimension i across all frames |

In [13]:
# Summary statistics for the `max*` and `mean*` activation columns; pandas
# abbreviates the wide table with "..." between the first and last columns.
dataset_files_dict['id_vgg19_mmsr.tsv'].describe()

Unnamed: 0,max0000,max0001,max0002,max0003,max0004,max0005,max0006,max0007,max0008,max0009,...,mean4086,mean4087,mean4088,mean4089,mean4090,mean4091,mean4092,mean4093,mean4094,mean4095
count,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,...,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0,4148.0
mean,0.984247,1.355592,0.471212,0.345295,1.944757,1.200065,1.213776,0.302253,1.35509,0.881081,...,0.362462,0.524479,3.565113,1.404776,0.797503,0.338751,0.231156,0.378103,0.554153,1.019901
std,1.024421,1.154833,0.69633,0.554702,1.578303,1.262514,1.108438,0.649426,1.335995,1.247093,...,0.570139,0.673991,2.052579,1.092296,0.868021,0.666918,0.436585,0.658942,0.718185,0.90162
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.287986,0.0,0.0,0.416109,0.0,0.157507,0.0,0.0,0.0,...,0.0,0.0,2.186022,0.616399,0.018059,0.0,0.0,0.0,0.005123,0.149158
50%,0.741177,1.260546,0.044798,0.077674,1.845449,0.854244,1.041168,0.0,1.09505,0.278537,...,0.127176,0.297754,3.579961,1.201407,0.604713,0.045329,0.010211,0.091632,0.290369,0.952532
75%,1.77734,2.104452,0.79376,0.52152,3.07127,2.149559,1.988018,0.292139,2.362311,1.400979,...,0.515229,0.790002,4.782521,2.041133,1.220884,0.337315,0.293,0.514533,0.825506,1.542169
max,7.54144,7.794068,5.666611,6.947582,10.297618,8.568845,7.469369,6.177575,7.170806,11.782614,...,7.237985,6.330817,14.477417,7.026731,7.697643,5.518033,5.767654,8.112298,5.566869,6.154485
