# Loading datasets into Dataframes

The first step is always to load the datasets we have.

Making necessary imports

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

Loading the necessary dataframes

In [7]:
# Dataset location, relative to the directory the notebook runs from
dataset_name = "music4all"
dataset_relative_location = "../../../dataset"
dataset_path = Path(dataset_relative_location) / dataset_name

# Collect every CSV in the dataset folder. Sorting makes the load order (and
# therefore the key order of `datasets`) independent of the filesystem.
csvsPattern = "*.csv"
csvs = sorted(dataset_path.glob(csvsPattern))

# Load each file into a DataFrame keyed by its file name without the extension.
# The files are read as tab-separated despite the .csv extension.
datasets = {
    csv_path.stem: pd.read_csv(csv_path, delimiter="\t")
    for csv_path in csvs
}

After loading all the CSV files provided by the Music4All dataset, we can start working with them; each one contains a different piece of information about the songs.

Let's take a look at the datasets we have:

In [8]:
# Show the names of the loaded tables (one key per CSV file stem)
datasets.keys()

dict_keys(['id_tags', 'id_metadata', 'id_information', 'listening_history', 'id_lang', 'id_genres'])

### Merging the datasets together can provide a lot of meaningful information

In [9]:
# Build one table per song by joining the per-song tables on their shared 'id' column:
# general information + genres + metadata (the metadata table has the release date info).
song_info = (
    datasets['id_information']
    .merge(datasets['id_genres'], on='id')
    .merge(datasets['id_metadata'], on='id')
)
song_info.head()

Unnamed: 0,id,artist,song,album_name,genres,spotify_id,popularity,release,danceability,energy,key,mode,valence,tempo,duration_ms
0,0009fFIM1eYThaPg,Cheryl,Rain on Me,3 Words,pop,3eObKIfHKJ1nAPh0wTxFCc,12.0,2009,0.635,0.746,6.0,1.0,0.548,110.973,229947
1,0010xmHR6UICBOYT,Oddisee,After Thoughts,The Beauty in All,underground hip hop,27szvF97Tu95GxN98N52fy,46.0,2013,0.591,0.513,7.0,0.0,0.263,172.208,325096
2,002Jyd0vN4HyCpqL,Blue Öyster Cult,ME 262,Secret Treaties,"hard rock,rock,classic rock",273lBFpxUCwisTpdnF9cVb,31.0,1974,0.319,0.925,2.0,1.0,0.658,157.63,285693
3,006TYKNjNxWjfKjy,Rhapsody,Flames of Revenge,Legendary Years (Re-Recorded),"symphonic metal,power metal,symphonic power metal",1qZgergQ41vaD4zBf3AKXR,33.0,2017,0.432,0.979,7.0,1.0,0.162,90.008,332867
4,007LIJOPQ4Sb98qV,The Chameleons,Nostalgia,What Does Anything Mean? Basically (2009 Remas...,"post-punk,new wave",6rVxJ3sN3Cz40MSLavbG1K,19.0,2009,0.357,0.708,9.0,1.0,0.47,123.904,326067


Let's check whether it contains any suspicious values.

In [10]:
# Summary statistics of the numeric columns, used to spot out-of-range values
song_info.describe()

Unnamed: 0,popularity,release,danceability,energy,key,mode,valence,tempo,duration_ms
count,109269.0,109269.0,109269.0,109269.0,109269.0,109269.0,109269.0,109269.0,109269.0
mean,35.080608,2005.813488,0.520449,0.667162,5.284079,0.624669,0.445504,122.753032,242504.6
std,14.756258,14.335056,0.173008,0.241372,3.560797,0.48421,0.252159,28.997936,100336.0
min,0.0,1013.0,0.0,0.0,0.0,0.0,0.0,0.0,7229.0
25%,25.0,2001.0,0.403,0.502,2.0,0.0,0.236,100.194,193481.0
50%,34.0,2011.0,0.528,0.712,5.0,1.0,0.424,121.079,227533.0
75%,45.0,2016.0,0.645,0.872,9.0,1.0,0.639,140.047,271533.0
max,95.0,2019.0,0.988,1.0,11.0,1.0,0.998,242.903,4995315.0


The number of songs seems to match, but the minimum release year is 1013, which cannot be right. In fact, every song with a release year <= 1900 has a wrong release date:

In [11]:
# Show the rows whose release year is implausibly old (1900 or earlier)
song_info[song_info["release"] <= 1900]

Unnamed: 0,id,artist,song,album_name,genres,spotify_id,popularity,release,danceability,energy,key,mode,valence,tempo,duration_ms
10350,5pClePd8cdUOGQe2,Los Espíritus,Lo Echaron del Bar,Los Espíritus,"latin,rock,psychedelic rock,latin,rock,psyched...",45LCjFwH5veyV1twkPFKVO,35.0,1013,0.595,0.761,7.0,0.0,0.868,166.503,268507
13762,7oBqYSOUt2PehcJd,Los Espíritus,Jesús rima con cruz,Los Espíritus,"latin,rock,psychedelic rock,latin,rock,psyched...",0ytnXiwu42kNYnBvS48nc5,27.0,1013,0.443,0.688,7.0,1.0,0.717,95.729,296680
62143,ZEuwJ4wJH5GW6ay9,Bukka White,Parchman Farm Blues,Good Gin Blues,"blues,delta blues",09sKCJcVUC24vgpaamdkrj,30.0,1899,0.548,0.335,9.0,0.0,0.656,98.688,162067
83323,lHUghlsxSApSGZ4i,Sidney Polak,Otwieram Wino - feat. Pezet,(Untitled),"reggae,rap,reggae",39Cw7Nac50scksR3vMREQy,3.0,1900,0.721,0.737,8.0,1.0,0.878,99.091,238573
83679,lUJHN9keOtaNGCYK,Los Espíritus,noches de verano,Los Espíritus,"latin,rock,psychedelic rock",5sFEbn2q1dI0xO06BKkcH1,40.0,1013,0.639,0.512,8.0,0.0,0.598,127.451,299880


In [12]:
# Fix wrong release years for some songs.
#
# The corrected column is assigned back to `song_info` instead of calling
# `replace(..., inplace=True)` on `song_info['release']`: an in-place method on a
# selected column is chained assignment, which pandas warns about and which does
# not update `song_info` when copy-on-write is enabled.
release_corrections = {
    1013: 2013,  # Los Espíritus album was released in 2013
    1899: 1940,  # Bukka White - Parchman Farm Blues was released in 1940
    1900: 2018,  # Sidney Polak - Otwieram Wino (feat. Pezet) was released in 2018 according to Amazon
}
song_info['release'] = song_info['release'].replace(release_corrections)

## Removing empty MP3 Songs

The details of how they are identified are in the `identifying_empty_songs.py` script.

In [14]:
# Load the IDs of the songs whose MP3 is empty (one ID per line).
# Surrounding whitespace is stripped and blank lines are skipped so that a
# trailing newline or stray spaces in the file cannot produce an empty entry
# or keep an ID from matching.
with open("empty_mp3_ids.txt", "r") as file:
    empty_songs = [line.strip() for line in file if line.strip()]

# Keep only the songs whose id is not in the empty-MP3 list
cleaned_song_info = song_info[~song_info.id.isin(empty_songs)]


In [18]:
# Row counts before and after dropping the empty-MP3 songs, one per line
print(len(song_info), len(cleaned_song_info), sep="\n")

109269
109178


## We can now export the cleaned dataset

We will often use a partitioned dataset for computational reasons: the full dataset is too big to work with.

In [19]:
# Directory that receives the exported CSVs. It is created up front because
# DataFrame.to_csv does not create missing parent directories.
output_dir = Path("../dataset/cleaned_datasets")
output_dir.mkdir(parents=True, exist_ok=True)

# Export smaller partitions made of the first `size` rows of the cleaned table.
# The file name carries the actual row count, which is smaller than `size`
# if the cleaned table has fewer rows.
df_sizes_to_work_with = [20000, 30000]
for size in df_sizes_to_work_with:
    partitioned_df = cleaned_song_info.iloc[:size]
    partitioned_df.to_csv(output_dir / f"cleaned_song_info_cleaned_{len(partitioned_df)}_entries.csv")

# Export the full cleaned table as well
cleaned_song_info.to_csv(output_dir / f"cleaned_song_info_cleaned_{len(cleaned_song_info)}_entries.csv")