# Dataset Setup
Download each of the following Kaggle datasets and save it in the notebook's working directory under the file name shown before its link:

charts.csv: https://www.kaggle.com/datasets/dhruvildave/spotify-charts (2017-2021)

tracks_features.csv: https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs

Spotify 2010 - 2019 Top 100.csv: https://www.kaggle.com/datasets/muhmores/spotify-top-100-songs-of-20152019

spotify_dataset.csv: https://www.kaggle.com/datasets/sashankpillai/spotify-top-200-charts-20202021

dataset.csv: https://www.kaggle.com/datasets/maharshipandya/-spotify-tracks-dataset

data.csv: https://www.kaggle.com/datasets/vatsalmavani/spotify-dataset

SpotifyFeatures.csv: https://www.kaggle.com/datasets/nandhakumarss/spotify-song-tracks

train.csv and test.csv: https://www.kaggle.com/datasets/elemento/music-albums-popularity-prediction?select=test.csv

TikTok_songs_2019.csv: https://www.kaggle.com/datasets/sveta151/tiktok-popular-songs-2019

TikTok_songs_2020.csv: https://www.kaggle.com/datasets/sveta151/tiktok-popular-songs-2020

TikTok_songs_2021.csv: https://www.kaggle.com/datasets/sveta151/tiktok-popular-songs-2021

In [98]:
# package installs and imports
%pip install pandasql  
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import pandasql as ps

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\Joe\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip' command.


In [99]:
# Read every downloaded CSV into its own pandas DataFrame.
# All paths are bare file names, so the files are looked up in the working directory.

# chart entries that the song-feature datasets are matched against
charts_df = pd.read_csv('charts.csv')

# candidate song-feature datasets (one CSV per frame, in numbering order)
(
    song_features_1_df,
    song_features_2_df,
    song_features_3_df,
    song_features_4_df,
    song_features_5_df,
    song_features_6_df,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'tracks_features.csv',
        'Spotify 2010 - 2019 Top 100.csv',
        'spotify_dataset.csv',
        'dataset.csv',
        'data.csv',
        'SpotifyFeatures.csv',
    )
)

# album datasets: train.csv is number 7 and test.csv is number 10; the numbers in
# between (8, 9) and after (11, 12) are used by the EDA cell for the additional
# track-name columns of these two files
song_features_7_df = pd.read_csv('train.csv')
song_features_10_df = pd.read_csv('test.csv')

# TikTok song lists, one file per year
tiktok_19_df, tiktok_20_df, tiktok_21_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        'TikTok_songs_2019.csv',
        'TikTok_songs_2020.csv',
        'TikTok_songs_2021.csv',
    )
)

# Song Feature EDA

This section explores the candidate datasets of songs and song features. The questions we want to answer are:

1. Can we find song features for most of the songs in 'charts.csv'?

2. Do we have to combine datasets to do so?

3. If we have to combine datasets, what fields do they share?

4. What kind of cleaning do we need to do, and how much of it?

In [100]:
# Goal: measure what share of the distinct (title, artist) pairs in charts_df can be
# found in each song-features dataset, and in all of them combined.
#
# Matching is an exact string-equality join on (title, artist), so any difference in
# spelling, casing or artist formatting between datasets counts as "no match".


def _strip_list_syntax(artists):
    """Remove the '[', ']' and "'" characters from every string in `artists`.

    `regex=False` is passed explicitly so that '[' and ']' are always treated as
    literal characters instead of depending on the pandas version's default for
    `regex` (and on its deprecated single-character special case).
    """
    for literal_char in ('[', ']', "'"):
        artists = artists.str.replace(literal_char, '', regex=False)
    return artists


def _project_title_artist(features_df, title_col, artist_col):
    """Return only `title_col` and `artist_col` of `features_df`, renamed to 'title' and 'artist'."""
    return features_df[[title_col, artist_col]].rename(columns={title_col: 'title', artist_col: 'artist'})


def _match_charts(chart_songs_df, projected_df):
    """Inner-join `chart_songs_df` with `projected_df` on (title, artist) and drop duplicate rows."""
    return pd.merge(chart_songs_df, projected_df, on=['title', 'artist'], how='inner').drop_duplicates()


def _match_percentage(matched_df, chart_songs_df):
    """Return the row count of `matched_df` as a percentage of the row count of `chart_songs_df`."""
    return 100 * matched_df.shape[0] / chart_songs_df.shape[0]


# first, since we want to find song features for as many of the songs in 'charts_df' as possible, let's start by
# projecting charts_df onto song title and artist name and dropping duplicate rows
charts_songs_artists_df = charts_df[['title', 'artist']].drop_duplicates()

# clean the artist columns in place (kept in place so the cleaned values stay visible on the loaded frames)
# datasets 1 and 5: strip the list punctuation ('[', ']', "'") from the artists strings
song_features_1_df['artists'] = _strip_list_syntax(song_features_1_df['artists'])
song_features_5_df['artists'] = _strip_list_syntax(song_features_5_df['artists'])
# datasets 7 and 10: delete every ', ' occurrence
# NOTE(review): this joins multi-artist strings without any delimiter - confirm that
# this matches how artists are written in charts.csv
song_features_7_df['artists'] = song_features_7_df['artists'].str.replace(', ', '', regex=False)
song_features_10_df['artists'] = song_features_10_df['artists'].str.replace(', ', '', regex=False)

# next, project every features dataset onto the shared ('title', 'artist') schema
song_features_1_df_projected = _project_title_artist(song_features_1_df, 'name', 'artists')
song_features_2_df_projected = _project_title_artist(song_features_2_df, 'title', 'artist')
song_features_3_df_projected = _project_title_artist(song_features_3_df, 'Song Name', 'Artist')
song_features_4_df_projected = _project_title_artist(song_features_4_df, 'track_name', 'artists')
song_features_5_df_projected = _project_title_artist(song_features_5_df, 'name', 'artists')
song_features_6_df_projected = _project_title_artist(song_features_6_df, 'track_name', 'artist_name')
# song_features_7_df contains three songs per album (t_name0..t_name2), so it yields projections 7, 8 and 9
song_features_7_df_projected = _project_title_artist(song_features_7_df, 't_name0', 'artists')
song_features_8_df_projected = _project_title_artist(song_features_7_df, 't_name1', 'artists')
song_features_9_df_projected = _project_title_artist(song_features_7_df, 't_name2', 'artists')
# song_features_10_df contains three songs per album as well, so it yields projections 10, 11 and 12
song_features_10_df_projected = _project_title_artist(song_features_10_df, 't_name0', 'artists')
song_features_11_df_projected = _project_title_artist(song_features_10_df, 't_name1', 'artists')
song_features_12_df_projected = _project_title_artist(song_features_10_df, 't_name2', 'artists')

projected_dfs = [
    song_features_1_df_projected,
    song_features_2_df_projected,
    song_features_3_df_projected,
    song_features_4_df_projected,
    song_features_5_df_projected,
    song_features_6_df_projected,
    song_features_7_df_projected,
    song_features_8_df_projected,
    song_features_9_df_projected,
    song_features_10_df_projected,
    song_features_11_df_projected,
    song_features_12_df_projected,
]

# join each projected features dataset with the chart songs on song title and artist name
merges = [_match_charts(charts_songs_artists_df, projected_df) for projected_df in projected_dfs]
(
    merge_1, merge_2, merge_3, merge_4, merge_5, merge_6,
    merge_7, merge_8, merge_9, merge_10, merge_11, merge_12,
) = merges

# percentage of distinct chart songs that each dataset can provide features for
match_percentages = [_match_percentage(merged_df, charts_songs_artists_df) for merged_df in merges]
(
    match_percentage_1, match_percentage_2, match_percentage_3, match_percentage_4,
    match_percentage_5, match_percentage_6, match_percentage_7, match_percentage_8,
    match_percentage_9, match_percentage_10, match_percentage_11, match_percentage_12,
) = match_percentages

# one report line per dataset, numbered 1..12 in the same order as projected_dfs
print("\n".join(
    "song_features_" + str(dataset_number) + "_df match percentage: " + str(round(percentage, 2))
    for dataset_number, percentage in enumerate(match_percentages, start=1)
))

# union of all matches: a chart song counts once no matter how many datasets contain it
combined_features_df = pd.concat(merges, ignore_index=True).drop_duplicates()
match_percentage_combined = _match_percentage(combined_features_df, charts_songs_artists_df)
print("combined features match percentage: " + str(round(match_percentage_combined,2)))
print("for a total of " + str(combined_features_df.shape[0]) + " out of " + str(charts_songs_artists_df.shape[0]) + " songs")

song_features_1_df match percentage: 5.41
song_features_2_df match percentage: 0.3
song_features_3_df match percentage: 0.79
song_features_4_df match percentage: 3.65
song_features_5_df match percentage: 6.25
song_features_6_df match percentage: 7.14
song_features_7_df match percentage: 2.7
song_features_8_df match percentage: 1.26
song_features_9_df match percentage: 1.08
song_features_10_df match percentage: 0.87
song_features_11_df match percentage: 0.52
song_features_12_df match percentage: 0.46
combined features match percentage: 17.58
for a total of 34720 out of 197535 songs
