In [1]:
import pandas as pd
import copy
from collections import Counter

CSV_LOAD_PATH = "../dataset/Spotify/spotify_dataset.csv"
CSV_SAVE_PATH = "../dataset/Spotify/spotify_dataset_clean.csv"

# Load original spotify dataset and perform initial cleaning

In [2]:
df = pd.read_csv(CSV_LOAD_PATH)
df.drop(columns=["Unnamed: 0"], inplace=True)

# The guy that made the dataset as added copy rows for different `popularity` and `track_genre`.
# I'm not really that concern about the popularity score, so I will just pick the max `popularity` score when there's more.
# Regarding the `track_genre`, I'll simply combine it to a list instead and deal with one hot encodings are whatever later on
df_cleaned = []
df_count = df.nunique().copy() * 0
for track_id, df_track in df.groupby("track_id"):
    if (len(df_track) == 1) or ((df_track.nunique() > 1).sum() <= 1):
        cleaned_row = copy.deepcopy(df_track.iloc[0])
        cleaned_row["track_genre"] = [cleaned_row["track_genre"]]
    else:
        df_count += (df_track.nunique() > 1).astype(int)
        df_track.sort_values(by=["popularity"], ascending=False)
        cleaned_row = copy.deepcopy(df_track.iloc[0])
        cleaned_row["track_genre"] = pd.unique(df_track["track_genre"]).tolist()
    df_cleaned.append(cleaned_row)
df = pd.DataFrame(df_cleaned)

# Seems unwise to have semicolons in a csv for no good reason
df["artists"] = df["artists"].apply(lambda x: str(x).split(";"))

# I generate the search query by: track_name - artist_1, ... artist_n, Official, music video
df["youtube_search_query"] = df.apply(lambda x: f"{x['track_name']} - {', '.join(x['artists'])}, Official, music video", axis=1)

# There's only a single row with problems, so I'm not going to do anything fancy, will just remove it
df = df[~df.isna().any(axis=1)]

# An explicit track is one that has curse words or language or art that is generally deemed sexual, violent, or offensive.
# TODO: Should I remove this??
df = df[~df["explicit"]]

# TODO: This is probably too aggressive and hence wasteful, but for the time being I'm going to focus solely on the more popular songs
df = df[df["popularity"] > 25]
df = df.sort_values(by=["popularity"], ascending=False)

# Remove music with very uncommon genres
genres = []
for row in df["track_genre"]:
    assert isinstance(row, list)
    genres += row
occurrences = Counter(genres)
unacceptable_keys = [k for k,v in occurrences.items() if (v < 100)]
bad_rows = df["track_genre"].apply(lambda row: any(k in row for k in unacceptable_keys))
df = df[~bad_rows]

# Wrap up
df["original_index"] = df.index
df = df.reset_index(drop=True)
df["youtube_url"] = None
df["youtube_title"] = None
df.to_csv(CSV_SAVE_PATH, index=False)

# Sanity check
df = pd.read_csv(CSV_SAVE_PATH)
df.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,...,instrumentalness,liveness,valence,tempo,time_signature,track_genre,youtube_search_query,original_index,youtube_url,youtube_title
0,3nqQXoyQOWXiESFLlDF1hG,"['Sam Smith', 'Kim Petras']",Unholy (feat. Kim Petras),Unholy (feat. Kim Petras),100,156943,False,0.714,0.472,2,...,5e-06,0.266,0.238,131.121,4,['dance'],"Unholy (feat. Kim Petras) - Sam Smith, Kim Pet...",20001,,
1,2tTmW7RDtMQtBk7m2rYeSw,"['Bizarrap', 'Quevedo']","Quevedo: Bzrp Music Sessions, Vol. 52","Quevedo: Bzrp Music Sessions, Vol. 52",99,198937,False,0.621,0.782,2,...,0.033,0.23,0.55,128.033,4,['hip-hop'],"Quevedo: Bzrp Music Sessions, Vol. 52 - Bizarr...",51664,,
2,4h9wh7iOZ0GGn8QVp4RAOB,['OneRepublic'],I Ain’t Worried (Music From The Motion Picture...,I Ain't Worried,96,148485,False,0.704,0.797,0,...,0.000745,0.0546,0.825,139.994,4,['piano'],"I Ain't Worried - OneRepublic, Official, music...",79000,,
3,4LRPiXqCikLlN15c3yImP7,['Harry Styles'],As It Was,As It Was,95,167303,False,0.52,0.731,6,...,0.00101,0.311,0.662,173.93,4,['pop'],"As It Was - Harry Styles, Official, music video",81052,,
4,6xGruZOHLs39ZbVccQTuPZ,['Joji'],Glimpse of Us,Glimpse of Us,94,233456,False,0.44,0.317,8,...,5e-06,0.141,0.268,169.914,3,['pop'],"Glimpse of Us - Joji, Official, music video",81102,,
