# Clean History

> Normalize Data

In [None]:
# | default_exp clean_data

In [None]:
# | hide
import pickle
import json
import re
import pandas as pd

from pathlib import Path
from spotify_etl.core import *

In [None]:
# | hide
# Widen pandas' display limits so large frames print in full in the notebook.
display_options = {
    "display.max_rows": 500,
    "display.max_columns": 500,
    "display.width": 1000,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
# Read the exported streaming history, then run it through the project's cleaner.
history_dir = Path("streaming_history")
streaming_history = extract_streaming_history(history_dir)
# NOTE(review): the meaning of the 0.7 argument is defined by
# clean_streaming_history in spotify_etl.core — confirm there before changing it.
clean_history = clean_streaming_history(streaming_history, 0.7)

In [None]:
def _load_pickle(relative_path):
    """Load one pickled object through get_pickle_data from the given path."""
    return get_pickle_data(Path(relative_path))


# Metadata files read from the data/ directory.
track_metadata = _load_pickle("data/track_metadata.pkl")
artist_metadata = _load_pickle("data/artist_metadata.pkl")
album_metadata = _load_pickle("data/album_metadata.pkl")
audio_features = _load_pickle("data/audio_features.pkl")
raw_track_metadata = _load_pickle("data/raw_track_metadata.pkl")
raw_artist_metadata = _load_pickle("data/raw_artist_metadata.pkl")

In [None]:
def dict_to_df(dict, index_name=""):
    """Build a DataFrame with one row per top-level key of a mapping.

    The mapping is loaded with its keys as columns and transposed so that
    each key becomes a row; the keys are then moved out of the index into
    a regular column named ``index_name``.
    """
    # The first parameter shadows the built-in ``dict``; the name is kept
    # so existing keyword callers continue to work.
    keyed_rows = pd.DataFrame(dict).transpose()
    return keyed_rows.reset_index(names=[index_name])

In [None]:
# Turn each metadata mapping into a table; the mapping keys end up in the
# column named by the second argument.
track_df = dict_to_df(track_metadata, "track_id")
artist_df = dict_to_df(artist_metadata, "artist_id")
album_df = dict_to_df(album_metadata, "album_id")
# NOTE(review): the key column of the raw *track* table is labelled
# "album_id" — this looks like a copy/paste slip; confirm what
# raw_track_metadata is keyed by before relying on that column.
raw_track_df = dict_to_df(raw_track_metadata, "album_id")

## Clean Track Data

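Remove the artist-level columns from the track DataFrame, then add each track's duration, explicit flag, and popularity from the raw track metadata.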

In [None]:
# |code-fold: true
# Columns removed from the track table: artist-level fields plus "id".
dropped_track_columns = [
    "artist",
    "artist_genres",
    "artist_main_genre",
    "artist_secondary_genre",
    "artist_image",
    "main_artist_url",
    "id",
    "main_artist_uri",
]

# Per-track fields taken from the raw metadata; "id" is renamed so that it
# can be used as the join key against track_df.
raw_track_fields = raw_track_df[["duration_ms", "explicit", "popularity", "id"]]
raw_track_fields = raw_track_fields.rename(columns={"id": "track_id"})

# Final column names used by the rest of the notebook.
track_column_names = {
    "name": "song",
    "artist_id": "main_artist_id",
    "popularity": "song_popularity",
}

track_df = (
    track_df.drop(columns=dropped_track_columns)
    .merge(raw_track_fields, on="track_id", how="left")
    .rename(columns=track_column_names)
)

Writing a function to clean titles (album and track)

In [None]:
# | export
# |code-fold: true
def clean_title(title: str) -> str:
    """Return a simplified version of an album or track title.

    Steps, in order:

    1. Remove every ``(...)`` group, then every ``[...]`` group. Groups are
       removed innermost-first and one at a time, so text *between* two
       separate groups is kept ("One (Live) Two (Remastered)" becomes
       "One Two") and nested groups are still removed completely.
    2. Cut the title at the first " -" (space followed by a dash).
    3. Drop all non-ASCII characters.
    4. Cut the title at the first " :" (space followed by a colon).
    5. Collapse runs of whitespace left behind by the removals and strip
       both ends.

    Args:
        title: The raw title text.

    Returns:
        The cleaned title; may be an empty string.
    """
    # A greedy r"\(.*\)" would swallow everything from the first "(" to the
    # last ")", including real title text between two groups. Matching only
    # groups without inner brackets and repeating until nothing changes
    # handles both separate and nested groups.
    for enclosed_group in (r"\([^()]*\)", r"\[[^\[\]]*\]"):
        removed = 1
        while removed:
            title, removed = re.subn(enclosed_group, "", title)

    # Remove everything after a dash
    title = re.sub(r"^(.*?) -.*$", r"\1", title)
    # Remove non-ASCII characters
    title = re.sub(r"[^\x00-\x7F]+", "", title)
    # Remove after colon
    title = re.sub(r"^(.*?) :.*$", r"\1", title)

    # Removing a group from the middle of a title leaves a double space.
    return re.sub(r"\s+", " ", title).strip()

Splitting album images into large, medium, and small JSON strings

In [None]:
# |code-fold: true
def categorize_album_images(json_str):
    """Split a JSON-encoded image list into large, medium and small entries.

    The input is expected to decode to a list of objects that carry a
    ``height`` key. Images with height 640, 300 and 64 are treated as
    large, medium and small respectively; when several images share a
    height, the last one wins. Images with any other height are ignored.

    Args:
        json_str: JSON text describing the image list. Missing values
            (``None``, NaN) and other non-text inputs are tolerated.

    Returns:
        A ``(large, medium, small)`` tuple of JSON strings. A size with no
        matching image is ``"{}"``; so are all three when the input cannot
        be decoded or does not decode to a list.
    """
    try:
        images = json.loads(json_str)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError (invalid JSON text);
        # TypeError covers non-text inputs such as None or a NaN float,
        # which would otherwise abort a whole DataFrame.apply.
        return "{}", "{}", "{}"

    if not isinstance(images, list):
        # Valid JSON that is not an image list, e.g. "null".
        return "{}", "{}", "{}"

    # Separate dicts per size rather than one shared object.
    large, medium, small = {}, {}, {}
    for img in images:
        if not isinstance(img, dict):
            continue
        height = img.get("height")
        if height == 640:
            large = img
        elif height == 300:
            medium = img
        elif height == 64:
            small = img
    return json.dumps(large), json.dumps(medium), json.dumps(small)

In [None]:
# Run categorize_album_images on every album_images value and unzip the
# resulting 3-tuples into one column per image size.
# NOTE(review): zip(*...) yields nothing for an empty frame, so the three-way
# unpack would fail — confirm track_df is never empty at this point.
track_df["image_lg"], track_df["image_md"], track_df["image_sm"] = zip(
    *track_df.album_images.apply(categorize_album_images)
)

Cleaning Album Dates

In [None]:
# |code-fold: true
def categorize_decade(year):
    """Return a decade label for a release year.

    Years before 1950 are labelled "Oldie". Otherwise the label is the
    third character of the year's text form followed by "0s", e.g. 1987
    gives "80s" and 2005 gives "00s".
    """
    if not year < 1950:
        tens_digit = str(year)[2]
        return f"{tens_digit}0s"
    return "Oldie"

In [None]:
# Derive the release year from the first four characters of
# album_release_date when the precision is day, month or year; any other
# precision produces None.
# NOTE(review): the trailing astype(int) presumably raises if any row takes
# the None branch (or has a missing release date) — confirm every row carries
# one of the three precisions.
track_df["album_release_year"] = track_df.apply(
    lambda row: (
        row["album_release_date"][:4]
        if row["album_release_date_precision"] in ["day", "month", "year"]
        else None
    ),
    axis=1,
).astype(int)

In [None]:
# Label every track with the decade of its album's release year.
release_years = track_df["album_release_year"].astype(int)
track_df["album_decade"] = release_years.apply(categorize_decade)

## Clean Artist Data

In [None]:
# Let pandas infer better dtypes for every artist column except "images",
# which is left untouched.
non_image_columns = artist_df.columns[artist_df.columns != "images"]
artist_df[non_image_columns] = artist_df[non_image_columns].convert_dtypes()

Convert Artist Images

In [None]:
# |code-fold: true
def categorrize_img_size(json_str):
    """Split a JSON-encoded image list into xl, lg, md and sm entries.

    The input is expected to decode to a list of objects with a numeric
    ``height``. Each image is bucketed by height: above 800 is xl, above
    525 is lg, above 180 is md, anything else is sm. When several images
    fall into one bucket, the last one wins. Entries without a numeric
    height are skipped.

    The function name keeps its original spelling because callers refer
    to it by that name.

    Args:
        json_str: JSON text describing the image list. Missing values
            (``None``, NaN) and other non-text inputs are tolerated.

    Returns:
        A ``(xl, lg, md, sm)`` tuple of JSON strings. A bucket with no
        image is ``"{}"``; so are all four when the input cannot be
        decoded or does not decode to a list.
    """
    try:
        images = json.loads(json_str)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError (invalid JSON text);
        # TypeError covers non-text inputs such as None or a NaN float.
        return "{}", "{}", "{}", "{}"

    if not isinstance(images, list):
        # Valid JSON that is not an image list, e.g. "null".
        return "{}", "{}", "{}", "{}"

    # Separate dicts per bucket rather than one shared object.
    xl, lg, md, sm = {}, {}, {}, {}
    for img in images:
        height = img.get("height") if isinstance(img, dict) else None
        if not isinstance(height, (int, float)):
            # A null or missing height cannot be compared with ">".
            continue
        if height > 800:
            xl = img
        elif height > 525:
            lg = img
        elif height > 180:
            md = img
        else:
            sm = img
    return json.dumps(xl), json.dumps(lg), json.dumps(md), json.dumps(sm)

In [None]:
# Run categorrize_img_size on every artist's images value and unzip the
# resulting 4-tuples into one column per image size.
# NOTE(review): zip(*...) yields nothing for an empty frame, so the four-way
# unpack would fail — confirm artist_df is never empty at this point.
(
    artist_df["image_xl"],
    artist_df["image_lg"],
    artist_df["image_md"],
    artist_df["image_sm"],
) = zip(*artist_df.images.apply(categorrize_img_size))

In [None]:
# | hide
# Repair artists whose main / secondary genre was truncated: a value shorter
# than two characters is replaced by an entry of the ";;"-separated genres
# string (last entry for the main genre, first entry for the secondary one).
artist_df.drop(columns=["id"], inplace=True)

genre_columns = ["main_genre", "secondary_genre"]
artist_df[genre_columns] = artist_df[genre_columns].fillna("")


def _genre_or_fallback(row, column, position):
    """Return row[column], or the genres entry at `position` when it is shorter than 2 characters."""
    current = row[column]
    if len(current) < 2:
        return row["genres"].split(";;")[position]
    return current


artist_df["main_genre"] = artist_df.apply(
    _genre_or_fallback, axis=1, args=("main_genre", -1)
)

artist_df["secondary_genre"] = artist_df.apply(
    _genre_or_fallback, axis=1, args=("secondary_genre", 0)
)

artist_df.rename(columns={"name": "artist"}, inplace=True)

In [None]:
# | hide
# Assemble the final listening-history table by left-joining track_df and
# artist_df onto clean_history, then selecting the output columns explicitly.
complete_history = (
    # Drop these columns from the history before merging. "song", "album" and
    # "artist" are selected again in the final column list, so they are
    # expected to come from the merged tables ("song" and "artist" are created
    # by the renames of "name" on track_df and artist_df; "album" presumably
    # comes from track_df — confirm).
    clean_history.drop(
        [
            "song",
            "artist",
            "album",
            "episode_name",
            "episode_show_name",
            "spotify_episode_uri",
        ],
        axis=1,
    )
    # Attach track-level attributes; history rows without a match are kept.
    .merge(track_df, on="track_id", how="left")
    # Label the track-level image columns as album images.
    .rename(
        columns={
            "image_lg": "album_img_lg",
            "image_md": "album_img_md",
            "image_sm": "album_img_sm",
        }
    )
    # Attach main-artist attributes: artist columns get an "artist_" prefix,
    # the type/href/uri/external_url columns are discarded, and
    # main_artist_id is joined to artist_id.
    .merge(
        artist_df.rename(
            columns={
                "popularity": "artist_popularity",
                "images": "artist_images",
                "followers": "artist_followers",
                "image_xl": "artist_img_xl",
                "image_lg": "artist_img_lg",
                "image_md": "artist_img_md",
                "image_sm": "artist_img_sm",
            }
        ).drop(["type", "href", "uri", "external_url"], axis=1),
        left_on="main_artist_id",
        right_on="artist_id",
        how="left",
    )
    # Remove the right-hand join key, then pick the output columns in their
    # final order; any merged column not listed here is dropped.
    .drop(["artist_id"], axis=1)[
        [
            "ts",
            "song",
            "album",
            "artist",
            "main_genre",
            "secondary_genre",
            "track_id",
            "main_artist_id",
            "album_id",
            "album_release_date",
            "album_release_date_precision",
            "username",
            "platform",
            "ms_played",
            "conn_country",
            "ip_addr_decrypted",
            "user_agent_decrypted",
            "URI",
            "reason_start",
            "reason_end",
            "shuffle",
            "skipped",
            "offline",
            "offline_timestamp",
            "incognito_mode",
            "month",
            "year",
            "duration",
            "percent_played",
            "album_type",
            "album_uri",
            "album_external_url",
            "album_href",
            "album_images",
            "artist_ids",
            "danceability",
            "energy",
            "key",
            "loudness",
            "mode",
            "speechiness",
            "acousticness",
            "instrumentalness",
            "liveness",
            "valence",
            "tempo",
            "analysis_url",
            "time_signature",
            "duration_ms",
            "explicit",
            "song_popularity",
            "artist_followers",
            "genres",
            "artist_images",
            "artist_popularity",
            "album_release_year",
            "album_decade",
            "album_img_lg",
            "album_img_md",
            "album_img_sm",
            "artist_img_xl",
            "artist_img_lg",
            "artist_img_md",
            "artist_img_sm",
        ]
    ]
)

In [None]:
# | hide
# Persist the merged history and the cleaned track / artist tables.
for frame, pickle_path in (
    (complete_history, "data/complete_history.pkl"),
    (track_df, "data/track_df.pkl"),
    (artist_df, "data/artist_df.pkl"),
):
    save_pickle_data(frame, Path(pickle_path))

In [None]:
# | hide

In [None]:
# | hide

In [None]:
# | hide

In [None]:
# | hide