In [1]:
import numpy as np
import pandas as pd
import pyarrow as pa
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Global configuration for the notebook.
#
# Seed NumPy's legacy global random state. SciPy and pandas fall back to this
# global state when no explicit `random_state` is passed, so a single seed
# covers them as well (scikit-learn estimators still need `random_state=`).
# The previous `pd.core.common.random_state(RND_SEED)` call only built and
# discarded a RandomState object through a private pandas module; it seeded
# nothing, so it has been removed.
RND_SEED: int = 12345
np.random.seed(RND_SEED)

# Resolution (in pixels) used when exporting figures to image files
WIDTH: int = 1366
HEIGHT: int = 768

In [3]:
# Divisor applied to each raw count column so that the stored values are
# expressed in thousands (1e3), millions (1e6) or billions (1e9).
COLUMN_SCALES: dict[str, float] = {
    "Spotify Streams": 1e6,
    "Spotify Playlist Count": 1e3,
    "Spotify Playlist Reach": 1e6,
    "YouTube Views": 1e6,
    "YouTube Likes": 1e3,
    "YouTube Playlist Reach": 1e6,
    "TikTok Posts": 1e3,
    "TikTok Likes": 1e6,
    "TikTok Views": 1e9,
    "AirPlay Spins": 1e3,
    "SiriusXM Spins": 1e3,
    "Deezer Playlist Reach": 1e6,
    "Pandora Streams": 1e6,
    "Pandora Track Stations": 1e3,
    "Soundcloud Streams": 1e6,
    "Shazam Counts": 1e6,
}


def _to_scaled_number(raw: pd.Series, divisor: float) -> pd.Series:
    """Convert a column of comma-grouped number strings to scaled floats.

    Parameters
    ----------
    raw : pd.Series
        Column as read from the CSV; values are stringified before parsing,
        so both text such as "1,234,567" and already-numeric values work.
    divisor : float
        Value every parsed number is divided by (e.g. 1e6 for millions).

    Returns
    -------
    pd.Series
        Float series aligned with `raw`; anything that cannot be parsed as a
        number (including missing values) becomes NaN.

    Notes
    -----
    The earlier implementation accepted a value only if it consisted purely
    of digits after removing commas, which silently turned values such as
    "684.0" (a column pandas had already parsed as float) into NaN.
    `pd.to_numeric(..., errors="coerce")` keeps those values.
    """
    without_separators = raw.astype(str).str.replace(",", "", regex=False)
    return pd.to_numeric(without_separators, errors="coerce") / divisor


# Load the 2024 dataset (first CSV column is the index) and rescale the
# count columns in one pass instead of one hand-written statement per column.
df = pd.read_csv("./../../data/Combined-2024.csv", encoding="utf-8", index_col=[0])
for _column, _divisor in COLUMN_SCALES.items():
    df[_column] = _to_scaled_number(df[_column], _divisor)
del _column, _divisor

# NOTE(review): `test` is not read by any visible cell; kept only in case a
# cell outside this view relies on it — candidate for removal.
test = df["Shazam Counts"].dropna()

In [4]:
# Test for track repetition (currently disabled)
# rep_test = df.sort_values(by="track_name")
# rep_i: list[int] = []
# for i in range(1, rep_test.shape[0]):
#     if rep_test.iloc[i]["track_name"] == rep_test.iloc[i - 1]["track_name"]:
#         rep_i.append(i - 1)
#         rep_i.append(i)
# del i
# rep_test = rep_test.iloc[rep_i]
# del rep_test, rep_i

In [5]:
# key_list: list[str] = sorted(df["key"].dropna().unique().tolist())
# df = pd.get_dummies(df, columns=["key", "mode"], dummy_na=True, dtype="boolean[pyarrow]")
# df.loc[df["key_nan"], df.columns.str.startswith("key_")] = pd.NA
# df.loc[df["mode_nan"], df.columns.str.startswith("mode_")] = pd.NA
# df.drop(columns=["key_nan", "mode_nan"], inplace=True)

In [11]:
# Kendall correlation heatmap for the columns listed in `cols`.
#
# Disabled alternative: correlate every column except the identifier-like ones.
# not_include: set[str] = set(["Track", "Album Name", "Artist", "Release Date", "ISRC", "All Time Rank", "TIDAL Popularity"])
# cols = list(filter(lambda x: x not in not_include, df.columns))
cols: list[str] = ["Spotify Streams", "SiriusXM Spins", "Pandora Streams", "AirPlay Spins"]

# Pairwise Kendall correlation of the selected columns, drawn with the
# coefficient printed in every cell.
corr_mat = df[cols].corr(method="kendall")
fig = px.imshow(corr_mat, text_auto=True)
fig.update_layout(
    title={
        "text": "Kendall Correlation of 2024 Features: Radio Platforms",
        "font": {"size": 24},
    },
)
fig.show()

# Export the figure at the resolution configured in WIDTH / HEIGHT
fig.write_image("./../../images/2024/Correlation-Radio.png", width=WIDTH, height=HEIGHT, scale=1.0)

In [7]:
# Kendall correlation heatmap for the stream / release-date / playlist / chart
# columns listed in `cols`.
#
# NOTE(review): the title and output path refer to 2023, while the only `df`
# loaded in the visible cells comes from Combined-2024.csv — confirm that `df`
# actually contains these columns when this cell runs on a fresh kernel.
cols: list[str] = [
    "streams",
    "artist_count",
    "released_year",
    "released_month",
    "released_day",
    "in_spotify_playlists",
    "in_spotify_charts",
    "in_apple_playlists",
    "in_apple_charts",
    "in_deezer_playlists",
    "in_deezer_charts",
    "in_shazam_charts",
]

# Pairwise Kendall correlation, drawn with the coefficient printed per cell.
corr_mat = df[cols].corr(method="kendall")
fig = px.imshow(corr_mat, text_auto=True)
fig.update_layout(
    title={
        "text": "Kendall Correlation of 2023 Features: Playlists, Charts",
        "font": {"size": 24},
    },
)
fig.show()

# Export the figure at the resolution configured in WIDTH / HEIGHT
fig.write_image("./../../images/2023/Correlation-Playlist.png", width=WIDTH, height=HEIGHT, scale=1.0)

In [8]:
# Kendall correlation heatmap for the stream / release-date / audio-property
# columns listed in `cols`, plus every column whose name starts with "key_"
# or "mode_".
#
# NOTE(review): the title and output path refer to 2023, while the only `df`
# loaded in the visible cells comes from Combined-2024.csv. The only visible
# code that would create "key_*" / "mode_*" dummy columns is commented out,
# so those two selections may be empty — confirm on a fresh kernel.
cols: list[str] = [
    "streams",
    "artist_count",
    "released_year",
    "released_month",
    "released_day",
    "bpm",
    "danceability_%",
    "valence_%",
    "energy_%",
    "acousticness_%",
    "instrumentalness_%",
    "liveness_%",
    "speechiness_%",
    *df.columns[df.columns.str.startswith("key_")],
    *df.columns[df.columns.str.startswith("mode_")],
]

# Pairwise Kendall correlation, drawn with the coefficient printed per cell.
corr_mat = df[cols].corr(method="kendall")
fig = px.imshow(corr_mat, text_auto=True)
fig.update_layout(
    title={
        "text": "Kendall Correlation of 2023 Features: Track Properties",
        "font": {"size": 24},
    },
)
fig.show()

# Export the figure at the resolution configured in WIDTH / HEIGHT
fig.write_image("./../../images/2023/Correlation-Properties.png", width=WIDTH, height=HEIGHT, scale=1.0)