In [1]:
import numpy as np
import pandas as pd
import pyarrow as pa
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Set random seed for everything (except sklearn)
RND_SEED: int = 12345
# Seeds NumPy's legacy global RNG. SciPy and pandas draw from this same global
# state whenever no explicit `random_state` is passed to them, so this single
# call covers all three. (The former `pd.core.common.random_state(RND_SEED)`
# call only built and discarded a RandomState object; it seeded nothing.)
np.random.seed(RND_SEED)

# Resolution for graph images
WIDTH: int = 1366
HEIGHT: int = 768

In [3]:
def _parse_chart_count(raw: str):
    """Parse a textual count such as ``"826"`` or ``"1,021"`` into an ``int``.

    Returns ``pd.NA`` when the text is not a finite number (this includes the
    ``"nan"`` that ``astype(str)`` produces for missing cells).
    """
    try:
        # Strip thousands separators BEFORE validating: checking `isdigit()` on
        # the raw text rejects every value that contains a comma.
        return int(float(raw.replace(",", "")))
    except (ValueError, OverflowError):
        # ValueError: non-numeric text or NaN; OverflowError: infinity.
        return pd.NA


df = pd.read_csv("./../../data/Combined-2023.csv", encoding="utf-8", index_col=[0])
# Stream counts, rescaled to millions; anything that is not a plain digit string becomes NaN.
df["streams"] = df["streams"].astype(str).apply(lambda x: float(x) / 1e6 if x.isdigit() else np.nan)
# Deezer playlist counts, rescaled to thousands; thousands separators are removed first.
df["in_deezer_playlists"] = df["in_deezer_playlists"].astype(str).apply(lambda x: float(x.replace(",", "")) / 1000)
# Shazam chart counts as nullable integers; unparseable cells become <NA>.
df["in_shazam_charts"] = df["in_shazam_charts"].astype(str).apply(_parse_chart_count).astype("Int64")
df["key"] = df["key"].astype("category")
df["mode"] = df["mode"].astype("category")

In [4]:
# Test for track repetition: collect every row whose track_name occurs more than once.
rep_test = df.sort_values(by="track_name")
rep_names = rep_test["track_name"]
# keep=False flags every member of a repeated group exactly once, so runs of
# three or more equal names no longer yield duplicated rows. notna() keeps
# missing names from matching each other, as the `==` comparison did.
rep_mask = rep_names.notna() & rep_names.duplicated(keep=False)
# Positional boolean selection, so a non-unique index cannot break alignment.
rep_test = rep_test[rep_mask.to_numpy()]
# NOTE(review): the result is discarded without being displayed; inspect
# `rep_test` before this cleanup if the repeated tracks need to be reviewed.
del rep_test, rep_names, rep_mask

In [5]:
# Distinct non-missing keys, captured before the column is one-hot encoded.
key_list: list[str] = sorted(df["key"].dropna().unique().tolist())

# One-hot encode both categoricals. `dummy_na=True` adds the `key_nan` and
# `mode_nan` indicator columns used below.
df = pd.get_dummies(
    df,
    columns=["key", "mode"],
    drop_first=True,
    dummy_na=True,
    dtype="boolean[pyarrow]",
)
for prefix in ("key", "mode"):
    # Where the source value was missing, blank out every dummy of that
    # variable instead of leaving them as False.
    df.loc[df[f"{prefix}_nan"], df.columns.str.startswith(f"{prefix}_")] = pd.NA
del prefix
# The indicator columns have served their purpose.
df = df.drop(columns=["key_nan", "mode_nan"])

In [6]:
# Columns kept for the analysis: the numeric features listed here, followed by
# every `key_*` dummy and then every `mode_*` dummy.
cols: list[str] = [
    "artist_count",
    "released_year",
    "released_month",
    "released_day",
    "in_spotify_playlists",
    "in_spotify_charts",
    "streams",
    "in_apple_playlists",
    "in_apple_charts",
    "in_deezer_playlists",
    "in_deezer_charts",
    "in_shazam_charts",
    "bpm",
    "danceability_%",
    "valence_%",
    "energy_%",
    "acousticness_%",
    "instrumentalness_%",
    "liveness_%",
    "speechiness_%",
]
cols.extend(df.columns[df.columns.str.startswith("key_")])
cols.extend(df.columns[df.columns.str.startswith("mode_")])
df = df.loc[:, cols]

In [8]:
# Pairwise Kendall rank correlation between all remaining columns.
corr_mat = df.corr(method="kendall")
# Heatmap of the matrix with each coefficient printed in its cell.
fig = px.imshow(corr_mat, text_auto=True)
fig.show()
# Save images
# NOTE(review): the file name says "SpeCorr" while the method is "kendall" --
# confirm which one is intended.
fig.write_image(
    "./../../images/2023/Heatmap-SpeCorr.png",
    width=WIDTH,
    height=HEIGHT,
    scale=1.0,
)