In [1]:
import numpy as np
import pandas as pd
import scipy.spatial as spatial

In [2]:
# Fixed seed so that any stochastic step in this notebook is reproducible.
RND_SEED: int = 12345
# Seed NumPy's global RNG. The former call to the private helper
# ``pd.core.common.random_state(RND_SEED)`` merely built a RandomState object
# that was immediately discarded, so it seeded nothing and has been removed.
np.random.seed(RND_SEED)

# Resolution for graph images
WIDTH: int = 1366
HEIGHT: int = 768

In [3]:
def _to_int_or_na(raw: str):
    """Parse a count such as ``"1,021"`` into an int, or return ``pd.NA``.

    Thousands separators are stripped *before* the digit check. Previously the
    check ran on the raw text, so every value containing a comma was turned
    into ``pd.NA`` and the comma-stripping never had any effect.
    """
    digits = raw.replace(",", "")
    return int(digits) if digits.isdigit() else pd.NA


# Main table; the first CSV column is used as the index.
df = pd.read_csv("./../../data/Combined-2023.csv", encoding="utf-8", index_col=[0])
# Streams are rescaled to millions; anything that is not a plain digit string becomes NaN.
df["streams"] = df["streams"].astype(str).apply(lambda x: float(x) / 1e6 if x.isdigit() else np.nan)
# Playlist counts are rescaled to thousands.
df["in_spotify_playlists"] = df["in_spotify_playlists"] / 1000
# Deezer counts are parsed from text with "," separators removed.
df["in_deezer_playlists"] = df["in_deezer_playlists"].astype(str).apply(lambda x: float(x.replace(",", "")) / 1000)
# Nullable integer dtype so that unparseable entries can stay missing.
df["in_shazam_charts"] = df["in_shazam_charts"].astype(str).apply(_to_int_or_na).astype("Int64")
df["key"] = df["key"].astype("category")
df["mode"] = df["mode"].astype("category")

# Embedding tables: vector components live in the columns from "0" onwards.
emb_df = pd.read_csv("./../../data/WordEmb-Name-2023.csv", encoding="utf-8")
word_df = pd.read_csv("./../../data/WordList.csv", encoding="utf-8")

In [4]:
# Cosine distance between every row of emb_df and every reference word in
# word_df. In both frames the vector components are the columns from "0" on.
word_matrix = word_df.loc[:, "0":].to_numpy(dtype=np.float32)
emb_matrix = emb_df.loc[:, "0":].to_numpy(dtype=np.float32)

# One vectorized call replaces the nested Python loop, which re-extracted and
# re-converted every embedding row once per word. cdist works in float64, so
# results may differ from the per-pair version in the last few digits.
# Shape: (rows of emb_df, rows of word_df).
distances = spatial.distance.cdist(emb_matrix, word_matrix, metric="cosine")

# One column per word, keyed by the label stored in "Unnamed: 0"
# (a repeated label keeps the last occurrence, as with plain dict assignment).
dct: dict[str, np.ndarray] = {
    col_name: distances[:, k] for k, col_name in enumerate(word_df["Unnamed: 0"])
}
del word_matrix, emb_matrix, distances

# NOTE(review): concat(axis=1) aligns on index labels and the new frame has a
# default 0..n-1 index - presumably df's index matches that; confirm.
df = pd.concat([df, pd.DataFrame(dct)], axis=1)

In [5]:
# Write the table, including its index, to the output CSV as UTF-8.
df.to_csv(path_or_buf="./../../data/Meaning-2023.csv", encoding="utf-8")