In [7]:
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer

In [8]:
# Global configuration: reproducibility seed, compute device, figure size.
# sklearn is not given an explicit seed here.
RND_SEED: int = 12345

# Seed NumPy's global RNG (also used by scipy). pandas draws from this same
# global state whenever `random_state` is omitted, so no pandas-specific call
# is needed; the former `pd.core.common.random_state(RND_SEED)` only built a
# throw-away RandomState object and seeded nothing.
np.random.seed(RND_SEED)

# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA.
DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(DEVICE)
torch.manual_seed(RND_SEED)

# Resolution for graph images
WIDTH: int = 1366
HEIGHT: int = 768

In [9]:
# Load the combined 2023 track table; the first CSV column becomes the index.
df = pd.read_csv("./../data/Combined-2023.csv", encoding="utf-8", index_col=[0])
df["track_name"] = df["track_name"].astype("string")

# Stream counts are divided by 1e6; any entry that is not a plain digit
# string (including missing values) becomes NaN.
df["streams"] = df["streams"].astype(str).apply(lambda x: float(x) / 1e6 if x.isdigit() else np.nan)

# Deezer playlist counts are divided by 1000 after removing "," separators.
df["in_deezer_playlists"] = df["in_deezer_playlists"].astype(str).apply(lambda x: float(x.replace(",", "")) / 1000)

# Shazam chart counts: the "," separators must be removed BEFORE the digit
# check. Previously the check ran first, so values such as "1,021" failed
# `isdigit()` and were silently turned into <NA>.
df["in_shazam_charts"] = (
    df["in_shazam_charts"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .apply(lambda x: int(x) if x.isdigit() else pd.NA)
    .astype("Int64")
)

df["key"] = df["key"].astype("category")
df["mode"] = df["mode"].astype("category")

In [10]:
# Load the LaBSE sentence-embedding model on the GPU when one is available,
# otherwise on the CPU so the notebook still runs without CUDA.
model = SentenceTransformer(
    "sentence-transformers/LaBSE",
    device="cuda" if torch.cuda.is_available() else "cpu",
)



In [5]:
# Embed every track name and write one embedding row per track.
# encode() is documented to take a list of strings, so the column is converted
# to a plain list: integer lookups on a Series indexed by the CSV's first
# column are label-based and may not match row positions.
track_names = df["track_name"].tolist()
res = pd.DataFrame(model.encode(track_names))
res.to_csv("./../data/WordEmb-Name-2023.csv", encoding="utf-8", index=False)

In [12]:
# Embed a fixed list of reference words and save the vectors, one row per
# word, with the word itself as the row label.
word_list: list[str] = [
    "love",
    "hurt",
    "goodbye",
    "friend",
    "hopeless",
    "memory",
    "play",
    "sad",
]
res = pd.DataFrame(model.encode(word_list), index=word_list)
res.to_csv("./../data/WordList.csv", encoding="utf-8")