In [1]:
import json
import pandas as pd
from pandas import json_normalize
import matplotlib.pyplot as plt
import os
import random
import numpy as np

# 0. Leemos los datos

In [2]:
def read_json(path):
    with open(path) as f:
        data = json.load(f)
    return data

def leer_k_archivos_json(k):
    archivos_json = [archivo for archivo in os.listdir("data") if archivo.endswith('.json')]
    random.seed(42)
    paths = random.choices(archivos_json, k=k)

    all_playlists = []
    for p in paths:
        data = read_json(f"data\{p}")
        playlist = pd.DataFrame(data['playlists'])
        all_playlists.append(playlist)

    playlists = pd.concat(all_playlists, ignore_index=True)
    return playlists

In [4]:
paths = [
    "mpd.slice.0-999.json",
    "mpd.slice.1000-1999.json",
    "mpd.slice.2000-2999.json",
    "mpd.slice.3000-3999.json",
    "mpd.slice.4000-4999.json",
    "mpd.slice.5000-5999.json",
    "mpd.slice.6000-6999.json",
    "mpd.slice.7000-7999.json",
    "mpd.slice.8000-8999.json",
    "mpd.slice.9000-9999.json"
]
all_playlists = []
for path in paths:
    print(path)
    data = read_json(f"data/{path}")
    playlist = pd.DataFrame(data['playlists'])
    all_playlists.append(playlist)

playlists = pd.concat(all_playlists, ignore_index=True)

mpd.slice.0-999.json
mpd.slice.1000-1999.json
mpd.slice.2000-2999.json
mpd.slice.3000-3999.json
mpd.slice.4000-4999.json
mpd.slice.5000-5999.json
mpd.slice.6000-6999.json
mpd.slice.7000-7999.json
mpd.slice.8000-8999.json
mpd.slice.9000-9999.json


In [5]:
playlists.head()

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,Throwbacks,False,0,1493424000,52,47,1,"[{'pos': 0, 'artist_name': 'Missy Elliott', 't...",6,11532414,37,
1,Awesome Playlist,False,1,1506556800,39,23,1,"[{'pos': 0, 'artist_name': 'Survivor', 'track_...",5,11656470,21,
2,korean,False,2,1505692800,64,51,1,"[{'pos': 0, 'artist_name': 'Hoody', 'track_uri...",18,14039958,31,
3,mat,False,3,1501027200,126,107,1,"[{'pos': 0, 'artist_name': 'Camille Saint-Saën...",4,28926058,86,
4,90s,False,4,1401667200,17,16,2,"[{'pos': 0, 'artist_name': 'The Smashing Pumpk...",7,4335282,16,


In [6]:
def expandir_tracks(playli):
    expanded_tracks_df = pd.DataFrame()

    # Iterar sobre cada fila y expandir los datos JSON en un dataframe
    for _, row in playli.iterrows():
        # Cargar el JSON desde la columna 'tracks'
        tracks = row['tracks']

        # Comprobar si 'tracks' es una cadena que necesita ser convertida de JSON
        if isinstance(tracks, str):
            try:
                tracks = json.loads(tracks)  # Cargar la cadena JSON
            except json.JSONDecodeError:
                tracks = []  # En caso de error, usar una lista vacía
        elif not isinstance(tracks, list):
            tracks = []  # Asegurar que tracks sea una lista si no es una cadena

        if tracks:
            track_data = json_normalize(tracks)
            track_data['pid'] = row['pid']  # añadimos el id de la playlist para mantener la referencia
            expanded_tracks_df = pd.concat([expanded_tracks_df, track_data], ignore_index=True)
    return expanded_tracks_df

In [7]:
expanded_tracks_df = expandir_tracks(playlists)

In [8]:
playlists.drop("tracks", axis=1, inplace=True)
playlists.head()

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,num_edits,duration_ms,num_artists,description
0,Throwbacks,False,0,1493424000,52,47,1,6,11532414,37,
1,Awesome Playlist,False,1,1506556800,39,23,1,5,11656470,21,
2,korean,False,2,1505692800,64,51,1,18,14039958,31,
3,mat,False,3,1501027200,126,107,1,4,28926058,86,
4,90s,False,4,1401667200,17,16,2,7,4335282,16,


In [9]:
by_playlist = expanded_tracks_df.groupby('pid').agg(
    num_songs=('track_name', 'count'),  # Contamos el número de canciones por playlist
    avg_duration=('duration_ms', 'mean'),  # Calculamos la duración media de las canciones
    artists=('artist_name', lambda x: set(x)),  # Obtenemos un conjunto de artistas únicos
    songs =('track_uri', lambda x: set(x)),  # Obtenemos un conjunto de caciones únicas
    albums =('album_name', lambda x: set(x))  # Obtenemos un conjunto de albumes únicos
)

by_playlist['avg_duration'] = by_playlist['avg_duration'] / 60000 # Convertimos la duración de milisegundos a minutos
# by_playlist.head()

In [10]:
playlist_per_song = expanded_tracks_df.groupby('track_uri')['pid'].agg(list).reset_index()
playlist_per_song.head()

Unnamed: 0,track_uri,pid
0,spotify:track:000GjfnQc7ggBayDiy1sLW,[5925]
1,spotify:track:000VZqvXwT0YNqKk7iG2GS,"[2093, 5244]"
2,spotify:track:000mA0etY38nKdvf1N04af,[371]
3,spotify:track:000xQL6tZNLJzIrtIgxqSl,"[182, 813, 1011, 1028, 1289, 1511, 1627, 1775,..."
4,spotify:track:0010mZpCCwlPwoBiBsjoac,[6372]


- Pasamos los URL a un id de cancion

In [11]:
url_id = {}
n = 1

def key_exists(key, dictionary):
    try:
        value = dictionary[key]
        return True
    except KeyError:
        return False

def get_id(x, n):
    if not key_exists(x, url_id):
        url_id[x] = n
        n += 1
    return url_id[x], n

values = []

for val in playlist_per_song["track_uri"]:
    valu, n = get_id(val, n)
    values.append(valu)

playlist_per_song["track_id"] = values
playlist_per_song.head()

Unnamed: 0,track_uri,pid,track_id
0,spotify:track:000GjfnQc7ggBayDiy1sLW,[5925],1
1,spotify:track:000VZqvXwT0YNqKk7iG2GS,"[2093, 5244]",2
2,spotify:track:000mA0etY38nKdvf1N04af,[371],3
3,spotify:track:000xQL6tZNLJzIrtIgxqSl,"[182, 813, 1011, 1028, 1289, 1511, 1627, 1775,...",4
4,spotify:track:0010mZpCCwlPwoBiBsjoac,[6372],5


In [12]:
expanded_tracks_df["track_id"] = expanded_tracks_df["track_uri"].apply(lambda x: url_id[x])

In [13]:
by_playlist["track_id"] = by_playlist["songs"].apply(lambda x: [url_id[val] for val in list(x)])
# by_playlist.head()

In [14]:
playlist_per_song['n_playlists'] = playlist_per_song['pid'].apply(lambda x: len(set(x)))

In [15]:
playlist_per_song["n_playlists"].mean()

3.857698028679103

# 2. Baseline

In [16]:
#Hay playlist que tienen la misma cancion más de una vez, aca las tratamos solo una vez
playlist_track = expanded_tracks_df[["pid","track_id","track_name","duration_ms","artist_name"]].drop_duplicates()
# playlist_track.head()

- Separamos en data de validación y data de entrenamiento

In [17]:
# Acá separamos en las playlists con un número de canciones mayor a NUM_CANCIONES
# Luego nos quedamos solo con las playlist con un número mayor para hacer el entrenamiento y el testeo
NUM_CANCIONES = 10
song_per_playlist = playlist_track.groupby('pid')['track_id'].agg(list).reset_index()
song_per_playlist['n_songs'] = song_per_playlist['track_id'].apply(len)
pid_less = song_per_playlist[song_per_playlist['n_songs'] < NUM_CANCIONES]['pid'].unique()
pid_more = song_per_playlist[song_per_playlist['n_songs'] >= NUM_CANCIONES]['pid'].unique()
print(len(pid_less), len(pid_more))

326 9674


In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Acá hacemos una separación de las playlists que le vamos a entregar la información completa,
# y las que le vamos a entregar solo las primeras n

train_pid, test_pid = train_test_split(pid_more, test_size=0.2, random_state=42)
print(len(train_pid), len(test_pid))

7739 1935


In [20]:
song_per_playlist.drop("n_songs", axis=1, inplace=True)
# song_per_playlist.head(3)

In [21]:
def Sep_first_n(n, pids):
    #Primero separamos las primeras n canciones de una playlist y las siguientes
    all = song_per_playlist[song_per_playlist['pid'].isin(pids)].copy()
    all.loc[:, 'fist_5'] = all['track_id'].apply(lambda x: x[:n])
    all.loc[:, 'next']= all['track_id'].apply(lambda x: x[n:])
    all.drop("track_id", axis=1, inplace=True)
    #Luego utilizamos las primeras n canciones para entrenar y las siguientes para testear
    train = all[['pid', 'fist_5']].explode('fist_5')
    test = all[['pid', 'next']].explode('next')
    train.rename(columns={'fist_5': 'track_id'}, inplace=True)
    test.rename(columns={'next': 'track_id'}, inplace=True)
    return train, test

In [22]:
# Acá separamos para las playlist que están en test sus primeras PRIMERAS_N canciones para train y el resto para train
# Hay que asegurarse que PRIMERAS_N << NUM_CANCIONES

PRIMERAS_N = 5
test_train, test_test = Sep_first_n(PRIMERAS_N, test_pid)

# clear_outputs()

In [23]:
# Luego acá creamos los datasets de train y test

data = playlist_track[['pid', 'track_id']]
train_data = data[data['pid'].isin(train_pid)]                       # Playlists con todas las canciones
train_data = pd.concat([train_data, test_train], ignore_index=True)  # Playlists con solo las PRIMERAS_N canciones
test_data = test_test


In [24]:
# Pequeño test para revisar cuantas canciones el modelo no se ha entrenado con, lo cual implica que no las va a poder recomendar
# Este número debería bajar si incluimos más data en el comienzo

test_tracks_ids = test_data['track_id'].unique()
train_tracks_ids = train_data['track_id'].unique()
tracks_not_in_train = np.setdiff1d(test_tracks_ids, train_tracks_ids)
tracks_not_in_train, len(tracks_not_in_train) / len(test_tracks_ids)


(array([10, 11, 16, ..., 170083, 170085, 170086], dtype=object),
 0.3839444354708448)

In [25]:
def R_precision(recommended, actual):
    n = len(actual)
    recommended = recommended[:n]
    in_both = np.intersect1d(recommended, actual)
    return len(in_both) / n

def dcg(relevance_scores):
    return relevance_scores[0] + np.sum(
        [rel / np.log2(idx + 1) for idx, rel in enumerate(relevance_scores[1:], start=2)]
    )

def ndcg(recommended, actual):
    """Calculate Normalized Discounted Cumulative Gain (NDCG) at k"""
    # Relevance scores: 1 if the item is in the ground truth, 0 otherwise
    relevance_scores = [1 if item in actual else 0 for item in recommended]

    # Calculate DCG for the recommended list
    DCG = dcg(relevance_scores)

    # Calculate IDCG for the ideal list (the best possible ranking)
    ideal_relevance_scores = sorted(relevance_scores, reverse=True)
    IDCG = dcg(ideal_relevance_scores)

    # Calculate NDCG
    NDCG = DCG / IDCG if IDCG > 0 else 0
    return NDCG

def rec_song_clicks(recommended, actual):
    for i, item in enumerate(recommended):
        if item in actual:
            clicks = np.floor(i / 10)
            return clicks
    return 51

# Transformer


In [26]:
from torch.utils.data import DataLoader, Subset, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F

from IPython.display import clear_output, display
import os


In [27]:
data = playlist_track[['pid', 'track_id']]
vocab_size = data['track_id'].nunique() + 2

In [28]:
np.max(data['track_id'].unique()), vocab_size

(170089, 170091)

In [29]:
song_per_playlist = playlist_track.groupby('pid')['track_id'].agg(list).reset_index()
song_per_playlist = song_per_playlist[song_per_playlist['pid'].isin(train_pid)]
data_all = song_per_playlist['track_id'].tolist()
random.shuffle(data_all)
data_train, data_val = train_test_split(data_all, test_size=0.2, random_state=42)

prediction_tool = test_train.groupby('pid')['track_id'].agg(list).reset_index()
prediction_actual = test_test.groupby('pid')['track_id'].agg(list).reset_index()


In [30]:
# Calculate lengths of each sublist
lengths = [len(sublist) for sublist in data_train]

# Calculate average length
average_length = np.mean(lengths)

# Calculate median length
median_length = np.median(lengths)

# Calculate standard deviation of lengths
std_dev_length = np.std(lengths)

# Calculate minimum and maximum length
min_length = np.min(lengths)
max_length = np.max(lengths)

print(f"Average Length: {average_length}")
print(f"Median Length: {median_length}")
print(f"Standard Deviation of Lengths: {std_dev_length}")
print(f"Minimum Length: {min_length}")
print(f"Maximum Length: {max_length}")

Average Length: 66.8974317557745
Median Length: 50.0
Standard Deviation of Lengths: 51.80512461821629
Minimum Length: 10
Maximum Length: 250


In [31]:
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 50 # what is the maximum context length for predictions?
max_iters = 1000
eval_interval = 100
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 5
n_embd = 100
n_head = 6
n_layer = 3
dropout = 0.2
# ------------

In [32]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

class SequenceDataset(Dataset):
    def __init__(self, sequences, block_size):
        self.sequences = sequences
        self.block_size = block_size

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        if len(sequence) > self.block_size:
            # Truncate sequence
            sequence = sequence[:self.block_size]
        else:
            # Pad sequence with zeros
            sequence = sequence + [0] * (self.block_size - len(sequence))
        return torch.tensor(sequence, dtype=torch.long)


# Create dataset
dataset = SequenceDataset(data_train, block_size)

# DataLoader
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

dataset_test = SequenceDataset(data_val, block_size)
dataloader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

# Iterate through DataLoader
for batch in dataloader:
    print("Batch:")
    print(batch.size())
    break

for batch in dataloader_test:
    print("Batch:")
    print(batch.size())
    break


Batch:
torch.Size([64, 50])
Batch:
torch.Size([64, 50])


### Model:

In [33]:
# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class GPTLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # better init, not covered in the original GPT video, but important, will cover in followup video
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            # print(probs.size())
            probs_no_0 = probs[:, 1:]
            # print(probs_no_0.size())
            idx_next = torch.multinomial(probs_no_0, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

### Training

In [39]:
model = GPTLanguageModel()
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

N_EPOCH = 10
for epoch in range(N_EPOCH):
    loss_per_epoch = 0
    loss_per_epoch_test = 0
    for i, batch in enumerate(dataloader):
        # sample a batch of data'
        max_len = batch.shape[1]
        max_len = min(max_len, block_size) - 2
        xb = batch[:, :max_len]
        yb = batch[:, 1:max_len+1]
        # print(xb.shape, yb.shape)
        xb, yb = xb.to(device), yb.to(device)

        logits, loss = model(xb, yb)

        loss_per_epoch += loss.item()

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        # if i % (len(dataloader) // 3) == 0:
            # print(f'{i}: i')

    with torch.no_grad():
        model.eval()
        for batch in dataloader_test:
            max_len = batch.shape[1]
            max_len = min(max_len, block_size) - 2
            xb = batch[:, :max_len]
            yb = batch[:, 1:max_len+1]
            xb, yb = xb.to(device), yb.to(device)

            logits, loss = model(xb, yb)
            loss_per_epoch_test += loss.item()
    model.train()

    print(f"Epoch {epoch+1}, loss_train: {loss_per_epoch/len(dataloader)}")
    print(f"Epoch {epoch+1}, loss_val: {loss_per_epoch_test/len(dataloader_test)}")

RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.

In [35]:
context = torch.randint(vocab_size, (1, 1), dtype=torch.long, device=device)
print(context.size())
canciones = [1, 32, 45, 1, 30, 200, 1500]
context = torch.tensor(canciones, dtype=torch.long, device=device).view(1, -1)
print(context.size())

print(model.generate(context, max_new_tokens=5)[0].tolist())


torch.Size([1, 1])
torch.Size([1, 7])
[1, 32, 45, 1, 30, 200, 1500, 74953, 116843, 92327, 144998, 102885]


In [None]:
# prediction_tool = test_train.groupby('pid')['track_id'].agg(list).reset_index()
# prediction_actual = test_test.groupby('pid')['track_id'].agg(list).reset_index()

In [37]:
prediction_tool

Unnamed: 0,pid,track_id
0,0,"[10960, 137485, 11769, 25800, 38892]"
1,3,"[98842, 29506, 147749, 40702, 119495]"
2,8,"[57157, 30096, 122304, 55565, 52690]"
3,10,"[997, 78063, 3290, 70866, 95808]"
4,13,"[21813, 65510, 34644, 79425, 53186]"
...,...,...
1930,9974,"[8801, 125869, 169685, 144891, 42292]"
1931,9989,"[26970, 16818, 19679, 129511, 129223]"
1932,9993,"[149867, 62762, 63923, 102319, 33287]"
1933,9996,"[169882, 85093, 66888, 46306, 147740]"


In [65]:
user_ids = test_pid
print(len(user_ids))
print(train_data['track_id'].nunique())
r_prec = []
ndcgs = []
clicks = []

for i, userid in enumerate(user_ids[:50]):
  print(f"User {i+1}/{len(user_ids)}")
  canciones = prediction_tool[prediction_tool['pid'] == userid]['track_id'].values[0]
  context = torch.tensor(canciones, dtype=torch.long, device=device).view(1, -1)

  ids = model.generate(context, max_new_tokens=500)[0].tolist()[5:]
  actual = test_test[test_test['pid'] == userid]['track_id'].values

  r_prec.append(R_precision(ids, actual))
  ndcgs.append(ndcg(ids, actual))
  clicks.append(rec_song_clicks(ids, actual))


print(f"R prec: {np.mean(r_prec)}, NDCG: {np.mean(ndcgs)}, Clicks: {np.mean(clicks)}")

1935
148285
User 1/1935
User 2/1935
User 3/1935
User 4/1935
User 5/1935
User 6/1935
User 7/1935
User 8/1935
User 9/1935
User 10/1935
User 11/1935
User 12/1935
User 13/1935
User 14/1935
User 15/1935
User 16/1935
User 17/1935
User 18/1935
User 19/1935
User 20/1935
User 21/1935
User 22/1935
User 23/1935
User 24/1935
User 25/1935
User 26/1935
User 27/1935
User 28/1935
User 29/1935
User 30/1935
User 31/1935
User 32/1935
User 33/1935
User 34/1935
User 35/1935
User 36/1935
User 37/1935
User 38/1935
User 39/1935
User 40/1935
User 41/1935
User 42/1935
User 43/1935
User 44/1935
User 45/1935
User 46/1935
User 47/1935
User 48/1935
User 49/1935
User 50/1935
R prec: 0.0007669248195563985, NDCG: 0.029959757752559588, Clicks: 44.94


- Playlist en base a 5 canciones que les gusten

In [66]:
pares = expanded_tracks_df[["artist_name","track_name","track_uri","track_id"]].drop_duplicates().sort_values("track_id")
pares.head()

Unnamed: 0,artist_name,track_name,track_uri,track_id
397687,El Poder De Zacatecas,Abeja Miope,spotify:track:000GjfnQc7ggBayDiy1sLW,1
140100,The Ghost Inside,Mercy,spotify:track:000VZqvXwT0YNqKk7iG2GS,2
23657,The Coronas,If I Gave Myself To Someone Else,spotify:track:000mA0etY38nKdvf1N04af,3
11848,ZAYN,Still Got Time,spotify:track:000xQL6tZNLJzIrtIgxqSl,4
427744,Bombay Bicycle Club,It's Alright Now,spotify:track:0010mZpCCwlPwoBiBsjoac,5


In [59]:
pares[pares["artist_name"]=="Post Malone"]

Unnamed: 0,artist_name,track_name,track_uri,track_id
514,Post Malone,Big Lie,spotify:track:02opp1cycqiFNDpLd2o1J3,997
9394,Post Malone,Deja Vu,spotify:track:0H8XeaJunhvpBdBFIYi6Sh,6252
4089,Post Malone,"Yours Truly, Austin Post",spotify:track:0LpiKjWMfZTkPPHonlM8nB,7912
54906,Post Malone,Cold,spotify:track:1QWmKmqhv5zcsS3v45FNl0,31385
11784,Post Malone,Money Made Me Do It,spotify:track:1ysAvOdJgUjc6CqOQxepaz,43321
22354,Post Malone,Congratulations - Remix,spotify:track:2YK01AIIWuywG24gsbuizE,55859
4090,Post Malone,Up There,spotify:track:2rKmNEYrQxaOPZrOWKZpOc,62647
31624,Post Malone,Feeling Whitney,spotify:track:35r28RDot7nPE7y9K9H7l0,67662
3042,Post Malone,Congratulations,spotify:track:3a1lNhkSLSkpJE4MSHpDu9,78292
87535,Post Malone,Too Young,spotify:track:3vdKpSTx6Q1XJp3khDoMkz,85914


In [53]:
trackname_id = {(row['artist_name'], row['track_name']): row['track_id'] for _, row in pares.iterrows()}
id_trackname = {row['track_id']: (row['artist_name'], row['track_name']) for _, row in pares.iterrows()}

In [67]:
selecting = True
songs = 0

songs_ids= []
while selecting:
    
    artist = input("Ingrese el nombre del artista: ")
    song = input("Ingrese el nombre de la canción: ")

    if (artist, song) in trackname_id:
        print("La canción está en el dataset")
        songs_ids.append(trackname_id[(artist, song)])
        songs += 1
    else:
        print("La canción no está en el dataset")
    if songs == 5:
        selecting = False

La canción no está en el dataset
La canción no está en el dataset
La canción no está en el dataset
La canción no está en el dataset
La canción no está en el dataset
La canción no está en el dataset
La canción no está en el dataset
La canción está en el dataset
La canción no está en el dataset
La canción está en el dataset
La canción está en el dataset
La canción está en el dataset
La canción está en el dataset


In [68]:
context = torch.tensor(songs_ids, dtype=torch.long, device=device).view(1, -1)

ids = model.generate(context, max_new_tokens=500)[0].tolist()[5:]

In [69]:
[id_trackname[x] for x in ids][:15]

[('Moein', 'Reng (Raghs)'),
 ('MØ', 'Never Wanna Know'),
 ('Chelsea Jade', 'Low Brow'),
 ('DJ Masterhouse', 'Night Without Sleep - Radio Edit'),
 ('Air', "La femme d'argent"),
 ('Cardiff Brothers', "I've Been Thinkin"),
 ('Irma Thomas', 'Straight From The Heart'),
 ('The Vryll Society', 'Coshh'),
 ('Jeff Williams', 'Not Fall in Love with You'),
 ('Thomas Bergersen', 'Children of the Sun (feat. Merethe Soltvedt)'),
 ('Dylan Gardner', 'Sign Language'),
 ('Steve Aoki', 'Freak - feat. Steve Bays'),
 ('Classixx', 'Hanging Gardens'),
 ('Chingo Bling', 'Classic Man (feat. Baby Bash)'),
 ('T.I.', 'Bankhead (feat. P$C & Young Dro)')]