In [1]:
import json
import pandas as pd
from pandas import json_normalize
import matplotlib.pyplot as plt
import os
import random
import numpy as np

# 0. Leemos los datos

In [2]:
def read_json(path):
    with open(path) as f:
        data = json.load(f)
    return data

def leer_k_archivos_json(k):
    archivos_json = [archivo for archivo in os.listdir("data") if archivo.endswith('.json')]
    random.seed(42)
    paths = random.choices(archivos_json, k=k)

    all_playlists = []
    for p in paths:
        data = read_json(f"data\{p}")
        playlist = pd.DataFrame(data['playlists'])
        all_playlists.append(playlist)

    playlists = pd.concat(all_playlists, ignore_index=True)
    return playlists

In [3]:
# Leer 15 archivos json osea 15000 playlists
# playlists = leer_k_archivos_json(1)
paths = [
    "mpd.slice.0-999.json",
]
all_playlists = []
for path in paths:
    print(path)
    data = read_json(f"/content/data/{path}")
    playlist = pd.DataFrame(data['playlists'])
    all_playlists.append(playlist)

playlists = pd.concat(all_playlists, ignore_index=True)

# playlists.head()

mpd.slice.0-999.json


In [4]:
playlists.isna().sum()

name               0
collaborative      0
pid                0
modified_at        0
num_tracks         0
num_albums         0
num_followers      0
tracks             0
num_edits          0
duration_ms        0
num_artists        0
description      980
dtype: int64

In [5]:
playlists["description"][~playlists["description"].isna()]

94                                          chilllll out
102                                                  uzi
320                                   sit back and chill
329                                    el espanish trap.
339                     roasty toasty in the holy ghosty
353                                   Always thinking...
354            What I listen to crusing on my motorcycle
370                                       merry chrysler
475                                                  sad
491                           A little bit of everything
522    Intense, high testosterone, ground-breaking, f...
526                                Buenos Días Muchachos
548                                       Bumblebee Tuna
626                               closeted hypochondriac
741    I listen to this at the moment. Try a couple s...
767                              Just a big mix of music
778                 but actually electric, teenage songs
818                            

In [6]:
def expandir_tracks(playli):
    expanded_tracks_df = pd.DataFrame()

    # Iterar sobre cada fila y expandir los datos JSON en un dataframe
    for _, row in playli.iterrows():
        # Cargar el JSON desde la columna 'tracks'
        tracks = row['tracks']

        # Comprobar si 'tracks' es una cadena que necesita ser convertida de JSON
        if isinstance(tracks, str):
            try:
                tracks = json.loads(tracks)  # Cargar la cadena JSON
            except json.JSONDecodeError:
                tracks = []  # En caso de error, usar una lista vacía
        elif not isinstance(tracks, list):
            tracks = []  # Asegurar que tracks sea una lista si no es una cadena

        if tracks:
            track_data = json_normalize(tracks)
            track_data['pid'] = row['pid']  # añadimos el id de la playlist para mantener la referencia
            expanded_tracks_df = pd.concat([expanded_tracks_df, track_data], ignore_index=True)
    return expanded_tracks_df

In [7]:
expanded_tracks_df = expandir_tracks(playlists)

In [8]:
playlists.drop("tracks", axis=1, inplace=True)
playlists.head()

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,num_edits,duration_ms,num_artists,description
0,Throwbacks,False,0,1493424000,52,47,1,6,11532414,37,
1,Awesome Playlist,False,1,1506556800,39,23,1,5,11656470,21,
2,korean,False,2,1505692800,64,51,1,18,14039958,31,
3,mat,False,3,1501027200,126,107,1,4,28926058,86,
4,90s,False,4,1401667200,17,16,2,7,4335282,16,


In [9]:
by_playlist = expanded_tracks_df.groupby('pid').agg(
    num_songs=('track_name', 'count'),  # Contamos el número de canciones por playlist
    avg_duration=('duration_ms', 'mean'),  # Calculamos la duración media de las canciones
    artists=('artist_name', lambda x: set(x)),  # Obtenemos un conjunto de artistas únicos
    songs =('track_uri', lambda x: set(x)),  # Obtenemos un conjunto de caciones únicas
    albums =('album_name', lambda x: set(x))  # Obtenemos un conjunto de albumes únicos
)

by_playlist['avg_duration'] = by_playlist['avg_duration'] / 60000 # Convertimos la duración de milisegundos a minutos
# by_playlist.head()

In [10]:
playlist_per_song = expanded_tracks_df.groupby('track_uri')['pid'].agg(list).reset_index()
# playlist_per_song.head()

- Pasamos los URL a un id de cancion

In [11]:
url_id = {}
n = 1

def key_exists(key, dictionary):
    try:
        value = dictionary[key]
        return True
    except KeyError:
        return False

def get_id(x, n):
    if not key_exists(x, url_id):
        url_id[x] = n
        n += 1
    return url_id[x], n

values = []

for val in playlist_per_song["track_uri"]:
    valu, n = get_id(val, n)
    values.append(valu)

playlist_per_song["track_id"] = values
playlist_per_song.head()

Unnamed: 0,track_uri,pid,track_id
0,spotify:track:000mA0etY38nKdvf1N04af,[371],1
1,spotify:track:000xQL6tZNLJzIrtIgxqSl,"[182, 813]",2
2,spotify:track:006AVH7fq061voGXkUiII4,[999],3
3,spotify:track:006PJvsr6CyV3JdBf7wiNF,[743],4
4,spotify:track:006yrnQMCZpiUgkR612gC8,[227],5


In [12]:
expanded_tracks_df["track_id"] = expanded_tracks_df["track_uri"].apply(lambda x: url_id[x])

In [13]:
by_playlist["track_id"] = by_playlist["songs"].apply(lambda x: [url_id[val] for val in list(x)])
# by_playlist.head()

In [14]:
playlist_per_song['n_playlists'] = playlist_per_song['pid'].apply(lambda x: len(set(x)))

In [15]:
playlist_per_song["n_playlists"].mean()

1.9371425253317074

In [16]:
cancion_mas_reproducida = playlist_per_song["n_playlists"].idxmax()
playlist_per_song.iloc[cancion_mas_reproducida]

track_uri                   spotify:track:7KXjTSCq5nL1LoYtL7XAwS
pid            [28, 53, 65, 85, 95, 123, 124, 160, 164, 178, ...
track_id                                                   32397
n_playlists                                                   52
Name: 32396, dtype: object

# 2. Baseline

In [17]:
!pip3 install implicit
# clear_outputs()

Collecting implicit
  Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl (8.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: implicit
Successfully installed implicit-0.7.2


In [18]:
import implicit as implicit

In [23]:
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.evaluation import mean_average_precision_at_k, ndcg_at_k
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
#Hay playlist que tienen la misma cancion más de una vez, aca las tratamos solo una vez
playlist_track = expanded_tracks_df[["pid","track_id","track_name","duration_ms","artist_name"]].drop_duplicates()
# playlist_track.head()

- Separamos en data de validación y data de entrenamiento

In [25]:
# Acá separamos en las playlists con un número de canciones mayor a NUM_CANCIONES
# Luego nos quedamos solo con las playlist con un número mayor para hacer el entrenamiento y el testeo
NUM_CANCIONES = 10
song_per_playlist = playlist_track.groupby('pid')['track_id'].agg(list).reset_index()
song_per_playlist['n_songs'] = song_per_playlist['track_id'].apply(len)
pid_less = song_per_playlist[song_per_playlist['n_songs'] < NUM_CANCIONES]['pid'].unique()
pid_more = song_per_playlist[song_per_playlist['n_songs'] >= NUM_CANCIONES]['pid'].unique()
print(len(pid_less), len(pid_more))

35 965


In [26]:
# Acá hacemos una separación de las playlists que le vamos a entregar la información completa,
# y las que le vamos a entregar solo las primeras n

train_pid, test_pid = train_test_split(pid_more, test_size=0.2, random_state=42)
print(len(train_pid), len(test_pid))

772 193


In [27]:
song_per_playlist.drop("n_songs", axis=1, inplace=True)
song_per_playlist.head(3)

Unnamed: 0,pid,track_id
0,0,"[2209, 27709, 2383, 5177, 7784, 2437, 27044, 1..."
1,1,"[9981, 6051, 17334, 6282, 6126, 14998, 2900, 3..."
2,2,"[31321, 20465, 20655, 7274, 2349, 9120, 487, 1..."


In [28]:
# Acá separamos para las playlist que están en test sus primeras PRIMERAS_N canciones para train y el resto para train
# Hay que asegurarse que PRIMERAS_N << NUM_CANCIONES

PRIMERAS_N = 5
test_all = song_per_playlist[song_per_playlist['pid'].isin(test_pid)]
test_all['fist_5'] = test_all['track_id'].apply(lambda x: x[:PRIMERAS_N])
test_all['next'] = test_all['track_id'].apply(lambda x: x[PRIMERAS_N:])
test_all.drop("track_id", axis=1, inplace=True)
test_train = test_all[['pid', 'fist_5']].explode('fist_5')
test_test = test_all[['pid', 'next']].explode('next')
test_train.rename(columns={'fist_5': 'track_id'}, inplace=True)
test_test.rename(columns={'next': 'track_id'}, inplace=True)


# clear_outputs()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_all['fist_5'] = test_all['track_id'].apply(lambda x: x[:PRIMERAS_N])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_all['next'] = test_all['track_id'].apply(lambda x: x[PRIMERAS_N:])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_all.drop("track_id", axis=1, inplace=True)


In [29]:
# Luego acá creamos los datasets de train y test

data = playlist_track[['pid', 'track_id']]
train_data = data[data['pid'].isin(train_pid)]                       # Playlists con todas las canciones
train_data = pd.concat([train_data, test_train], ignore_index=True)  # Playlists con solo las PRIMERAS_N canciones
test_data = test_test


In [30]:
# Pequeño test para revisar cuantas canciones el modelo no se ha entrenado con, lo cual implica que no las va a poder recomendar
# Este número debería bajar si incluimos más data en el comienzo

test_tracks_ids = test_data['track_id'].unique()
train_tracks_ids = train_data['track_id'].unique()
tracks_not_in_train = np.setdiff1d(test_tracks_ids, train_tracks_ids)
tracks_not_in_train, len(tracks_not_in_train) / len(test_tracks_ids)


(array([1, 11, 13, ..., 34421, 34424, 34429], dtype=object),
 0.5343448841500923)

# New ALS

In [31]:
# Create user-item interaction matrices for train and test sets
train_interaction_matrix = csr_matrix((np.ones(len(train_data)), (train_data['pid'], train_data['track_id'])))
test_interaction_matrix = csr_matrix((np.ones(len(test_data)), (test_data['pid'], test_data['track_id'])))


model = AlternatingLeastSquares(factors=100, alpha = 20)
model.fit(train_interaction_matrix)

  check_blas_config()


  0%|          | 0/15 [00:00<?, ?it/s]

In [37]:
def R_precision(recommended, actual):
    n = len(actual)
    recommended = recommended[:n]
    in_both = np.intersect1d(recommended, actual)
    return len(in_both) / n

def dcg(relevance_scores):
    return relevance_scores[0] + np.sum(
        [rel / np.log2(idx + 1) for idx, rel in enumerate(relevance_scores[1:], start=2)]
    )

def ndcg(recommended, actual):
    """Calculate Normalized Discounted Cumulative Gain (NDCG) at k"""
    # Relevance scores: 1 if the item is in the ground truth, 0 otherwise
    relevance_scores = [1 if item in actual else 0 for item in recommended]

    # Calculate DCG for the recommended list
    DCG = dcg(relevance_scores)

    # Calculate IDCG for the ideal list (the best possible ranking)
    ideal_relevance_scores = sorted(relevance_scores, reverse=True)
    IDCG = dcg(ideal_relevance_scores)

    # Calculate NDCG
    NDCG = DCG / IDCG if IDCG > 0 else 0
    return NDCG

def rec_song_clicks(recommended, actual):
    for i, item in enumerate(recommended):
        if item in actual:
            clicks = np.floor(i / 10)
            return clicks
    return 51

In [41]:
# Pequeño código mostrando que el modelo efectivamente logra recomendar canciones relevantes para usuarios (playlists) del train
# Falta implementar las métricas, las clásicas de ndcg y r-precision y las que se crearon para la competencia


user_ids = test_pid
print(len(user_ids))
print(train_data['track_id'].nunique())
r_prec = []
ndcgs = []
clicks = []
for userid in user_ids:

  ids, scores = model.recommend(userid, train_interaction_matrix[userid], N=500, filter_already_liked_items=True)
  actual = test_test[test_test['pid'] == userid]['track_id'].values

  # print(actual)
  r_prec.append(R_precision(ids, actual))
  ndcgs.append(ndcg(ids, actual))
  clicks.append(rec_song_clicks(ids, actual))

  # in_both = np.intersect1d(ids, actual)
  # if in_both.size > 0:
  #     print(in_both)

print(np.mean(r_prec))
print(np.mean(ndcgs))
print(np.mean(clicks))



193
29130
0.06703178454492253
0.34488809769036843
10.937823834196891


# Old User User

Este lo voy a cambiar considerando el nuevo train test split y haciendolo más legible

(yo creo que con el nuevo train test split voy a poder hacerlo mucho más rapido (y entendible) pq con el anterior no sabía que hacer con playlists con las cuales no se había entrenado)

In [108]:
user_factors = model.user_factors
item_factors = model.item_factors



NDCG@10: 0.0000


# Old Item-Item 2


In [132]:
# Define a function to find similar items using cosine similarity
def find_similar_items(item_id, item_factors, num_items=10):
    item_vector = item_factors[item_id].reshape(1, -1)
    similarities = cosine_similarity(item_vector, item_factors).flatten()
    similar_items = np.argsort(similarities)[::-1][1:num_items+1]  # Skip the item itself
    return similar_items

# Define a function to infer new user vector
def infer_new_user_vector(new_user_tracks, item_factors, model):
    new_user_vector = np.zeros(model.factors)
    for track_id in new_user_tracks:
        new_user_vector += model.item_factors[track_id]
    new_user_vector /= len(new_user_tracks)  # Average the vectors
    return new_user_vector

# Define a function to calculate NDCG@10
def ndcg_at_k(r, k):
    r = np.asfarray(r)[:k]
    if r.size == 0:
        return 0.0
    dcg = np.sum(r / np.log2(np.arange(2, r.size + 2)))
    idcg = np.sum(np.ones_like(r) / np.log2(np.arange(2, r.size + 2)))
    return dcg / idcg

# Evaluate the recommendations on the test set using NDCG@10
def evaluate_ndcg(test_data, user_factors, item_factors, model, num_items=10, k=10):
    ndcg_scores = []
    for user_id in test_data['pid'].unique():
        test_items = test_data[test_data['pid'] == user_id]['track_id'].values
        recommended_items = []

        if user_id in train_data['pid'].unique():  # Existing user
            user_tracks = train_data[train_data['pid'] == user_id]['track_id'].values
            for track_id in user_tracks:
                similar_items = find_similar_items(track_id, item_factors, num_items)
                recommended_items.extend(similar_items)
            recommended_items = np.unique(recommended_items)[:k]
        else:  # New user
            new_user_tracks = test_data[test_data['pid'] == user_id]['track_id'].values
            new_user_vector = infer_new_user_vector(new_user_tracks, item_factors, model)
            similarities = cosine_similarity(new_user_vector.reshape(1, -1), item_factors).flatten()
            recommended_items = np.argsort(similarities)[::-1][:k]

        relevance = np.isin(recommended_items, test_items).astype(int)
        ndcg_score = ndcg_at_k(relevance, k)
        ndcg_scores.append(ndcg_score)

    return np.mean(ndcg_scores)

# Compute and print the NDCG@10 score
ndcg_score = evaluate_ndcg(test_data, user_factors, item_factors, model)
print(f"NDCG@10: {ndcg_score:.4f}")


IndexError: index 31740 is out of bounds for axis 0 with size 1000