In [1]:
import requests
from dotenv import load_dotenv
import os
import json
import base64   
import pandas as pd
import librosa
import numpy as np
import lyricsgenius
import langdetect
import re
import string
import tempfile
from tqdm import tqdm  # Import tqdm for progress bar
from datetime import datetime, timedelta

from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.optimizers import Adam

In [2]:
# For any api we can try using the "+" email trick to get more API keys

genius_client_id = "wZZ2RWc5mqp-5Pbz2W1rQJWE8LQ3pFBrb1Hw5_AOqgybq28mt7kjdjcG4zktCNbO"
genius_client_secret = "PefqBJHor_muDgTutGlaXXaxmzsI7TQCps9FQ3FwkUTT0WJIT3s0A5YA9mnFbfp_-CBhQF7b0omgE8kaM3dJ3w"
genius_access_token = "NUHHVpwnmbDYUYw8Padu0gQeHvYN4OsKYtE2MKNUpBUI6yR-xZXKY6S5NvCnFbiP"

lastfm_api_key = "97d5a64d5ba4a8bc580b752ceff3b87f"
lastfm_secret = "35175090bd61f6f16ac607bd26e5b1de"

The next 2 codeblocks are for extracting data from lastfm. These pertain to User data, where we want to get their recent tracks from up to one month ago. We limit it to 100 tracks per user. This is run on unique users from the user-song dataset

In [30]:
base_url = 'http://ws.audioscrobbler.com/2.0/'

def lastfm_get(payload):
    headers = {'user-agent': 'DataCollectorBot'}
    payload['api_key'] = lastfm_api_key
    payload['format'] = 'json'
    response = requests.get(base_url, headers=headers, params=payload)
    return response.json()


def get_recent_tracks(user):
    payload = {'method': 'user.getrecenttracks', 'user': user}
    return lastfm_get(payload)

def get_weekly_artist_chart(user):
    payload = {'method': 'user.getweeklyartistchart', 'user': user}
    return lastfm_get(payload)

def get_weekly_track_chart(user):
    payload = {'method': 'user.getweeklytrackchart', 'user': user}
    return lastfm_get(payload)

In [5]:
def get_one_month_ago_timestamp():
    one_month_ago = datetime.now() - timedelta(days=30)
    return int(one_month_ago.timestamp())

def recent_tracks_for_user_to_df(user, min_tracks=50, max_tracks=100):
    from_timestamp = get_one_month_ago_timestamp()
    
    payload = {
        'method': 'user.getrecenttracks',
        'user': user,
        'from': from_timestamp,
        'limit': max_tracks 
    }
    
    recent_tracks = lastfm_get(payload)
    tracks_list = []
    
    if 'track' in recent_tracks.get('recenttracks', {}):
        for track in recent_tracks['recenttracks']['track']:
            if 'date' in track: 
                track_info = {
                    'User': user, 
                    'Artist': track['artist']['#text'],
                    'Track Name': track['name'],
                    'Timestamp': track['date']['uts']
                }
                tracks_list.append(track_info)

    df = pd.DataFrame(tracks_list)
    return df

def recent_tracks_all_users_to_df(users):
    all_tracks_dfs = [] 
    total_users = len(users)
    
    with tqdm(total=total_users, desc="Processing Users", unit="user") as pbar:
        for user in users:
            df = recent_tracks_for_user_to_df(user)
            all_tracks_dfs.append(df)
            pbar.update(1) 
    
    combined_df = pd.concat(all_tracks_dfs, ignore_index=True)
    
    return combined_df


df = pd.read_csv('../../Downloads/user_songs_filtered.csv')
users = df["Username"].unique()
combined_tracks_df = recent_tracks_all_users_to_df(users)
combined_tracks_df.to_excel("../../Downloads/Users_Songs_Timestamps.xlsx")

Processing Users: 100%|██████████| 9483/9483 [2:08:50<00:00,  1.23user/s]  


In [6]:
def list_to_df(data_list, columns):
    if data_list:
        df = pd.DataFrame(data_list, columns=columns)
        return df
    else:
        return pd.DataFrame(columns=columns)

def get_weekly_artist_chart_df(user):
    result = get_weekly_artist_chart(user)
    artists = []
    if 'weeklyartistchart' in result and 'artist' in result['weeklyartistchart']:
        for item in result['weeklyartistchart']['artist']:
            artists.append({
                'Artist': item['name'],
                'Play Count': item['playcount']
            })
    return list_to_df(artists, ['Artist', 'Play Count'])

def get_weekly_track_chart_df(user):
    result = get_weekly_track_chart(user)
    tracks = []
    if 'weeklytrackchart' in result and 'track' in result['weeklytrackchart']:
        for item in result['weeklytrackchart']['track']:
            tracks.append({
                'Track Name': item['name'],
                'Artist': item['artist']['#text'],
                'Play Count': item['playcount']
            })
    return list_to_df(tracks, ['Track Name', 'Artist', 'Play Count'])

Here, we are concatenating the data, then splitting it into sequences of length 3 after grouping them by user and then sorting by timestamp.

In [3]:
# df = combined_tracks_df[:100000]
df = pd.read_excel("../../Downloads/Users_Songs_Timestamps.xlsx")

import numpy as np
from sklearn.model_selection import train_test_split

df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
df['Time_of_Day'] = df['Timestamp'].dt.hour
df['Artist_Track'] = df['Artist'].astype(str) + ' - ' + df['Track Name'].astype(str)

time_of_day_encoded = pd.get_dummies(df['Time_of_Day'], prefix='hour')

label_encoder = LabelEncoder()

df['Artist_Track_Encoded'] = label_encoder.fit_transform(df['Artist_Track'])

df = pd.concat([df, time_of_day_encoded], axis=1)

sequence_length = 3
vocab_size = len(label_encoder.classes_)


X_seq_list, y_seq_list = [], []

for _, group in df.groupby('User'):
    group = group.sort_values('Timestamp')
    
    for i in range(len(group) - sequence_length + 1):
        artist_track_sequence = group['Artist_Track_Encoded'].iloc[i:i + sequence_length - 1].values
        
        time_features_sequence = group[time_of_day_encoded.columns].iloc[i:i + sequence_length - 1].values.reshape((sequence_length - 1) * len(time_of_day_encoded.columns))
        
        sequence = np.hstack([artist_track_sequence, time_features_sequence])
        
        label = group['Artist_Track_Encoded'].iloc[i + sequence_length - 1]
        
        X_seq_list.append(sequence)
        y_seq_list.append(label)

X_seq = np.array(X_seq_list)
# y_seq = to_categorical(y_seq_list, num_classes=vocab_size)
y_seq = np.array(y_seq_list) # Integer instead of one hot encoding


X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, shuffle=False)

num_artist_track_features = sequence_length - 1  
num_time_features = 24 * (sequence_length - 1)  

X_train_artist_track = X_train[:, :num_artist_track_features]  
X_train_time_features = X_train[:, num_artist_track_features:] 

X_test_artist_track = X_test[:, :num_artist_track_features] 
X_test_time_features = X_test[:, num_artist_track_features:]  


In [4]:
num_time_features_actual = X_train_time_features.shape[1]

num_time_features = num_time_features_actual
print("Updated number of time features:", num_time_features)


Updated number of time features: 48


# LSTM


In [9]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dropout, Dense, Bidirectional, BatchNormalization, concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.mixed_precision import set_global_policy
from tensorflow.keras.models import load_model

set_global_policy('mixed_float16')

early_stopping = EarlyStopping(
    monitor='val_loss',  
    patience=10,  
    restore_best_weights=True 
)

artist_track_input = Input(shape=(sequence_length-1,), dtype='int32', name='artist_track_input')
time_features_input = Input(shape=(num_time_features,), name='time_features_input')  

embedding_layer = Embedding(input_dim=vocab_size + 1, output_dim=50, input_length=sequence_length-1)(artist_track_input)
lstm_layer = LSTM(40, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)

time_dense_layer = Dense(40, activation='relu')(time_features_input)

combined = concatenate([lstm_layer, time_dense_layer])

x = Dropout(0.5)(combined)
x = BatchNormalization()(x)
x = Dense(100, activation='relu', kernel_regularizer=l1_l2(l1=0.01, l2=0.01))(x)
x = Dropout(0.5)(x)
output = Dense(vocab_size, activation='softmax')(x)

model = Model(inputs=[artist_track_input, time_features_input], outputs=output)

model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
model.summary()




The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 artist_track_input (InputLayer  [(None, 2)]         0           []                               
 )                                                                                                
                                                                                                  
 embedding_1 (Embedding)        (None, 2, 50)        15286150    ['artist_track_input[0][0]']     
                                                                                                  
 time_features_input (InputLaye  [(None, 48)]        0           []                               
 r)           

In [7]:
def data_generator(X_artist_track, X_time_features, y, batch_size):

    num_samples = X_artist_track.shape[0]
    while True: 
        for offset in range(0, num_samples, batch_size):
            batch_X_artist_track = X_artist_track[offset:offset+batch_size]
            batch_X_time_features = X_time_features[offset:offset+batch_size]
            batch_y = y[offset:offset+batch_size]
            
            
            yield [batch_X_artist_track, batch_X_time_features], batch_y
            
import tensorflow as tf



In [10]:
batch_size = 128
steps_per_epoch = np.ceil(X_train_artist_track.shape[0] / batch_size)

train_generator = data_generator(X_train_artist_track, X_train_time_features, y_train, batch_size)
validation_generator = data_generator(X_test_artist_track, X_test_time_features, y_test, batch_size)

history = model.fit(
    train_generator,
    steps_per_epoch=steps_per_epoch,
    epochs=10,
    validation_data=validation_generator,
    validation_steps=np.ceil(X_test_artist_track.shape[0] / batch_size), 
    callbacks=[early_stopping]
)

Epoch 1/10


KeyboardInterrupt: 

In [12]:
model.save('lstm_model')

# model = load_model('lstm_model')



INFO:tensorflow:Assets written to: lstm_model\assets


INFO:tensorflow:Assets written to: lstm_model\assets


In [13]:

y_pred_prob = model.predict([X_test_artist_track, X_test_time_features])
y_true = y_test
k = 30
top_k_indices = np.argsort(y_pred_prob, axis=1)[:, -k:][:, ::-1]  

y_pred_top_k = label_encoder.inverse_transform(top_k_indices.flatten()).reshape(top_k_indices.shape)
y_true_names = label_encoder.inverse_transform(y_true)

# binary_relevance = np.array([[1 if label in pred[:k] else 0 for label in y_true_names] for pred in y_pred_top_k])
# predicted_scores = np.random.rand(k)  

all_positives = len(y_true)



MemoryError: Unable to allocate 1.43 GiB for an array with shape (19594, 19594) and data type int32

In [27]:
def calculate_top_30_accuracy_and_print_recommendations(y_pred_prob, y_true, label_encoder):
    
    k = 30
    top_k_indices = np.argsort(y_pred_prob, axis=1)[:, -k:][:, ::-1] 
    top_k_accuracy_list = []

    for i, (top_k, true) in enumerate(zip(top_k_indices, y_true)):
        hit = true in top_k
        top_k_accuracy_list.append(int(hit))

        recommended_names = label_encoder.inverse_transform(top_k)
        true_name = label_encoder.inverse_transform([true])[0]


    top_30_accuracy = np.mean(top_k_accuracy_list)
    print(f"Top-30 Accuracy: {top_30_accuracy*100:.2f}%")
    
    return top_30_accuracy

def compute_hit_rate_at_k(y_true, y_pred_top_k, k):

    hits = 0
    for true, pred in zip(y_true, y_pred_top_k):
        if true in pred[:k]:
            hits += 1
    return hits / len(y_true)

def precision_at_30(y_true, y_pred_top_k):
    correct_predictions = sum(1 for true, pred in zip(y_true, y_pred_top_k) if true in pred[:30])
    return correct_predictions / len(y_pred_top_k)

def recall_at_30(y_true, y_pred_top_k):
    hits = sum(1 for true, pred in zip(y_true, y_pred_top_k) if true in pred[:30])
    total_relevant = len(y_true)  
    return hits / total_relevant


def average_precision_at_k(y_true, y_score, k=30):
    y_true = np.asarray(y_true)[:k]
    y_score = np.asarray(y_score)[:k]

    if not y_true.any():
        return 0

    score = 0
    num_hits = 0
    for i, (p, rel) in enumerate(zip(y_score, y_true), 1):
        if rel:
            num_hits += 1
            score += num_hits / i
    return score / np.sum(y_true)

def apk(actual, predicted, k=30):
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=30):
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])



hit_rate = compute_hit_rate_at_k(y_true_names, y_pred_top_k, k)
print(f"Hit Rate at {k}: {hit_rate:.4f}")
precision_30 = precision_at_30(y_true_names, y_pred_top_k)
print(f"Precision@30: {precision_30:.4f}")
recall_30 = recall_at_30(y_true_names, y_pred_top_k)
print(f"Recall@30: {recall_30:.4f}")


map_score = mapk([[y] for y in y_true_names], y_pred_top_k, k=30)
print(f"MAP: {map_score:.4f}")

Hit Rate at 30: 0.0469
Precision@30: 0.0469
Recall@30: 0.0469
MAP: 0.0070


In [26]:

def precompute_logarithms(k):
    return np.log2(np.arange(2, k + 2))

def calculate_batch_ndcg(y_true_batch, y_pred_prob_batch, k, precomputed_logs):
    top_k_indices = np.argpartition(y_pred_prob_batch, -k)[:, -k:]
    ndcg_scores = []

    for true_label, indices in zip(y_true_batch, top_k_indices):
        sorted_indices = np.argsort(-y_pred_prob_batch[np.arange(len(indices)), indices])
        is_relevant = (true_label == indices[sorted_indices]).astype(int)
        
        dcg = np.sum((2**is_relevant - 1) / precomputed_logs[sorted_indices])
        idcg = np.sum((2**1 - 1) / precomputed_logs[:np.sum(is_relevant)])
        ndcg_score = dcg / idcg if idcg > 0 else 0
        ndcg_scores.append(ndcg_score)

    return np.mean(ndcg_scores)

def calculate_ndcg_in_batches(y_true, y_pred_prob, k=30, batch_size=1000):
    num_samples = y_true.shape[0]
    precomputed_logs = precompute_logarithms(k)
    ndcg_scores = []

    for start_idx in range(0, num_samples, batch_size):
        end_idx = start_idx + batch_size
        batch_ndcg_score = calculate_batch_ndcg(
            y_true[start_idx:end_idx],
            y_pred_prob[start_idx:end_idx],
            k,
            precomputed_logs
        )
        ndcg_scores.append(batch_ndcg_score)

    mean_ndcg = np.mean(ndcg_scores)
    return mean_ndcg

mean_ndcg_score = calculate_ndcg_in_batches(y_true, y_pred_prob, k=30, batch_size=1000)
print(f"Mean NDCG@30: {mean_ndcg_score:.4f}")


Mean NDCG@30: 0.0128
