In [None]:
import os
import numpy as np
import pandas as pd

import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the three CSV inputs from the local ./data directory.
data = pd.read_csv(filepath_or_buffer="./data/data.csv")
genre_data = pd.read_csv(filepath_or_buffer="./data/data_by_genres.csv")
year_data = pd.read_csv(filepath_or_buffer="./data/data_by_year.csv")

In [None]:
import numpy as np
import pandas as pd

# Use the song 'id' as the unique identifier
songs = data['id'].unique()
num_items = len(songs)

# Build an id -> popularity lookup once instead of scanning the whole frame
# for every simulated rating. Keeping the first row per id matches taking the
# first match when an id appears on more than one row.
popularity_by_id = data.drop_duplicates(subset='id').set_index('id')['popularity']

# Simulate synthetic user ratings:
# We create ratings for `num_users` users; each user rates between 10 and 50 songs.
num_users = 100
np.random.seed(42)  # for reproducibility
ratings_list = []

for user in range(num_users):
    # randint's upper bound is exclusive, so this yields 10-50 ratings per user.
    num_ratings = np.random.randint(10, 51)
    rated_songs = np.random.choice(songs, size=num_ratings, replace=False)
    # Use each song's popularity (scaled and with added noise) as a proxy for a rating.
    song_popularity = popularity_by_id.loc[rated_songs].to_numpy()
    noise = np.random.normal(0, 0.5, size=num_ratings)
    # Clip ratings between 1 and 5 so that 0 can later mean "not rated".
    ratings = np.clip(song_popularity / 20 + noise, 1, 5)
    ratings_list.extend(zip([user] * num_ratings, rated_songs, ratings))

# Create a DataFrame for ratings with columns: user, song, rating
ratings_df = pd.DataFrame(ratings_list, columns=['user', 'song', 'rating'])

# Create a mapping from song id to column index
song2idx = {song: idx for idx, song in enumerate(songs)}

# Build the ratings matrix R (users x items). Entries that are never assigned
# stay at 0, which the rest of the notebook treats as "not rated".
R = np.zeros((num_users, num_items))
# Fill every observed rating with one fancy-index assignment instead of a
# Python-level iterrows() loop; an id missing from song2idx maps to NaN and
# makes the integer conversion fail loudly rather than being skipped.
user_rows = ratings_df['user'].to_numpy(dtype=int)
item_cols = ratings_df['song'].map(song2idx).to_numpy(dtype=int)
R[user_rows, item_cols] = ratings_df['rating'].to_numpy()

# --- ALS Parameters ---
num_factors = 10      # dimensionality of the latent space
num_iterations = 10   # how many alternating sweeps to run
lambda_reg = 0.1      # strength of the regularization term

# Random starting points for the factor matrices, drawn uniformly from [0, 1):
# one row per user in U and one row per item in V.
U = np.random.random_sample((num_users, num_factors))
V = np.random.random_sample((num_items, num_factors))

# Identity matrix that carries the regularization term
I = np.identity(num_factors)

# --- ALS Algorithm ---
# R does not change during training, so the "was rated" mask and the sets of
# users/items that have at least one rating are computed once up front.
mask = R > 0
user_has_ratings = mask.any(axis=1)
item_has_ratings = mask.any(axis=0)

# A user or item without any rating is constrained only by the regularization
# term, whose minimizer is the zero vector. Leaving such rows at their random
# initial values would make the predicted ratings of never-rated songs pure
# noise, so they are zeroed here instead of being skipped untouched.
U[~user_has_ratings] = 0.0
V[~item_has_ratings] = 0.0

for iteration in range(num_iterations):
    # Update user latent factors (only users with at least one rating)
    for u in np.flatnonzero(user_has_ratings):
        rated_idx = mask[u, :]
        V_u = V[rated_idx, :]      # item factors for items rated by u
        r_u = R[u, rated_idx]      # corresponding ratings
        # Solve for U[u] in: (V_u^T V_u + lambda * I) * U[u] = V_u^T r_u
        A = V_u.T.dot(V_u) + lambda_reg * I
        b = V_u.T.dot(r_u)
        U[u, :] = np.linalg.solve(A, b)

    # Update item latent factors (only items with at least one rating)
    for i in np.flatnonzero(item_has_ratings):
        rated_idx = mask[:, i]
        U_i = U[rated_idx, :]      # user factors for users who rated i
        r_i = R[rated_idx, i]      # corresponding ratings
        # Solve for V[i] in: (U_i^T U_i + lambda * I) * V[i] = U_i^T r_i
        A = U_i.T.dot(U_i) + lambda_reg * I
        b = U_i.T.dot(r_i)
        V[i, :] = np.linalg.solve(A, b)

    # Training RMSE over the observed entries only, for monitoring convergence.
    predictions = U.dot(V.T)
    residuals = R[mask] - predictions[mask]
    # With no observed ratings there is nothing to average; report NaN.
    mse = np.mean(residuals ** 2) if residuals.size else np.nan
    rmse = np.sqrt(mse)
    print(f"Iteration {iteration+1}/{num_iterations}, RMSE: {rmse:.4f}")

# After training, U and V can be multiplied to predict ratings:
predicted_ratings = U.dot(V.T)

# Example: Recommend top 10 songs for a given user (user 0)
def recommend_songs(user_id, top_n=10):
    """Return the ids of the best-predicted songs the user has not rated yet.

    Reads the module-level ``predicted_ratings`` and ``R`` matrices
    (users x items) and the ``songs`` array. Position ``i`` of ``songs`` holds
    the id of item column ``i``, because ``song2idx`` is built from it with
    ``enumerate``.

    Args:
        user_id: Row index of the user in ``R`` and ``predicted_ratings``.
        top_n: Maximum number of song ids to return; must be non-negative.

    Returns:
        A list of song ids ordered from highest to lowest predicted rating.
        It is shorter than ``top_n`` when the user has fewer unrated songs.

    Raises:
        ValueError: If ``top_n`` is negative (a negative slice bound would
            otherwise silently return almost every unrated song).
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")
    user_ratings = predicted_ratings[user_id, :]
    # A zero in R means "not rated"; only those songs are candidates.
    unrated_idx = np.flatnonzero(R[user_id, :] == 0)
    # Sort the unrated songs by predicted rating, best first.
    recommended_idx = unrated_idx[np.argsort(user_ratings[unrated_idx])[::-1]]
    top_indices = recommended_idx[:top_n]
    # Index the id array directly instead of rebuilding an index -> id dict
    # over the whole catalogue on every call.
    return songs[top_indices].tolist()
    


# Demo: list the ten best-predicted unrated songs for the first user.
print("Top recommendations for user 0:")
user0_recommendations = recommend_songs(user_id=0, top_n=10)
print(user0_recommendations)



In [None]:
# Report the dimensions of the learned user and song factor matrices.
for label, factors in (("User matrix (U) shape:", U), ("Song matrix (V) shape:", V)):
    print(label, factors.shape)