In [4]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

def assign_clusters(df, numerical_features, num_clusters=10):
    """
    Label every row of the dataset with a KMeans cluster id.

    The selected feature columns are standardized with StandardScaler
    before clustering, and KMeans is seeded with a fixed random_state.

    Parameters:
        df (DataFrame): Dataset containing song features. A "Cluster"
            column is written onto this object in place.
        numerical_features (list): Names of the numerical columns to cluster on.
        num_clusters (int): Number of clusters to create.

    Returns:
        DataFrame: The same ``df`` object, now carrying a "Cluster" column.
    """
    # Standardize first so the clustering sees the scaled feature matrix
    # rather than the raw column values.
    standardized = StandardScaler().fit_transform(df[numerical_features])

    model = KMeans(n_clusters=num_clusters, random_state=42)
    labels = model.fit_predict(standardized)

    # Written onto the caller's frame (not a copy) and handed back.
    df['Cluster'] = labels
    return df

def recommend_songs(song_name, df, numerical_features, num_recommendations=5):
    """
    Recommend similar songs based on cosine similarity within the same cluster.

    The first row whose "name" equals ``song_name`` (exact, case-sensitive
    match) is used as the query. Only the query's similarity to the other
    songs in its cluster is computed, and the query row itself is removed
    explicitly before ranking.

    Parameters:
        song_name (str): Name of the input song.
        df (DataFrame): Dataset containing "name", "year", "artists", "Cluster"
            and the feature columns.
        numerical_features (list): List of numerical feature columns used for
            similarity calculation.
        num_recommendations (int): Maximum number of recommendations to return.
            Values below zero are treated as zero.

    Returns:
        DataFrame: Up to ``num_recommendations`` rows with the "name", "year"
        and "artists" columns, most similar first, with a fresh integer index.

    Raises:
        ValueError: If ``song_name`` is not present in ``df["name"]``, or if
            the song is the only member of its cluster.
    """
    # Locate the input song once; when names repeat, the first row wins.
    name_matches = df["name"].eq(song_name).fillna(False).to_numpy(dtype=bool)
    if not name_matches.any():
        raise ValueError(f"'{song_name}' does not exist in the dataset.")
    query_pos = int(np.flatnonzero(name_matches)[0])

    # Restrict to the songs that share the query's cluster.
    song_cluster = df["Cluster"].iloc[query_pos]
    in_cluster = df["Cluster"].eq(song_cluster).fillna(False).to_numpy(dtype=bool)
    same_cluster_songs = df.loc[in_cluster]

    if same_cluster_songs.shape[0] <= 1:
        raise ValueError(f"Not enough songs in the cluster to recommend similar songs for '{song_name}'.")

    # Positional offset of the query inside the cluster subset. Working with
    # positions (not index labels) stays correct when df has duplicate labels.
    row_index = int(in_cluster[:query_pos].sum())

    # NOTE(review): similarity uses the raw feature values, while
    # assign_clusters clusters on standardized ones; columns with large
    # magnitudes will dominate the cosine. Confirm this is intended.
    cluster_features = same_cluster_songs[numerical_features].to_numpy()

    # One-vs-many: a single row of similarities instead of the full n x n
    # matrix, which is all the ranking below needs.
    query_vector = cluster_features[row_index:row_index + 1]
    similarity = cosine_similarity(query_vector, cluster_features)[0]

    # Drop the query by position rather than assuming it sorts last: rows with
    # identical or near-parallel features can tie with (or, through rounding,
    # exceed) the query's self-similarity.
    candidates = np.delete(np.arange(similarity.shape[0]), row_index)

    # Stable descending sort so ties keep dataset order and output is
    # deterministic.
    ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]
    top_positions = ranked[:max(num_recommendations, 0)]

    recommendations = same_cluster_songs.iloc[top_positions][["name", "year", "artists"]]
    return recommendations.reset_index(drop=True)

# Example usage: cluster the songs in data.csv, then write the songs most
# similar to one input song to song_recommendations.csv.
df = pd.read_csv("data.csv")

# Feature columns shared by the clustering step and the similarity step.
numerical_features = [
    "valence",
    "danceability",
    "acousticness",
    "energy",
    "instrumentalness",
    "liveness",
    "loudness",
    "speechiness",
    "tempo",
    "duration_ms",
]

# Adds the "Cluster" column that recommend_songs relies on.
df = assign_clusters(df, numerical_features, num_clusters=10)

try:
    # Must match a value in the "name" column exactly (the lookup is
    # case-sensitive); replace with a song that exists in your dataset.
    input_song = "Not like us"
    recommendations = recommend_songs(
        song_name=input_song,
        df=df,
        numerical_features=numerical_features,
        num_recommendations=5,
    )

    # Persist the result without the positional index column.
    recommendations.to_csv("song_recommendations.csv", index=False)

    print(f"\nSongs similar to '{input_song}' have been saved to 'song_recommendations.csv'.")
except Exception as err:
    # Top-level boundary of the example: report the failure instead of
    # surfacing a traceback.
    print(f"Error: {err}")


Error: 'Not like us' does not exist in the dataset.
