In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the dataset
df = pd.read_csv(r"dataset.csv")

# Display the first few rows
print(df.head())

   Unnamed: 0                track_id                 artists  \
0           0  5SuOikwiRyPMVoIQDJUgSV             Gen Hoshino   
1           1  4qPNDBW1i3p13qLCt0Ki3A            Ben Woodward   
2           2  1iJBSr7s7jYXzM8EGcbK5b  Ingrid Michaelson;ZAYN   
3           3  6lfxq3CG4xtTiEg7opyCyx            Kina Grannis   
4           4  5vjLSffimiIP26QG5WcN2K        Chord Overstreet   

                                          album_name  \
0                                             Comedy   
1                                   Ghost (Acoustic)   
2                                     To Begin Again   
3  Crazy Rich Asians (Original Motion Picture Sou...   
4                                            Hold On   

                   track_name  popularity  duration_ms  explicit  \
0                      Comedy          73       230666     False   
1            Ghost - Acoustic          55       149610     False   
2              To Begin Again          57       210826     False   


In [3]:
df_sampled = df.sample(n=5000, random_state=42)  # Adjust n as needed
# Reset the index of the sampled dataset
df_sampled = df_sampled.reset_index(drop=True)

# Check the new index
print("Sampled dataset indices:", df_sampled.index)

Sampled dataset indices: RangeIndex(start=0, stop=5000, step=1)


In [4]:
# Drop unnecessary columns
df = df.drop(columns=['track_id', 'artists', 'album_name', 'track_name', 'track_genre'])

In [5]:
# Check for missing values
print(df.isnull().sum())

# Drop rows with missing values (if any)
df = df.dropna()

Unnamed: 0          0
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
dtype: int64


In [6]:
# Initialize the scaler
scaler = MinMaxScaler()

# Normalize numerical columns
numerical_cols = ['popularity', 'duration_ms', 'danceability', 'energy', 'loudness', 
                  'speechiness', 'acousticness', 'instrumentalness', 'liveness', 
                  'valence', 'tempo']
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [7]:
# Feature matrix
X = df.drop(columns=['mode'])  # Drop the target column (if any)

In [8]:
# Sample a smaller subset of the dataset
df_sampled = df.sample(n=5000, random_state=42)  # Adjust n as needed
X_sampled = df_sampled.drop(columns=['mode'])


In [9]:
cosine_sim = cosine_similarity(X_sampled, X_sampled)

# Convert to a DataFrame for easier indexing
cosine_sim_df = pd.DataFrame(cosine_sim, index=df_sampled.index, columns=df_sampled.index)

# Display the cosine similarity matrix
print(cosine_sim_df.head())

        113186  42819   59311   91368   61000   96815   18939   72760   \
113186     1.0     1.0     1.0     1.0     1.0     1.0     1.0     1.0   
42819      1.0     1.0     1.0     1.0     1.0     1.0     1.0     1.0   
59311      1.0     1.0     1.0     1.0     1.0     1.0     1.0     1.0   
91368      1.0     1.0     1.0     1.0     1.0     1.0     1.0     1.0   
61000      1.0     1.0     1.0     1.0     1.0     1.0     1.0     1.0   

        25788   87169   ...    2805    84494   51053   47932   81834   18241   \
113186     1.0     1.0  ...  0.999997     1.0     1.0     1.0     1.0     1.0   
42819      1.0     1.0  ...  0.999997     1.0     1.0     1.0     1.0     1.0   
59311      1.0     1.0  ...  0.999997     1.0     1.0     1.0     1.0     1.0   
91368      1.0     1.0  ...  0.999997     1.0     1.0     1.0     1.0     1.0   
61000      1.0     1.0  ...  0.999997     1.0     1.0     1.0     1.0     1.0   

        42766   52377   27380   75356   
113186     1.0     1.0     

In [10]:
def recommend_songs(track_index, num_recommendations=5):
    """
    Recommends songs similar to the one identified by track_index.
    Args:
        track_index (int): Index of the track to find recommendations for.
        num_recommendations (int): Number of recommendations to return.
    Returns:
        pd.DataFrame: DataFrame containing the recommended songs.
    """
    # Check if the track_index exists in the cosine_sim_df
    if track_index not in cosine_sim_df.index:
        raise ValueError(f"Track index {track_index} not found in cosine similarity DataFrame.")

    # Get similarity scores for the given track
    sim_scores = list(enumerate(cosine_sim_df.loc[track_index]))
    
    # Sort songs based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the top N recommendations (excluding the track itself)
    sim_scores = sim_scores[1:num_recommendations+1]
    
    # Get track indices
    track_indices = [i[0] for i in sim_scores]
    
    # Return recommended tracks
    return df_sampled.iloc[track_indices]


In [11]:
print("Available indices:", cosine_sim_df.index)

# Ensure to use a valid track index
valid_track_index = cosine_sim_df.index[0]  # Get the first valid index
print(f"Using valid track index: {valid_track_index}")

# Get recommendations
recommended_songs = recommend_songs(valid_track_index)  # Recommend songs similar to the first valid track
print("Recommended Songs:")
print(recommended_songs)

Available indices: Index([113186,  42819,  59311,  91368,  61000,  96815,  18939,  72760,  25788,
        87169,
       ...
         2805,  84494,  51053,  47932,  81834,  18241,  42766,  52377,  27380,
        75356],
      dtype='int64', length=5000)
Using valid track index: 113186
Recommended Songs:
        Unnamed: 0  popularity  duration_ms  explicit  danceability  energy  \
111569      111569        0.52     0.053210     False      0.580711   0.738   
110082      110082        0.51     0.038437     False      0.569543   0.903   
106341      106341        0.41     0.057793     False      0.372589   0.469   
110582      110582        0.51     0.029637     False      0.743147   0.966   
111626      111626        0.19     0.048152     False      0.676142   0.512   

        key  loudness  mode  speechiness  acousticness  instrumentalness  \
111569    7  0.791540     1     0.036062      0.028213            0.0274   
110082    7  0.826129     0     0.047979      0.009207            0.0