In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
df = pd.read_csv('high_popularity_spotify_data.csv')

# Show basic info
print("Columns:", df.columns)
print("Sample data:\n", df.head())

# Keep only the first row for each track title (optional).
# Note: distinct songs that share a title are collapsed into one row.
df = df.drop_duplicates(subset='track_name')

# Numerical audio features that define the similarity space
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
            'instrumentalness', 'liveness', 'valence', 'tempo']

# Drop rows with missing values in the selected features, then renumber the
# rows 0..N-1. The filtering above leaves gaps in the index; without the reset
# an index label no longer equals the row position used by the similarity
# matrix, so label-based lookups would hit the wrong row (or go out of bounds).
df = df.dropna(subset=features).reset_index(drop=True)

# Standardize every feature so no single one dominates the cosine similarity
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[features])

# Pairwise cosine similarity; row/column i corresponds to row i of df
similarity_matrix = cosine_similarity(scaled_features)

# Recommendation function
def recommend(song_name, df, similarity_matrix, n=5):
    """Return the ``n`` tracks most similar to ``song_name``.

    Parameters
    ----------
    song_name : str
        Exact (case-sensitive) value to look up in ``df['track_name']``. If
        several rows match, the first one is used.
    df : pandas.DataFrame
        Track table with a ``track_name`` column and an artist column
        (``track_artist``, or ``artist_name`` as a fallback). Row *i* of
        ``df`` must correspond to row/column *i* of ``similarity_matrix``;
        the index labels themselves may be arbitrary.
    similarity_matrix : 2-D array-like
        Pairwise similarity scores, indexed by row position.
    n : int, default 5
        Maximum number of recommendations; values below 1 give an empty frame.

    Returns
    -------
    pandas.DataFrame or str
        A frame with the track-name and artist columns of the most similar
        tracks, best match first, with a fresh 0..k-1 index. If ``song_name``
        is absent, a "not found" message string is returned instead.
    """
    # Boolean mask of matching rows; missing titles count as "no match".
    is_match = (df['track_name'] == song_name).to_numpy(dtype=bool, na_value=False)
    if not is_match.any():
        return f"'{song_name}' not found in dataset."

    # Use the row *position* of the first match. The index label is wrong
    # here: after filtering the frame, labels and positions diverge, while
    # the similarity matrix and ``iloc`` are both positional.
    pos = int(is_match.argmax())
    row = similarity_matrix[pos]

    # Stable descending sort, then drop the query track explicitly instead of
    # assuming it is ranked first (ties at similarity 1.0 can precede it).
    ranked = sorted(range(len(row)), key=lambda j: row[j], reverse=True)
    top = [j for j in ranked if j != pos][:max(n, 0)]

    # The dataset names the artist column 'track_artist'; keep 'artist_name'
    # as a fallback for frames that use the older name.
    artist_col = 'track_artist' if 'track_artist' in df.columns else 'artist_name'
    recommendations = df.iloc[top][['track_name', artist_col]]
    return recommendations.reset_index(drop=True)

# 🔍 Example: recommend similar songs
song_to_search = "Blinding Lights"  # Change this to any song in your dataset
print(f"\n🎧 Recommendations for: {song_to_search}")
# recommend() returns either a DataFrame of matches or a "not found" message
# string; print() handles both.
print(recommend(song_to_search, df, similarity_matrix))


Columns: Index(['energy', 'tempo', 'danceability', 'playlist_genre', 'loudness',
       'liveness', 'valence', 'track_artist', 'time_signature', 'speechiness',
       'track_popularity', 'track_href', 'uri', 'track_album_name',
       'playlist_name', 'analysis_url', 'track_id', 'track_name',
       'track_album_release_date', 'instrumentalness', 'track_album_id',
       'mode', 'key', 'duration_ms', 'acousticness', 'id', 'playlist_subgenre',
       'type', 'playlist_id'],
      dtype='object')
Sample data:
    energy    tempo  danceability playlist_genre  loudness  liveness  valence  \
0   0.592  157.969         0.521            pop    -7.777     0.122    0.535   
1   0.507  104.978         0.747            pop   -10.171     0.117    0.438   
2   0.808  108.548         0.554            pop    -4.169     0.159    0.372   
3   0.910  112.966         0.670            pop    -4.070     0.304    0.786   
4   0.783  149.027         0.777            pop    -4.477     0.355    0.939   

     

In [None]:
# Show ten randomly drawn track titles (with their index labels) to pick from.
print(df.loc[:, 'track_name'].sample(n=10))


192                                   Love Sosa
1256                       A Horse with No Name
1402                                      Santa
104                                      Africa
55                       Dreams - 2004 Remaster
1371                                 Una Locura
1659                                 Love Story
843     Dance The Night - From Barbie The Album
1581                            Russian Bandana
544                                       Bones
Name: track_name, dtype: object


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
df = pd.read_csv('high_popularity_spotify_data.csv')

# Display sample song names (drawn before de-duplication and cleaning)
print("🎵 Sample songs to try:")
print(df['track_name'].dropna().sample(10).values)

# Keep only the first row for each track title.
# Note: distinct songs that share a title are collapsed into one row.
df = df.drop_duplicates(subset='track_name')

# Numerical audio features that define the similarity space
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
            'instrumentalness', 'liveness', 'valence', 'tempo']

# Drop rows with missing feature values, then renumber the rows 0..N-1. The
# filtering above leaves gaps in the index; without the reset an index label
# no longer equals the row position used by the similarity matrix, so
# label-based lookups would hit the wrong row (or go out of bounds).
df = df.dropna(subset=features).reset_index(drop=True)

# Standardize every feature so no single one dominates the cosine similarity
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[features])

# Pairwise cosine similarity; row/column i corresponds to row i of df
similarity_matrix = cosine_similarity(scaled_features)

# Recommendation function
def recommend(song_name, df, similarity_matrix, n=5):
    """Return the ``n`` tracks most similar to ``song_name``.

    Parameters
    ----------
    song_name : str
        Exact (case-sensitive) value to look up in ``df['track_name']``. If
        several rows match, the first one is used.
    df : pandas.DataFrame
        Track table with ``track_name`` and ``track_artist`` columns. Row *i*
        of ``df`` must correspond to row/column *i* of ``similarity_matrix``;
        the index labels themselves may be arbitrary.
    similarity_matrix : 2-D array-like
        Pairwise similarity scores, indexed by row position.
    n : int, default 5
        Maximum number of recommendations; values below 1 give an empty frame.

    Returns
    -------
    pandas.DataFrame or str
        A frame with ``track_name`` and ``track_artist`` of the most similar
        tracks, best match first, with a fresh 0..k-1 index. If ``song_name``
        is absent, a warning message string is returned instead.
    """
    # Boolean mask of matching rows; missing titles count as "no match".
    is_match = (df['track_name'] == song_name).to_numpy(dtype=bool, na_value=False)
    if not is_match.any():
        return f"⚠️ '{song_name}' not found in dataset. Please try another song."

    # Use the row *position* of the first match. The index label is wrong
    # here: after filtering the frame, labels and positions diverge, while
    # the similarity matrix and ``iloc`` are both positional.
    pos = int(is_match.argmax())
    row = similarity_matrix[pos]

    # Stable descending sort, then drop the query track explicitly instead of
    # assuming it is ranked first (ties at similarity 1.0 can precede it).
    ranked = sorted(range(len(row)), key=lambda j: row[j], reverse=True)
    top = [j for j in ranked if j != pos][:max(n, 0)]

    recommendations = df.iloc[top][['track_name', 'track_artist']]
    return recommendations.reset_index(drop=True)

# 🎯 Try it with a real song title that exists in the dataset
song_to_search = "Santa"  # replace this with an actual song name from your data
print(f"\n🎧 Recommendations for: {song_to_search}")
# recommend() returns either a DataFrame of matches or a warning message
# string; print() handles both.
print(recommend(song_to_search, df, similarity_matrix))


🎵 Sample songs to try:
['Who' 'Just Like Heaven' 'Brown Munde' 'Agar Tum Saath Ho' 'Mientes'
 'Waves - Robin Schulz Radio Edit' '2 hands'
 'Trance (with Travis Scott & Young Thug)' 'Light Year (Practice)'
 'In My Room']

🎧 Recommendations for: Santa
                                       track_name              track_artist
0                              Paint The Town Red                  Doja Cat
1                                  DEVIL IS A LIE             Tommy Richman
2                             Insane in the Brain              Cypress Hill
3                                 Buffalo Soldier  Bob Marley & The Wailers
4  Still Not a Player (feat. Joe) - Radio Version              Big Pun, Joe
