In [1]:
import pandas as pd

# One CSV per source file; the file names suggest each is a mood/genre
# playlist export (presumably all share the same columns — TODO confirm).
playlist_files = [
    'high-energy_electronic.csv',
    'chill_indie.csv',
    'slow_sad_acoustic.csv',
    'danceable_pop_vibes.csv',
]

# Read the files in the listed order, keeping the per-file frames available.
df1, df2, df3, df4 = (pd.read_csv(csv_path) for csv_path in playlist_files)

# ignore_index=True renumbers the stacked rows 0..n-1, so index labels and
# row positions coincide in the combined frame.
df = pd.concat([df1, df2, df3, df4], ignore_index=True)

In [2]:
# Report the row count before and after dropping repeated (track, artist) pairs.
rows_before = len(df)
print(f"Original dataset size: {rows_before}")

# A track counts as a duplicate when both its name and its artists string
# repeat; the first occurrence is the one that is kept.
dedupe_keys = ['track_name', 'artists']
df_clean = df.drop_duplicates(subset=dedupe_keys, keep='first')

rows_after = len(df_clean)
print(f"After removing duplicates: {rows_after}")

Original dataset size: 6468
After removing duplicates: 5519


In [3]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Columns that make up each track's feature vector for the recommender.
recommendation_features = [
    'valence',
    'energy',
    'danceability',
    'tempo',
    'acousticness',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'duration_ms',
    'key',
    'mode',
    'time_signature',
]

# NOTE(review): the features are taken from `df` (all rows), not from the
# de-duplicated `df_clean`, so repeated tracks are still present in the
# matrix below. Row k of the matrix corresponds to row k of `df`; switching
# frames here would also require passing the same frame to recommend_songs.
reco_data = df[recommendation_features]

# Standardise every column so that large-valued features (e.g. duration_ms,
# tempo) do not dominate the cosine similarity.
feature_scaler = StandardScaler()
reco_scaled = feature_scaler.fit_transform(reco_data)

# Pairwise cosine similarity between all rows of the scaled feature matrix.
similarity_matrix = cosine_similarity(reco_scaled)

def normalize(text):
    """Return ``text`` lowercased with all whitespace runs collapsed.

    Leading and trailing whitespace is removed and every internal run of
    whitespace becomes a single space, so differently spaced or cased
    spellings of the same title map to the same string.
    """
    # split() with no arguments already discards leading/trailing whitespace.
    words = text.lower().split()
    return ' '.join(words)

def recommend_songs(track_name, artist, df, similarity_matrix, top_n=5, min_similarity=0.1):
    """Print up to ``top_n`` tracks most similar to the requested song.

    The seed song is the first row of ``df`` whose ``track_name`` equals
    ``track_name`` (case-insensitive) and whose ``artists`` value contains
    ``artist`` as a literal, case-insensitive substring.

    Parameters
    ----------
    track_name : str
        Title of the seed song.
    artist : str
        Artist name, or a fragment of it. Matched literally, not as a regular
        expression, so names such as "Ke$ha" work.
    df : pandas.DataFrame
        Catalogue with ``track_name``, ``artists`` and ``playlist_name``
        columns. Its index labels are used to address ``similarity_matrix``
        and vice versa, so the labels must equal the matrix row positions
        (a default 0..n-1 index satisfies this).
    similarity_matrix : 2-D array-like
        ``similarity_matrix[i][j]`` is the similarity between rows i and j.
    top_n : int, default 5
        Maximum number of recommendations to print; values <= 0 print none.
    min_similarity : float, default 0.1
        Candidates scoring below this value are ignored.

    Returns
    -------
    None
        Results are printed. "Song not found." is printed when no row matches.
    """
    title_hit = df['track_name'].str.lower() == track_name.lower()
    # regex=False: artist names may contain regex metacharacters ($, (, +, ...).
    # na=False: rows with a missing artist simply do not match.
    artist_hit = df['artists'].str.lower().str.contains(
        artist.lower(), regex=False, na=False
    )
    matches = df.index[(title_hit & artist_hit).to_numpy()]

    if len(matches) == 0:
        print("Song not found.")
        return

    idx = matches[0]
    seed = df.loc[idx]

    # Candidates ordered from most to least similar; sorted() is stable, so
    # equal scores keep their original row order.
    ranked = sorted(
        enumerate(similarity_matrix[idx]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    print(f"\nBecause you liked: {seed['track_name']} by {seed['artists']}")
    print("\nRecommended Songs:\n")

    # Pre-seed with the liked title so other copies of the same track (for
    # example the same song appearing in several playlists) are not offered.
    seen = {normalize(seed['track_name'])}
    count = 0
    for i, score in ranked:
        # Checked before printing so that top_n <= 0 yields no recommendations.
        if count >= top_n:
            break

        # Skip the seed by identity instead of assuming it sorts first: rows
        # with identical features tie with it at the maximum score.
        if i == idx:
            continue

        if score < min_similarity:  # Filtering out very low similarity scores
            continue

        song = df.loc[i]
        key = normalize(song['track_name'])

        if key in seen:
            continue

        seen.add(key)

        print(f"→ {song['track_name']} — {song['artists']} ({song['playlist_name']})")
        count += 1

# Example query against the combined catalogue and its similarity matrix.
recommend_songs(
    track_name="cheap thrills",
    artist="sia",
    df=df,
    similarity_matrix=similarity_matrix,
    top_n=5,
)



Because you liked: Cheap Thrills by Sia

Recommended Songs:

→ I Took A Pill In Ibiza - Seeb Remix — Mike Posner;Seeb (Danceable Pop Vibes)
→ Phulkari — Karan Randhawa (Danceable Pop Vibes)
→ Levitating (feat. DaBaby) — Dua Lipa;DaBaby (Danceable Pop Vibes)
→ Rhinestone Eyes — Gorillaz (Danceable Pop Vibes)
→ Dynamite — BTS (Danceable Pop Vibes)
