In [9]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from pprint import pprint  # Pretty-print for cleaner output

# --- Load & Merge Datasets ---
print("📂 Loading data... Please wait.")
spotify_df = pd.read_csv("spotify_millsongdata.csv")
music_info_df = pd.read_csv("Music Info.csv")

print("🔗 Combining datasets...")
merged_df = pd.merge(
    spotify_df, music_info_df,
    left_on=["artist", "song"],
    right_on=["artist", "name"],
    how="inner"
)

# Drop unnecessary columns
columns_to_remove = ["name", "link", "track_id", "spotify_id", "spotify_preview_url"]
merged_df.drop(columns=columns_to_remove, inplace=True)

# --- Handle Missing Data ---
print("🔄 Cleaning missing values...")
merged_df["genre"] = merged_df["genre"].fillna("Unknown")
merged_df["tags"] = merged_df["tags"].fillna("").apply(lambda x: x.split(", "))

# --- Normalize Numerical Features ---
numerical_cols = [
    "danceability", "energy", "loudness", "speechiness", "acousticness",
    "instrumentalness", "liveness", "valence", "tempo", "duration_ms"
]

print("📊 Scaling numerical features...")
scaler = MinMaxScaler()
merged_df[numerical_cols] = scaler.fit_transform(merged_df[numerical_cols])

# --- Lyrics-Based Recommendation ---
print("🔠 Processing song lyrics for recommendations...")
tfidf = TfidfVectorizer(stop_words="english")
lyrics_matrix = tfidf.fit_transform(merged_df["text"])
lyrics_similarity = cosine_similarity(lyrics_matrix, lyrics_matrix)

def recommend_songs_by_lyrics(song_title, num_recommendations=5):
    """Finds similar songs based on lyrics."""
    if song_title not in merged_df["song"].values:
        return f"⚠️ Sorry, '{song_title}' is not in the database."
    
    idx = merged_df[merged_df["song"] == song_title].index[0]
    similarity_scores = sorted(
        list(enumerate(lyrics_similarity[idx])),
        key=lambda x: x[1],
        reverse=True
    )[1:num_recommendations+1]
    
    return [merged_df.iloc[i[0]][["artist", "song"]].to_dict() for i in similarity_scores]

# --- Feature-Based Recommendation ---
print("🎵 Analyzing song features for recommendations...")
song_features = merged_df[numerical_cols].values
knn = NearestNeighbors(n_neighbors=6, metric="cosine")
knn.fit(song_features)

def recommend_songs_by_features(song_title, num_recommendations=5):
    """Finds similar songs based on musical features."""
    if song_title not in merged_df["song"].values:
        return f"⚠️ '{song_title}' was not found."
    
    idx = merged_df[merged_df["song"] == song_title].index[0]
    _, indices = knn.kneighbors([song_features[idx]])

    return [merged_df.iloc[i][["artist", "song"]].to_dict() for i in indices[0][1:num_recommendations+1]]

# --- Genre-Based Recommendation ---
print("🎼 Grouping songs by genre clusters...")
genre_mapping = {genre: idx for idx, genre in enumerate(merged_df["genre"].unique())}
merged_df["genre_code"] = merged_df["genre"].map(genre_mapping)

kmeans = KMeans(n_clusters=10, random_state=42, n_init=10)
merged_df["cluster"] = kmeans.fit_predict(song_features)

def recommend_songs_by_genre(song_title, num_recommendations=5):
    """Suggests songs from the same genre cluster."""
    if song_title not in merged_df["song"].values:
        return f"⚠️ '{song_title}' is not in the database."
    
    song_cluster = merged_df.loc[merged_df["song"] == song_title, "cluster"].values[0]
    similar_songs = merged_df[merged_df["cluster"] == song_cluster].sample(n=num_recommendations, random_state=42)
    
    return similar_songs[["artist", "song"]].to_dict(orient="records")

# --- Sample Songs to Test ---
sample_songs = [
    "Dancing Queen", "Bohemian Rhapsody", "Hotel California", "Stairway to Heaven",
    "Shake It Off", "Someone Like You", "Blinding Lights", "Shape of You",
    "Lose Yourself", "Sicko Mode", "Stronger", "Titanium", "Animals", "Radioactive"
]

# --- Generate Recommendations for Each Song ---
for song in sample_songs:
    print(f"\n🎶 === Recommendations for: {song} ===\n")
    
    # Lyrics-Based Recommendations
    lyrics_recs = recommend_songs_by_lyrics(song, 5)
    print("📖 Based on Lyrics:")
    pprint(lyrics_recs)
    
    # Feature-Based Recommendations
    feature_recs = recommend_songs_by_features(song, 5)
    print("\n🎧 Based on Musical Features:")
    pprint(feature_recs)
    
    # Genre-Based Recommendations
    genre_recs = recommend_songs_by_genre(song, 5)
    print("\n🎼 Based on Genre Similarity:")
    pprint(genre_recs)
    
    print("=" * 60)


📂 Loading data... Please wait.
🔗 Combining datasets...
🔄 Cleaning missing values...
📊 Scaling numerical features...
🔠 Processing song lyrics for recommendations...
🎵 Analyzing song features for recommendations...
🎼 Grouping songs by genre clusters...

🎶 === Recommendations for: Dancing Queen ===

📖 Based on Lyrics:
[{'artist': 'Leo Sayer', 'song': 'You Make Me Feel Like Dancing'},
 {'artist': 'Black Sabbath', 'song': 'Air Dance'},
 {'artist': 'Xandria', 'song': 'Dancer'},
 {'artist': 'High School Musical', 'song': 'Can I Have This Dance'},
 {'artist': 'Justin Bieber', 'song': 'First Dance'}]

🎧 Based on Musical Features:
[{'artist': 'Jimi Hendrix', 'song': 'Long Hot Summer Night'},
 {'artist': 'Dave Matthews Band', 'song': 'Rapunzel'},
 {'artist': 'Steely Dan', 'song': 'Kid Charlemagne'},
 {'artist': 'Supertramp', 'song': 'The Logical Song'},
 {'artist': 'Violent Femmes', 'song': 'Country Death Song'}]

🎼 Based on Genre Similarity:
[{'artist': 'Dave Matthews Band', 'song': 'Lover Lay D