In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the lyrics dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('spotify_millsongdata.csv')

In [3]:
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
df.describe()

Unnamed: 0,artist,song,link,text
count,57650,57650,57650,57650
unique,643,44824,57650,57494
top,Donna Summer,Have Yourself A Merry Little Christmas,/z/zwan/heartsong_20148991.html,I just came back from a lovely trip along the ...
freq,191,35,1,6


In [5]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [6]:
df.shape

(57650, 4)

In [7]:
# Remove the 'link' column (not referenced by any later cell shown here) and
# renumber the rows 0..n-1.
df = df.drop(columns=['link']).reset_index(drop=True)

In [8]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK corpora needed by the cleaning step. These calls run on
# every execution of this cell; when a package is already installed NLTK only
# reports it as up-to-date.
nltk.download('stopwords')
nltk.download('wordnet')

# Shared by clean_lyrics: the English stop-word list as a set (for fast
# membership tests) and a WordNet lemmatizer instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sayan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sayan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
# Function to clean text
def clean_lyrics(text):
    """Normalise one lyrics string into a space-separated sequence of lemmas.

    Anything that is not a ``str`` (for example the float NaN pandas uses for
    a missing cell) yields ``""``. Otherwise the text is lower-cased, every
    non-word character is replaced by a space, tokens found in the
    module-level ``stop_words`` are discarded, and each remaining token is
    passed through the module-level ``lemmatizer``.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # Punctuation and line breaks become spaces, then runs of whitespace
        # are collapsed so tokenising is a plain whitespace split.
        without_symbols = re.sub(r'\W', ' ', lowered)
        collapsed = re.sub(r'\s+', ' ', without_symbols)
        kept_tokens = (token for token in collapsed.split() if token not in stop_words)
        return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)

    return ""

In [10]:
# Apply clean_lyrics to every row of 'text' and store the result in a new
# 'cleaned_text' column on the same DataFrame.
df['cleaned_text'] = df['text'].apply(clean_lyrics)

In [11]:
df[['text', 'cleaned_text']].head()

Unnamed: 0,text,cleaned_text
0,"Look at her face, it's a wonderful face \r\nA...",look face wonderful face mean something specia...
1,"Take it easy with me, please \r\nTouch me gen...",take easy please touch gently like summer even...
2,I'll never know why I had to go \r\nWhy I had...,never know go put lousy rotten show boy tough ...
3,Making somebody happy is a question of give an...,making somebody happy question give take learn...
4,Making somebody happy is a question of give an...,making somebody happy question give take learn...


In [12]:
import os

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Spotify API credentials are read from the environment so that no secret is
# stored in the notebook. SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET are the
# variable names spotipy itself recognises.
# NOTE(review): a client id/secret pair was previously committed in this cell;
# treat it as leaked and rotate it in the Spotify developer dashboard.
CLIENT_ID = os.environ.get('SPOTIPY_CLIENT_ID')
CLIENT_SECRET = os.environ.get('SPOTIPY_CLIENT_SECRET')

if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        "Set the SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET environment "
        "variables before running this cell."
    )

# Client-credentials flow: app-level access, no user login required.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
    )
)

In [20]:
def get_artist_genre(df, client=None):
    """Add a ``genre`` column holding each artist's Spotify genre list.

    For every distinct value in ``df['artist']`` the Spotify API is searched
    for an artist of that name; the first hit's artist record is fetched and
    its ``genres`` list is used. Artists with no search hit, or whose record
    carries no genres, get an empty list, so every cell in the new column is
    a list.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``artist`` column. It is modified in place (the
        ``genre`` column is added or overwritten).
    client : optional
        Object exposing ``search(q=..., type=..., limit=...)`` and
        ``artist(artist_id)`` like ``spotipy.Spotify``. Defaults to the
        module-level ``sp`` client.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.

    Notes
    -----
    Two network requests are made per distinct artist that is found, and one
    per artist that is not, so this is slow on large frames.
    """
    if client is None:
        client = sp  # notebook-level Spotify client

    artist_genre_map = {}
    for artist in df['artist'].unique():
        # Only the first hit is used, so request a single result instead of a
        # full page.
        result = client.search(q=artist, type='artist', limit=1)
        items = result['artists']['items']
        if items:
            artist_info = client.artist(items[0]['id'])
            # .get() yields None when the key is absent; normalise so the
            # column never mixes None with lists.
            artist_genre_map[artist] = artist_info.get('genres') or []
        else:
            artist_genre_map[artist] = []

    df['genre'] = df['artist'].map(artist_genre_map)
    return df


In [21]:
# Look up genres through the Spotify API. get_artist_genre issues network
# requests for every distinct artist, so this cell is slow and needs valid
# credentials.
df = get_artist_genre(df)

df

Unnamed: 0,artist,song,text,cleaned_text,genre
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA...",look face wonderful face mean something specia...,[]
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen...",take easy please touch gently like summer even...,[]
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...,never know go put lousy rotten show boy tough ...,[]
3,ABBA,Bang,Making somebody happy is a question of give an...,making somebody happy question give take learn...,[]
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,making somebody happy question give take learn...,[]
...,...,...,...,...,...
57645,Ziggy Marley,Good Old Days,Irie days come on play \r\nLet the angels fly...,irie day come play let angel fly let devil die...,"[reggae, roots reggae]"
57646,Ziggy Marley,Hand To Mouth,Power to the workers \r\nMore power \r\nPowe...,power worker power power worker need power pow...,"[reggae, roots reggae]"
57647,Zwan,Come With Me,all you need \r\nis something i'll believe \...,need something believe flashlight hall call ca...,[]
57648,Zwan,Desire,northern star \r\nam i frightened \r\nwhere ...,northern star frightened go rest sleep still f...,[]


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Turn each cleaned lyric into a TF-IDF vector with a vocabulary capped at
# 10,000 terms.
# NOTE(review): 'cleaned_text' already had NLTK stop words removed by
# clean_lyrics; stop_words='english' additionally applies scikit-learn's own
# English list — confirm the double filtering is intended.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])

# Brute-force nearest-neighbour search over the TF-IDF rows using cosine
# distance. n_neighbors=6 is only the default; the recommendation functions
# pass their own n_neighbors on each query.
model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=6)
model.fit(tfidf_matrix)

In [16]:
def recommend_songs(song_name, n=5):
    """Return up to ``n`` songs whose lyrics are closest to ``song_name``.

    The title is matched case-insensitively against ``df['song']``; when
    several rows share the title, the first one is used as the query.

    Parameters
    ----------
    song_name : str
        Title of the song to find neighbours for.
    n : int, default 5
        Number of recommendations wanted. Fewer are returned when the dataset
        does not contain enough other songs.

    Returns
    -------
    pandas.DataFrame or str
        The ``artist`` and ``song`` columns of the recommended rows, nearest
        first, or the string ``"Song not found in the dataset."`` when no
        title matches.
    """
    # Work with positional row numbers: rows of ``tfidf_matrix`` and ``.iloc``
    # are positional, so index labels must not be used to address them.
    title_matches = (df['song'].str.lower() == song_name.lower()).to_numpy()
    positions = np.flatnonzero(title_matches)

    if len(positions) == 0:
        return "Song not found in the dataset."

    query_pos = positions[0]

    # One extra neighbour leaves room for the query song itself, but never
    # ask for more neighbours than there are rows.
    k = min(n + 1, tfidf_matrix.shape[0])
    _, indices = model.kneighbors(tfidf_matrix[query_pos], n_neighbors=k)

    # Drop the query by position rather than assuming it is the first hit:
    # songs with identical lyrics tie at distance 0 and may be ranked ahead
    # of it.
    neighbour_pos = indices[0]
    song_indices = neighbour_pos[neighbour_pos != query_pos][:n]

    return df[['artist', 'song']].iloc[song_indices]

In [17]:
recommend_songs("As Good As New")

Unnamed: 0,artist,song
43654,Michael Buble,Feeling Good
18919,Stevie Wonder,I'm New
45850,Nina Simone,Feeling Good
50387,Reba Mcentire,A New Love
56291,Whitney Houston,You Give Good Love


In [18]:
def check_recommendation_score(song_name, n=5):
    """Recommend songs for ``song_name`` and report their cosine similarity.

    The title is matched case-insensitively against ``df['song']``; when
    several rows share the title, the first one is used as the query.

    Parameters
    ----------
    song_name : str
        Title of the song to find neighbours for.
    n : int, default 5
        Number of recommendations wanted. Fewer are returned when the dataset
        does not contain enough other songs.

    Returns
    -------
    tuple or str
        ``(recommended_songs, summary)`` where ``recommended_songs`` is a
        DataFrame with ``artist``, ``song`` and ``similarity_score`` columns
        (nearest first) and ``summary`` is a string of the form
        ``"Average Similarity Score: 0.1234"``. When no title matches, the
        string ``"Song not found in the dataset."`` is returned instead.
    """
    # Work with positional row numbers: rows of ``tfidf_matrix`` and ``.iloc``
    # are positional, so index labels must not be used to address them.
    title_matches = (df['song'].str.lower() == song_name.lower()).to_numpy()
    positions = np.flatnonzero(title_matches)

    if len(positions) == 0:
        return "Song not found in the dataset."

    query_pos = positions[0]

    # One extra neighbour leaves room for the query song itself, but never
    # ask for more neighbours than there are rows.
    k = min(n + 1, tfidf_matrix.shape[0])
    distances, indices = model.kneighbors(tfidf_matrix[query_pos], n_neighbors=k)

    # Drop the query by position rather than assuming it is the first hit:
    # songs with identical lyrics tie at distance 0 and may be ranked ahead
    # of it.
    not_query = indices[0] != query_pos
    neighbour_pos = indices[0][not_query][:n]

    # The model uses cosine distance, so similarity = 1 - distance.
    sim_scores = 1 - distances[0][not_query][:n]

    # Copy so that adding a column does not write into a slice of ``df``.
    recommended_songs = df.iloc[neighbour_pos][['artist', 'song']].copy()
    recommended_songs['similarity_score'] = sim_scores

    avg_score = np.mean(sim_scores)

    return recommended_songs, f"Average Similarity Score: {avg_score:.4f}"

# Example: Check similarity scores for recommendations
check_recommendation_score("As Good As New")

(                artist                song  similarity_score
 43654    Michael Buble        Feeling Good          0.488091
 18919    Stevie Wonder             I'm New          0.466993
 45850      Nina Simone        Feeling Good          0.432909
 50387    Reba Mcentire          A New Love          0.432399
 56291  Whitney Houston  You Give Good Love          0.410605,
 'Average Similarity Score: 0.4462')