In [19]:
import numpy as np
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [20]:
df = pd.read_csv(r'C:\Users\Hp\OneDrive\Desktop\MRS\Song_Recommendation\Datasets\spotify_millsongdata.csv')

In [21]:
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [22]:
df.drop('link', inplace=True, axis=1)

In [23]:
df.head()

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [24]:
df.shape

(57650, 3)

In [25]:
df = df.sample(8000)

In [26]:
df.isnull().sum()

artist    0
song      0
text      0
dtype: int64

In [27]:
df.loc[:, 'text'] = df['text'].apply(lambda x: x.lower().replace('\r','').replace('\n',''))

In [28]:
# Initialize PorterStemmer
porter = PorterStemmer()

# Get the set of English stopwords
stop_words = set(stopwords.words('english'))

# Define a function for preprocessing a sentence
def preprocess_sentence(sentence):
    # Tokenize the input sentence into words
    words = word_tokenize(sentence)
    
    # Stem each word using PorterStemmer and filter out stopwords
    stemmed_words = [porter.stem(word) for word in words if word.lower() not in stop_words]
    
    # Join the stemmed words back into a single string
    return ' '.join(stemmed_words)

In [29]:
df.loc[:, 'text'] = df['text'].apply(preprocess_sentence)

In [30]:
df.loc[:, 'text'] = df['text'].apply(lambda x: x.replace("'", ''))

In [31]:
df.reset_index(drop=True, inplace=True)

In [32]:
df.head()

Unnamed: 0,artist,song,text
0,Megadeth,Never Dead,killer lie wait innoc blood swallow live harml...
1,Moody Blues,House Of Four Doors (Part 2),walk thru door outsid came nowher perhap answe...
2,Fall Out Boy,Nobody Puts Baby In The Corner,drink gin kerosen come spit bridg keep us warm...
3,Noa,All Is Well,"anoth day , anoth small town summer breez boug..."
4,Carly Simon,My Bonnie,"bonni lie ocean , bonni lie sea , bonni lie oc..."


In [33]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])

In [34]:
similarity_matrix = cosine_similarity(tfidf_matrix)

In [35]:
def recommendation(song_df):
    # Check if the song exists in the DataFrame
    if song_df not in df['song'].values:
        print(f"The song '{song_df}' does not exist in the DataFrame.")
        return []

    # Find the index of the song in the DataFrame
    idx = df[df['song'] == song_df].index[0]
    
    # Sort the similarity scores in descending order
    distances = sorted(list(enumerate(similarity_matrix[idx])), reverse=True, key=lambda x: x[1])
    
    # Get the indices of the top 5 most similar songs
    recommended_indices = [m_id[0] for m_id in distances[1:6]]
    
    # Get the titles of the recommended songs
    recommended_songs = df.iloc[recommended_indices]['song'].tolist()
    
    return recommended_songs


In [38]:
recommended_songs = recommendation("My Bonnie")
print("Recommended songs:")
for song in recommended_songs:
    print(song)

Recommended songs:
Truck
Oceans Of Fantasy
Never Been Gone
Wind Up
Wicked Games


In [39]:
pickle.dump(similarity_matrix,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))