In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('songdata.csv')
df.head(3)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...


In [3]:
df. shape

(57650, 4)

In [4]:
df = df. sample(n=5000).drop('link',axis=1).reset_index(drop=True)

In [5]:
df.isnull().sum()

artist    0
song      0
text      0
dtype: int64

In [6]:
df ['song'][0]

'Rise Again'

In [7]:
df ['text'][0]

"(instrumental intro)  \nGoing to jamaica,  \nWarm and coral waters flow  \nThere she goes  \nSun kissed her body, in this magic feel love grow,  \nLove grows  \nFeel like letting go  \nHey mr. Saturday night  \n? ? right out the hole  \nIt stops in a minute,  \nIt's right down in it  \nHey, we're really back on a roll  \nI don't know why, I don't know why  \nFeel so good I could die, I could die  \nGreat escape, I'll be there, by the skin of my teeth, on a wish and a prayer  \nWith a head full of hammers, both feet in the grave, I'm gonna stand I'm gonna\nrise again  \nDream time (lavinia? ? ),  \nEven though the odds are stacked, she's coming back  \n(moon monkeys barking? ) in a bitter chocolate glow,  \nThere she goes, feel like letting go...  \nHey, mr. cool jazz nights, let the fever go,  \nIt stops in a minute, I'm down in it  \nHey, play it long and slow  \nI don't know why, I don't know why,  \nFeels so good I could die, I could die,  \nWith a great escape, I'll be there, by t

In [8]:
df['text'] = df['text'].str.lower().replace(r'[^\w\s]','').replace(r'\n','',regex=True)

In [9]:
df['text'][0]

"(instrumental intro)  going to jamaica,  warm and coral waters flow  there she goes  sun kissed her body, in this magic feel love grow,  love grows  feel like letting go  hey mr. saturday night  ? ? right out the hole  it stops in a minute,  it's right down in it  hey, we're really back on a roll  i don't know why, i don't know why  feel so good i could die, i could die  great escape, i'll be there, by the skin of my teeth, on a wish and a prayer  with a head full of hammers, both feet in the grave, i'm gonna stand i'm gonnarise again  dream time (lavinia? ? ),  even though the odds are stacked, she's coming back  (moon monkeys barking? ) in a bitter chocolate glow,  there she goes, feel like letting go...  hey, mr. cool jazz nights, let the fever go,  it stops in a minute, i'm down in it  hey, play it long and slow  i don't know why, i don't know why,  feels so good i could die, i could die,  with a great escape, i'll be there, by the skin of my teeth on a wish and aprayer  with a he

In [10]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [ps.stem(w) for w in tokens]
    return " ".join(stemming)

In [11]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [12]:
df['text']

0       ( instrument intro ) go to jamaica , warm and ...
1       lalalala now ew aw but it 's not the time to p...
2       heartbreak , feelin ' alright bodi over , mind...
3       now , everybodi tell you 're the dog 's best f...
4       i 'm not sure about tomorrow life ha few guara...
                              ...                        
4995    do you hear me , i 'm talk to you across the w...
4996    [ 50 cent ] yeah nigga ! ha ha let 's go nigga...
4997    [ choru ] storm never last , do they babi . ba...
4998    [ vers : ] broadway light and taxi cab everybo...
4999    ooh babi , ca n't you hear out favorit song oo...
Name: text, Length: 5000, dtype: object

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

In [14]:
tfidf = TfidfVectorizer(stop_words='english')
matrix = tfidf.fit_transform(df['text'])

In [15]:
matrix.shape

(5000, 18109)

In [16]:
similarity = cosine_similarity(matrix)

In [17]:
def recommendation(song):
    idx = df[df['song'] == song].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=False,key=lambda x:x[1])
    songs = []
    for i in distances[1:4996]:
        songs.append(df.iloc[i[0]].song)
    return songs

In [18]:
re = recommendation('Rise Again')

In [19]:
nbrs = NearestNeighbors(n_neighbors=4995, algorithm='auto').fit(matrix)


In [20]:
distances, indices = nbrs.kneighbors(matrix[0].toarray().reshape(1, -1))

In [21]:
def recommendationNearest(song):
    recommended_indices = indices[0][0:4995]
    songs = [df.iloc[i].song for i in recommended_indices]
    
    return songs



In [22]:
rn=recommendationNearest('Rise Again')


In [23]:
from sklearn.metrics import accuracy_score, precision_score
p = precision_score(re, rn, average='micro')
a=accuracy_score(re, rn)
print(p)
print(a)

0.0002002002002002002
0.0002002002002002002


In [24]:
def is_float(value):
    try:
        float(value)
        return True
    except ValueError:
        return False


In [25]:
filtered_re = [float(x) for x in re if is_float(x)]
filtered_rn = [float(x) for x in rn if is_float(x)]



In [26]:
min_length = min(len(filtered_re), len(filtered_rn))
filtered_re = filtered_re[:min_length]
filtered_rn = filtered_rn[:min_length]



In [27]:
re = np.array(filtered_re)
rn = np.array(filtered_rn)



In [28]:
squared_errors = (re - rn) ** 2


In [29]:
mse = np.mean(squared_errors)


In [30]:
rmse = np.sqrt(mse)
print(f"Squared Errors: {squared_errors}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")

Squared Errors: [0.]
MSE: 0.0
RMSE: 0.0


In [31]:
from sklearn.cluster import KMeans


In [32]:
num_clusters = 4996  # Choose an appropriate number of clusters
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(matrix)


  return fit_method(estimator, *args, **kwargs)


In [None]:
df['cluster'] = kmeans.labels_


In [125]:
def recommendationKMeans(song):
    # Find the cluster for the given song
    song_index = df[df['song'] == song].index[0]
    song_cluster = df.loc[song_index, 'cluster']
    
    # Get the indices of songs in the same cluster
    cluster_indices = df[df['cluster'] == song_cluster].index
    
    # Get the song names of the recommended songs
    recommended_songs = df.loc[cluster_indices, 'song'].tolist()
    
    return recommended_songs



In [126]:
rk=recommendationKMeans('A Letter To Syracuse')

In [127]:
print(len(re))

0


In [128]:
print(len(rk))

1


In [96]:
from sklearn.metrics import accuracy_score, precision_score
p = precision_score(re, rk, average='micro')
a=accuracy_score(re, rk)
print(p)
print(a)

ValueError: Found input variables with inconsistent numbers of samples: [0, 429]