In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download the 'punkt' resource
nltk.download('punkt')


In [23]:
df = pd.read_csv('songdata.csv')
df = df.drop('link',axis = 1)
df = df.sample(n = 7500).reset_index(drop = True)
df['text'] = df['text'].str.lower()
df['text'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]' and '\n' , ' ', x))
# df['text'][0]

In [None]:
ps = PorterStemmer()

def tokenization(txt):
  tokens = nltk.word_tokenize(txt)
  stemmed_tokens = [ps.stem(token) for token in tokens]
  return ' '.join(stemmed_tokens)

df['text'] = df['text'].apply(lambda x: tokenization(x))


In [26]:
df['song'][100]

'Take Your Whiskey Home'

In [7]:
tfidf = TfidfVectorizer(stop_words = "english")
vectors = tfidf.fit_transform(df['text']).toarray()
similarity = cosine_similarity(vectors)
similarity

array([[1.        , 0.00353273, 0.00435012, ..., 0.00112588, 0.05172253,
        0.00919988],
       [0.00353273, 1.        , 0.08279073, ..., 0.        , 0.00926981,
        0.03336271],
       [0.00435012, 0.08279073, 1.        , ..., 0.        , 0.01611317,
        0.00502526],
       ...,
       [0.00112588, 0.        , 0.        , ..., 1.        , 0.        ,
        0.00889851],
       [0.05172253, 0.00926981, 0.01611317, ..., 0.        , 1.        ,
        0.02264259],
       [0.00919988, 0.03336271, 0.00502526, ..., 0.00889851, 0.02264259,
        1.        ]])

In [20]:
def recommendation(songName):
  idx = df[df['song'] == songName].index[0]
  distances = sorted(list(enumerate(similarity[idx])),key = lambda x:x[0])
  recommendations = []
  for i in distances[1:26]:
    recommendations.append(df.iloc[i[0]].song)
  return recommendations

In [27]:
recommendation('Take Your Whiskey Home')


['Death Is A Star',
 'Song For Dad',
 'Feeling Stronger Everyday',
 'Brain Tap Shuffle',
 'Midnight',
 'Mystic Journey',
 "Livin' In The Future",
 'Long Line Of Cars',
 'A Thousand Lies',
 'Between The Lines',
 'Oh How That German Could Love!',
 'Put Your Hands Up',
 'Spokane Motel Blues',
 'Flame',
 'Valhalla',
 'Kissing You Goodbye',
 'Moonlight Serenade',
 'No Place Like Home',
 'Waving Not Drowning',
 'Canadian Railroad Trilogy',
 'Maidstone',
 'Anything',
 'Faultline',
 'Just You And Me',
 'Got To Me']

In [29]:
import pickle as pickel
pickel.dump(df,open('df.pkl','wb'))
pickel.dump(similarity,open('similarity.pkl','wb'))