In [1]:
!pip install nltk
!pip install pandarallel

from IPython.display import clear_output
clear_output(wait=True)
print("Installed nltk, pandarallel")

Installed nltk, pandarallel


In [2]:
import nltk
nltk.download('popular', halt_on_error=False)

clear_output(wait=True)
print("Downloaded popular nltk collections")

Downloaded popular nltk collections


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

import re
from re import sub

In [4]:
from pandarallel import pandarallel
import multiprocessing
num_processors = multiprocessing.cpu_count()
workers = num_processors-1
print(f'Available CPUs: {num_processors}')

Available CPUs: 16


In [5]:
from google.cloud import storage
storage_client = storage.Client()
bucket = storage_client.get_bucket('12372233_nlp_bucket')
blob = bucket.blob('df_nlp2.csv')
path = "gs://12372233_nlp_bucket/data_mining_lyrics/df_nlp2.csv"

In [6]:
lyrics = pd.read_csv(path)

In [7]:
lyrics.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,track_name,artists,lyrics,language,lyrics_state,Song_id,url,release_date,track_id
0,0,0,2,To Begin Again,Ingrid Michaelson;ZAYN,TranslationsNorsk (bokmål / riksmål)EspañolBos...,en,complete,6597928.0,https://genius.com/Ingrid-michaelson-and-zayn-...,"March 17, 2021",1iJBSr7s7jYXzM8EGcbK5b
1,1,1,3,Can't Help Falling In Love,Kina Grannis,Can’t Help Falling in Love Lyrics[Verse 1]\nWi...,en,complete,3890906.0,https://genius.com/Kina-grannis-cant-help-fall...,"January 13, 2017",6lfxq3CG4xtTiEg7opyCyx


In [8]:
lyrics.shape

(35015, 12)

In [9]:
pandarallel.initialize(nb_workers=num_processors-1, use_memory_fs=False)

INFO: Pandarallel will run on 15 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [10]:
%%time 
lyrics['clean_lyrics'] = lyrics['lyrics'].parallel_apply(lambda x: sub(r'[\|\t\n\r]','',x))

CPU times: user 1.84 s, sys: 2.65 s, total: 4.49 s
Wall time: 5.11 s


In [11]:
%%time 
lyrics['clean_lyrics'] = lyrics['clean_lyrics'].parallel_apply(lambda x: sub('[\S ]+ Lyrics','',x, count = 1))

CPU times: user 3.02 s, sys: 3.42 s, total: 6.44 s
Wall time: 6.39 s


In [12]:
%%time 
lyrics['clean_lyrics'] = lyrics['clean_lyrics'].parallel_apply(lambda x: sub('[\d]*Embed','',x, count = 1))

CPU times: user 3.4 s, sys: 4.45 s, total: 7.85 s
Wall time: 8.83 s


In [13]:
%%time 
lyrics['clean_lyrics'] = lyrics['clean_lyrics'].parallel_apply(lambda x: sub('\[Pre\-Chorus[\w:& ,]*\]','',x))

CPU times: user 2.99 s, sys: 4.6 s, total: 7.59 s
Wall time: 7.31 s


In [14]:
%%time 
lyrics['clean_lyrics'] = lyrics['clean_lyrics'].parallel_apply(lambda x: sub('\[Chorus[\w:& ,]*\]','',x))

CPU times: user 2.85 s, sys: 4.57 s, total: 7.42 s
Wall time: 7 s


In [15]:
%%time 
lyrics['clean_lyrics'] = lyrics['clean_lyrics'].parallel_apply(lambda x: sub('\[Verse[\w:& ,]*\]','',x))

CPU times: user 2.99 s, sys: 4.58 s, total: 7.57 s
Wall time: 7.22 s


In [16]:
%%time 
lyrics['clean_lyrics'] = lyrics['clean_lyrics'].parallel_apply(lambda x: sub('\[Bridge[\w:& ,]*\]','',x))

CPU times: user 2.97 s, sys: 4.49 s, total: 7.45 s
Wall time: 7.04 s


In [17]:
%%time 
lyrics['clean_lyrics'] = lyrics['clean_lyrics'].parallel_apply(lambda x: sub('\[Outro[\w:& ,]*\]','',x))

CPU times: user 2.86 s, sys: 4.62 s, total: 7.48 s
Wall time: 7.15 s


In [18]:
%%time 
lyrics['clean_lyrics'] = lyrics['clean_lyrics'].parallel_apply(lambda x: sub('\[Intro[\w:& ,]*\]','',x))

CPU times: user 2.91 s, sys: 4.6 s, total: 7.51 s
Wall time: 6.99 s


In [19]:
%%time 
lyrics['clean_lyrics'] = lyrics['clean_lyrics'].parallel_apply(lambda x: sub(r'\s+',' ',x))

CPU times: user 3.32 s, sys: 4.59 s, total: 7.91 s
Wall time: 14.9 s


In [20]:
lyrics.head(1)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,track_name,artists,lyrics,language,lyrics_state,Song_id,url,release_date,track_id,clean_lyrics
0,0,0,2,To Begin Again,Ingrid Michaelson;ZAYN,TranslationsNorsk (bokmål / riksmål)EspañolBos...,en,complete,6597928.0,https://genius.com/Ingrid-michaelson-and-zayn-...,"March 17, 2021",1iJBSr7s7jYXzM8EGcbK5b,"When the world was ending, I'd hold you in my ..."


In [21]:
def tokenize_and_clean(text, stopwords):
    
    import nltk
    import re
    
    #Tokenize
    tokens = nltk.word_tokenize(text)
    
    # Remove single-character tokens
    tokens = [token for token in tokens if len(token) > 1]

    # Remove numbers
    tokens = [token for token in tokens if not token.isnumeric()]

    # Remove punctuation
    tokens = [token for token in tokens if token.isalpha()]

    # Lowercase all tokens (default_stoptokens are lowercase too)
    tokens = [token.lower() for token in tokens]

    # Remove stopwords
    tokens = [token for token in tokens if token not in stopwords]

    # Remove custom stopwords
    #tokens = [token for token in tokens if token not in custom_stopwords]

   #Lemmatize
    wnl = WordNetLemmatizer()
    tokens = [wnl.lemmatize(token) for token in tokens]
    
    clean_text = ' '.join(token for token in tokens)

    return clean_text

In [22]:
%%time

lyrics['final_lyrics'] = lyrics['clean_lyrics'].parallel_apply(tokenize_and_clean,
                                                       stopwords = set(stopwords.words('english')))

CPU times: user 2.67 s, sys: 3.26 s, total: 5.92 s
Wall time: 4min 21s


In [23]:
lyrics.head(5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,track_name,artists,lyrics,language,lyrics_state,Song_id,url,release_date,track_id,clean_lyrics,final_lyrics
0,0,0,2,To Begin Again,Ingrid Michaelson;ZAYN,TranslationsNorsk (bokmål / riksmål)EspañolBos...,en,complete,6597928.0,https://genius.com/Ingrid-michaelson-and-zayn-...,"March 17, 2021",1iJBSr7s7jYXzM8EGcbK5b,"When the world was ending, I'd hold you in my ...",world ending hold armsand talked place never b...
1,1,1,3,Can't Help Falling In Love,Kina Grannis,Can’t Help Falling in Love Lyrics[Verse 1]\nWi...,en,complete,3890906.0,https://genius.com/Kina-grannis-cant-help-fall...,"January 13, 2017",6lfxq3CG4xtTiEg7opyCyx,Wise men sayOnly fools rush inBut I can't help...,wise men sayonly fool rush inbut ca help falli...
2,2,2,4,Hold On,Chord Overstreet,Hold On Lyrics[Verse 1]\nLoving and fighting\n...,en,complete,2973448.0,https://genius.com/Chord-overstreet-hold-on-ly...,"February 2, 2017",5vjLSffimiIP26QG5WcN2K,"Loving and fightingAccusing, denyingI can't im...",loving fightingaccusing denyingi ca imagine wo...
3,3,3,5,Days I Will Remember,Tyrone Wells,Days I will Remember LyricsThese are the days ...,en,complete,3805340.0,https://genius.com/Tyrone-wells-days-i-will-re...,"April 20, 2018",01MVOl9KtVTNfFiBU9I7dc,These are the days I will rememberThese are th...,day rememberthese face need mosteverything cha...
4,4,4,6,Say Something,A Great Big World;Christina Aguilera,TranslationsPortuguêsSay Something Lyrics[Intr...,en,complete,255826.0,https://genius.com/A-great-big-world-say-somet...,"November 4, 2013",6Vc5wAMmXdKIAM7WUoEb7N,"Say something, I'm giving up on youI'll be the...",say something giving youi one want toanywhere ...


#### Save lyrics df

In [24]:
# lyrics.to_csv('gs://12372233_nlp_bucket/data_mining_lyrics/cleaned_lyrics.csv', index=False)

#### Load lyrics df

In [25]:
# lyrics = pd.read_csv('gs://12372233_nlp_bucket/data_mining_lyrics/cleaned_lyrics.csv')

In [26]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
%%time

vectorizer_ohe = CountVectorizer(min_df=2).fit_transform(lyrics['final_lyrics'])

CPU times: user 1min 4s, sys: 667 ms, total: 1min 5s
Wall time: 1min 5s


In [28]:
%%time

cosine_similarities_ohe = cosine_similarity(vectorizer_ohe)

CPU times: user 4min 1s, sys: 6.52 s, total: 4min 7s
Wall time: 4min 7s


In [29]:
cosine_similarities_ohe[0:5]

array([[1.        , 0.10241588, 0.08035584, ..., 0.16482442, 0.07740315,
        0.01126283],
       [0.10241588, 1.        , 0.12968894, ..., 0.00825516, 0.01398757,
        0.05020437],
       [0.08035584, 0.12968894, 1.        , ..., 0.12898224, 0.03708691,
        0.00713104],
       [0.14058277, 0.02464361, 0.11201232, ..., 0.10026541, 0.01057093,
        0.00813028],
       [0.07765689, 0.04326973, 0.06555781, ..., 0.03012376, 0.03402785,
        0.01665453]])

In [30]:
# storage_client = storage.Client()
# bucket = storage_client.get_bucket('12372233_nlp_bucket')
# blob = bucket.blob('df_nlp1.csv')
# path = "gs://12372233_nlp_bucket/data_mining_lyrics/df_nlp1.csv"
# lyrics = pd.read_csv(path)

In [31]:
# !pip install tensorflow

In [32]:
# from tensorflow.python.lib.io import file_io

In [33]:
# np.save(file_io.FileIO('gs://12372233_nlp_bucket/data_mining_lyrics/cosine_similarities.npy', 'w'), cosine_similarities_ohe)

In [34]:
# np.save('gs://12372233_nlp_bucket/data_mining_lyrics/cosine_similarities.npy', cosine_similarities_ohe)    # .npy extension is added if not given

#### Load 

In [35]:
# cosine_similarities_ohe_loaded = np.load('gs://12372233_nlp_bucket/data_mining_lyrics/cosine_similarities.npy')

In [36]:
def get_recommendations_ohe(track_id, cosine_similarities=cosine_similarities_ohe):
    # Get the index of the track with the given ID
    idx = lyrics[lyrics['track_id'] == track_id].index[0]
    
    # Get the cosine similarities between the given track and all other tracks
    sim_scores = list(enumerate(cosine_similarities[idx]))
    
    # Sort the tracks by their similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the top 10 most similar tracks
    sim_scores = sim_scores[0:11]
    
    # Get the indices and titles of the top 10 most similar tracks
    track_indices = [i[0] for i in sim_scores]
    track_names = lyrics['track_name'].iloc[track_indices].values
    artist_names = lyrics['artists'].iloc[track_indices].values
#    track_id = lyrics['track_id'].iloc[track_indices].values
#     genre = lyrics['Genre_Combined'].iloc[track_indices].values
    
    recommendations_df = pd.DataFrame({'track_name': track_names, 'artist': artist_names #, 'genre': genre
                                       , 'similarity_score': [i[1] for i in sim_scores]})
    
    return recommendations_df

In [37]:
get_recommendations_ohe('1iJBSr7s7jYXzM8EGcbK5b')

Unnamed: 0,track_name,artist,similarity_score
0,To Begin Again,Ingrid Michaelson;ZAYN,1.0
1,Our Fragment,Skrux;MISSIO,0.421644
2,I Don't Know Why,Alison Krauss & Union Station,0.419574
3,Do You Know? (The Ping Pong Song),Enrique Iglesias,0.418346
4,I Know How to Party,The Dollyrots,0.411242
5,Know Your Darkness,Faderhead,0.409719
6,I Know You Love Me,Smoking Popes,0.408068
7,If You Really Love Me (How Will I Know),David Guetta;MistaJam;John Newman,0.401424
8,You Should Know,Jor'dan Armstrong,0.396137
9,Wintertime,Norah Jones,0.394774


In [38]:
get_recommendations_ohe('6lfxq3CG4xtTiEg7opyCyx')

Unnamed: 0,track_name,artist,similarity_score
0,Can't Help Falling In Love,Kina Grannis,1.0
1,Can't Help Falling in Love,Christian Leave,0.920261
2,(I Can't Help) Falling In Love With You,UB40,0.815948
3,Can't Help Falling in Love,Elvis Presley,0.65914
4,Can't Help Falling In Love,Elvis Presley,0.65914
5,Can't Help Falling In Love - Live At Lake Las ...,Andrea Bocelli,0.642553
6,I Can't Help,Shoffy;Sarcastic Sounds,0.590009
7,i can't help it,JVKE,0.586957
8,Would That Make You Love Me?,SHRK;Shiloh Dynasty,0.562496
9,Deathcab,Ditty,0.542245


In [42]:
get_recommendations_ohe('5IfCZDRXZrqZSm8AwE44PG')

Unnamed: 0,track_name,artist,similarity_score
0,Winter Wonderland,Jason Mraz,1.0
1,Winter Wonderland,Dean Martin,0.705559
2,Winter Wonderland,The Platters,0.663316
3,Winter Wonderland - Single Version,Louis Armstrong,0.654383
4,Winter Wonderland,Tony Bennett,0.63971
5,Winter Wonderland,Ella Fitzgerald,0.639265
6,Winter Wonderland,Rod Stewart;Michael Bublé,0.637411
7,Winter Wonderland,Tony Bennett;Lady Gaga,0.621628
8,Winter Wonderland,Sugarland,0.594362
9,Walking Away,Craig David,0.325065


In [40]:
get_recommendations_ohe('5vjLSffimiIP26QG5WcN2K')

Unnamed: 0,track_name,artist,similarity_score
0,Hold On,Chord Overstreet,1.0
1,Hold On - Acoustic,Chord Overstreet,0.991189
2,Hold On - Remix,Chord Overstreet;Deepend,0.861057
3,You Make Me Feel,Bonfire,0.469058
4,Is There Still Anything That Love Can Do? (Eng...,RADWIMPS,0.466439
5,Still In Love,GhostDragon;YERINMYWAY;Nate Mitchell,0.442861
6,Ferrari - Oliver Heldens Remix,James Hype;Miggy Dela Rosa;Oliver Heldens,0.442293
7,Ferrari,James Hype;Miggy Dela Rosa,0.440875
8,Ammunition,Krewella,0.43972
9,Damaged,yetep;KLAXX;GLNNA,0.438088


In [41]:
get_recommendations_ohe('6Vc5wAMmXdKIAM7WUoEb7N')

Unnamed: 0,track_name,artist,similarity_score
0,Say Something,A Great Big World;Christina Aguilera,1.0
1,Say Something,A Great Big World,1.0
2,I'm into Something Good,Herman's Hermits,0.492249
3,Here,CASTLEBEAT,0.454346
4,Yoü And I,Ben Platt,0.453767
5,Sandy Kim,Acid Ghost,0.41518
6,UMPH,Retroj,0.403774
7,Perspective,Jorin Williams,0.403774
8,Wish,Nine Inch Nails,0.397947
9,Transcend,Fit For A King,0.397891
