# Sentence to music

### Loading Libraries

In [33]:
import warnings
warnings.filterwarnings('ignore')

In [34]:
import json
import os
import random
import re
import string

import nltk
import numpy as np
import pandas as pd
import requests
import torch
import transformers
from lyricsgenius import Genius
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
# Read the Genius API token from the environment so the secret never lives in
# the notebook or its version history. NOTE(review): the token that used to be
# committed here should be treated as leaked and revoked.
ACCESS_TOKEN = os.environ.get('GENIUS_ACCESS_TOKEN')
if not ACCESS_TOKEN:
    raise RuntimeError(
        'Set the GENIUS_ACCESS_TOKEN environment variable to a Genius API token '
        'before running this notebook.'
    )

# Bearer-style authorization header built from the token; it is not used by
# the cells visible in this notebook.
HEADERS = {'Authorization': f'Bearer {ACCESS_TOKEN}'}
token = ACCESS_TOKEN  # alias kept so existing references to `token` still work
genius = Genius(token)

### Configuring API request

In [36]:
# lyricsgenius client options, applied to the shared `genius` client in one pass.
GENIUS_OPTIONS = {
    # Turn off status messages
    'verbose': False,
    # Remove section headers (e.g. [Chorus]) from lyrics when searching
    'remove_section_headers': True,
    # Include hits thought to be non-songs (e.g. track lists)
    'skip_non_songs': False,
    # Exclude songs with these words in their title
    'excluded_terms': ["(Remix)", "(Live)"],
}
for option_name, option_value in GENIUS_OPTIONS.items():
    setattr(genius, option_name, option_value)

### Loading the data

In [40]:
# Artists whose songs make up the corpus (one Lyrics_<Artist>.json export each).
artists = [
    'Drake', 'Tame Impala', 'Taylor Swift', 'Freddie Gibbs', 'SZA',
    'Swae Lee', 'Tory Lanez', 'Sia', 'Daft Punk', 'Doja Cat',
    'Bruno Mars', 'Frank Ocean', 'Post Malone', 'Nirvana', 'Bryson Tiller',
    'The Beatles', 'Zach Bryan', 'Joji', 'Akon', 'Dua Lipa',
    'Rod Wave', 'Justin Timberlake', 'Olivia Rodrigo', 'Rema', 'Arctic Monkeys',
    'Bee Gees', 'Imagine Dragons', 'Radiohead', 'Oasis', 'Queen',
    'Maroon 5', 'Eagles', 'Metallica', 'The Weeknd', 'XXXTENTACION',
    'Kendrick Lamar', 'Good Kid', 'J. Cole', 'Travis Scott', 'Kanye West',
    'Lil Uzi Vert', 'Lil Baby', 'Lil Wayne', 'Lil Nas X', 'Lil Durk',
    'Lil Yachty', 'Lil Tjay', 'Lil Tecca', 'Lil Skies', 'Lil Mosey',
    'Lil Peep', 'Lil Dicky', 'Lil Pump', 'Lil Xan', 'Lil Reese',
    'Lil Keed', 'Baby Keem', 'Shakira', 'Wiz Khalifa', 'Mitski',
    'Pardison Fontaine', 'Nas', 'Fall Out Boy', 'Ed Sheeran', 'Bonnie Tyler',
    'Sampha', 'Halsey', 'Rita Ora', 'Chance The Rapper', 'Childish Gambino',
    'Owl City', 'Aaron May', 'Summer Walker', 'Jhené Aiko',
]

# Download step, left disabled: uncomment to fetch up to 7 songs per artist
# (sorted by title) and save them as JSON files in the current directory.
#for artist in artists:
    #artist = genius.search_artist(artist, max_songs=7, sort="title")
    #artist.save_lyrics()
    #print(f'Finished saving {artist.name}\'s lyrics')

### Parsing the data

In [41]:
# The per-artist export files are looked up by artist name with spaces removed.
artists_stripped = [artist.replace(' ', '') for artist in artists]

# Import all the json files into a dataframe.
# Frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop is quadratic.
song_frames = []
for artist in artists_stripped:
    try:
        # `open` is inside the try so a missing export is reported and skipped
        # instead of aborting the whole load. The encoding is explicit because
        # the lyrics contain non-ASCII text (assumes the exports are UTF-8).
        with open(f'Lyrics_{artist}.json', encoding='utf-8') as json_file:
            data = json.load(json_file)
        song_frames.append(pd.DataFrame(data['songs']))
    except (OSError, ValueError, KeyError, TypeError) as error:
        # OSError: file missing or unreadable; ValueError: invalid JSON or bad
        # encoding; KeyError/TypeError: JSON without the expected 'songs' list.
        print(f'Could not load {artist}\'s lyrics ({error!r})')

# pd.concat raises on an empty list, so fall back to an empty frame.
# The per-file row indices are kept, as DataFrame.append did.
df = pd.concat(song_frames) if song_frames else pd.DataFrame()

In [42]:
# Write the combined song table to 'lyrics.csv'; the dataframe index is not written.
df.to_csv(path_or_buf='lyrics.csv', index=False)

In [43]:
# Marker found in the header text that precedes the lyrics ("Contributor" or
# "Contributors").
_CONTRIBUTOR_MARKER = re.compile(r'Contributors?')


def _artist_names(entries):
    """Return the ``'name'`` of each entry, or ``None`` if ``entries`` is empty or not a list."""
    if not isinstance(entries, (list, tuple)) or not entries:
        return None
    return [entry['name'] for entry in entries]


def _album_name(album):
    """Return ``album['name']``, or ``None`` when no album mapping is present."""
    return album['name'] if isinstance(album, dict) and album else None


def _strip_lyrics_header(lyrics):
    """Drop the header that precedes the actual lyrics.

    Text up to and including the first "Contributor(s)" marker is removed when
    the marker is present; then, if a newline remains, everything up to and
    including the first newline is removed. Non-string values are returned
    unchanged.
    """
    if not isinstance(lyrics, str):
        return lyrics
    marker = _CONTRIBUTOR_MARKER.search(lyrics)
    # Only cut when the marker exists: the previous `find(...) + 12` chopped 11
    # characters off lyrics without a marker (find returns -1) and skipped one
    # character too many after the singular "Contributor".
    if marker is not None:
        lyrics = lyrics[marker.end():]
    _, newline, remainder = lyrics.partition('\n')
    return remainder if newline else lyrics


# Keeping relevant features only
df = df[['artist_names', 'release_date', 'language', 'featured_artists', 'full_title', 'title', 'album', 'producer_artists', 'writer_artists', 'artist', 'lyrics']]

# Remove rows where release_date is null. `.copy()` makes the result an
# independent frame so the column assignments below do not write into a slice.
df = df[df['release_date'].notnull()].copy()

# Parse 'writer_artists', 'producer_artists', 'featured_artists' and 'album' columns.
# All three artist columns now share the same guard: empty or missing -> None.
df['writer_artists'] = df['writer_artists'].apply(_artist_names)
df['producer_artists'] = df['producer_artists'].apply(_artist_names)
df['featured_artists'] = df['featured_artists'].apply(_artist_names)
df['album'] = df['album'].apply(_album_name)

# The lyrics should start after the "Contributor(s)" marker and after the
# first newline that follows it.
df['lyrics'] = df['lyrics'].apply(_strip_lyrics_header)

In [44]:
# Inspect the cleaned lyrics of the first song (first row by position).
df['lyrics'].iloc[0]

'\nFuck bein\' on some chill shit\nWe go 0 to 100, nigga, real quick\nThey be on that rap-to-pay-the-bills shit\nAnd I don\'t feel that shit, not even a little bit\nOh, Lord, know yourself, know your worth, nigga\nMy actions been louder than my words, nigga\nHow you so high, but still so down to Earth, nigga?\nNiggas wanna do it, we can do it on they turf, nigga\nOh, Lord, I\'m the rookie and the vet\nShoutout to the bitches out here holdin\' down the set\nAll up in my phone, lookin\' at pictures from the other night\nShe gon\' be upset if she keep scrollin\' to the left, dawg\nShe gon\' see some shit that she don\'t wanna see\nShe ain\'t ready for it\nIf I ain\'t the greatest, then I\'m headed for it\nYeah, that mean I\'m way up (Way up)\nYeah, the 6 ain\'t friendly, but that\'s where I lay up\nThis shit a mothafuckin\' lay-up\nI been Steph Curry with the shot\nBeen cookin\' with the sauce, Chef Curry with the pot, boy\n360 with the wrist, boy\nAyy, who the fuck them niggas is, boy?\n

In [45]:
# Display the cleaned dataframe; the rendered output shows the first and last
# rows with the middle elided.
df

Unnamed: 0,artist_names,release_date,language,featured_artists,full_title,title,album,producer_artists,writer_artists,artist,lyrics
1,Drake,2014-06-01,en,,0 to 100 / The Catch Up by Drake,0 to 100 / The Catch Up,#FYM12,"[Vinylz, Ging, Boi-1da, 40, Nineteen85]","[Chester Hansen, Ging, Nineteen85, Drake, Boi-...",Drake,\nFuck bein' on some chill shit\nWe go 0 to 10...
2,Drake,2015-02-13,en,,10 Bands by Drake,10 Bands,If You’re Reading This It’s Too Late,"[Ging, Sevn Thomas, Boi-1da]","[Sevn Thomas, Ging, Boi-1da, Quentin Miller, D...",Drake,"10 Bands, 50 bands, 100 bands, fuck it, man\nL..."
3,Drake,2010-06-11,en,,1Xtra Freestyle by Drake,1Xtra Freestyle,Tim Westwood I Freestyles,[Tim Westwood],[Drake],Drake,Who else really trying to mess with Hollywood ...
0,Tame Impala,2010-05-27,en,,30 Minutes with Mathew Saville by Tame Impala,30 Minutes with Mathew Saville,Innerspeaker (Collector’s Edition),[Kevin Parker],[Kevin Parker],Tame Impala,This song is an instrumentalEmbed
1,Tame Impala,2011-05-27,en,,41 Mojitos (Canyons Poolside Dub) by Tame Impala,41 Mojitos (Canyons Poolside Dub),Innerspeaker (Collector’s Edition),[Kevin Parker],[Kevin Parker],Tame Impala,"(Whispering)\nAhhh-ahhh, ahhh-ahhh, ahhh-ahhh,..."
...,...,...,...,...,...,...,...,...,...,...,...
2,Jhené Aiko,2012-05-20,en,,2 Seconds by Jhené Aiko,2 Seconds,Sailing Soul(s) (Streaming Version),[K. Roosevelt],"[K. Roosevelt, Jhené Aiko]",Jhené Aiko,"Yeah, yeah\nAlright, yeah, yeah\n\nOkay, you b..."
3,Jhené Aiko,2012-08-16,en,,3:16 am by Jhené Aiko,3:16 am,Sail Out - EP,[The Fisticuffs],"[Jhené Aiko, Brian Warfield, Mac Robinson]",Jhené Aiko,"Out of place, out of space and time, wide awak..."
4,Jhené Aiko,2020-07-17,en,,Above and Beyond by Jhené Aiko,Above and Beyond,Chilombo (Deluxe),[The Fisticuffs],[Jhené Aiko],Jhené Aiko,"Like yesterday, I remember the night, saw you ..."
5,Jhené Aiko,2020-07-17,en,,Above and Beyond (Piano) by Jhené Aiko,Above and Beyond (Piano),Chilombo (Deluxe),[LEJKEYS],"[Mac Robinson, Jhené Aiko, Brian Warfield]",Jhené Aiko,"Like yesterday, I remember the night\nSaw you ..."


## Modeling

### Model 0

In [46]:
# Model 0: fit a TF-IDF vectorizer (English stop words removed) on the lyrics
# column and vectorize every song in the same step.
tfidf = TfidfVectorizer(stop_words='english')
lyrics_corpus = df['lyrics']
tfidf_matrix = tfidf.fit_transform(lyrics_corpus)

# Show the summary of the resulting sparse matrix.
tfidf_matrix

<320x8816 sparse matrix of type '<class 'numpy.float64'>'
	with 30949 stored elements in Compressed Sparse Row format>

In [53]:
# Mood sentence to match against the lyrics. It is vectorized with the
# already-fitted vectorizer (transform only, no refit).
input_sentence = 'I am feeling sad'
query_batch = [input_sentence]
tfidf_input = tfidf.transform(query_batch)

In [54]:
# Cosine similarity of the input sentence against every song; row 0 belongs to
# the single input sentence.
cosine_sim = cosine_similarity(tfidf_input, tfidf_matrix)

# Positions of the n highest-scoring songs, ordered from lowest to highest
# score, printed as "<title> by <artist names>".
n = 5
most_similar = cosine_sim[0].argsort()[-n:]
for i in most_similar:
    song = df.iloc[i]
    print(song['title'], 'by', song['artist_names'])

1 Call by Tory Lanez
Action This Day (Live at the Milton Keynes Bowl, 5th June 1982) by Queen
0C3AN *clip* by XXXTENTACION
Anywhere / Your Song / For You (Medley) (Live at the BRITs) by Rita Ora (Ft. Liam Payne)
A Loving Feeling by Mitski


In [57]:
class GetPlaylist():
    """Pick the songs whose lyrics are most similar to a sentence.

    Similarity is the cosine similarity between TF-IDF vectors (English stop
    words removed) of the sentence and of each song's lyrics.
    """

    def __init__(self, number_of_songs=5):
        """Store how many songs ``get_playlist`` should return (default 5)."""
        self.number_of_songs = number_of_songs

    def train_model(self, df):
        """Fit the TF-IDF vectorizer on ``df['lyrics']``.

        ``df`` must have 'lyrics', 'title' and 'artist_names' columns. The
        title and artist columns are snapshotted so ``get_playlist`` reports
        songs from the frame it was trained on, not from whatever a global
        ``df`` holds at call time.
        """
        self.songs = df[['title', 'artist_names']].reset_index(drop=True)
        self.tfidf = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = self.tfidf.fit_transform(df['lyrics'])

    def get_playlist(self, input_sentence):
        """Return ``(title, artist_names)`` tuples for the best-matching songs.

        At most ``number_of_songs`` tuples are returned, ordered from the
        lowest to the highest similarity among the selected songs. A
        non-positive ``number_of_songs`` yields an empty list.
        ``train_model`` must have been called first.
        """
        self.input_sentence = input_sentence
        self.tfidf_input = self.tfidf.transform([self.input_sentence])
        self.cosine_sim = cosine_similarity(self.tfidf_input, self.tfidf_matrix)
        ranked = np.argsort(self.cosine_sim[0])
        if self.number_of_songs > 0:
            self.most_similar = ranked[-self.number_of_songs:]
        else:
            # Guard: ranked[-0:] would select every song instead of none.
            self.most_similar = ranked[:0]
        self.playlist = []
        for i in self.most_similar:
            song = self.songs.iloc[i]
            self.playlist.append((song['title'], song['artist_names']))
        return self.playlist

### Model 2

In [22]:
# Load the pretrained BERT tokenizer and encoder, then run them on one sample
# sentence as a smoke test.
from transformers import BertTokenizer, BertModel

BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

# Shape of the first model output for the sample sentence.
output[0].detach().numpy().shape

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(1, 12, 768)

In [15]:
# Prepare the lyrics for the BERT model, one song per tokenizer call.
lyrics = df['lyrics'].tolist()

# Tokenize each song separately as PyTorch tensors, with truncation enabled.
tokenized_lyrics = []
for lyric in lyrics:
    encoded_lyric = tokenizer(lyric, return_tensors='pt', padding=True, truncation=True)
    tokenized_lyrics.append(encoded_lyric)

# Keep each encoding next to its song.
df['tokenized_lyrics'] = tokenized_lyrics


In [47]:
# Apply the model to the tokenized lyrics and collect, per song, the first
# element of the model output (the 'lyrics_last_hidden_state').

last_hidden_states = []

# Inference only: without no_grad every stored output keeps its autograd graph
# alive, which wastes memory across the whole corpus. The values are unchanged.
with torch.no_grad():
    for tokenized_lyric in tokenized_lyrics:
        last_hidden_states.append(model(**tokenized_lyric)[0])

In [48]:
# Convert every stored tensor to a NumPy array in place and drop its size-1
# axes. NOTE: not re-runnable as-is, since NumPy arrays have no `.detach()`.
for i, hidden_state in enumerate(last_hidden_states):
    last_hidden_states[i] = hidden_state.detach().numpy().squeeze()

    

In [50]:
# Zero-pad every array at the end of its first axis so all of them share the
# length of the longest one; the second axis is left untouched.
max_len = max(hidden_state.shape[0] for hidden_state in last_hidden_states)
for i, hidden_state in enumerate(last_hidden_states):
    missing_rows = max_len - hidden_state.shape[0]
    last_hidden_states[i] = np.pad(
        hidden_state,
        ((0, missing_rows), (0, 0)),
        'constant',
        constant_values=0,
    )

In [None]:
# Flatten each padded array into a single 1-D vector per song.
last_hidden_states = list(map(np.ravel, last_hidden_states))

In [84]:
input_sentence = 'I really really like frank ocean he is the best artist that has ever touched this planet I cant wait to see him in a concert'

# Tokenize the input sentence
tokenized_input = tokenizer(input_sentence, return_tensors='pt', padding=True, truncation=True)

# Apply the model to the tokenized input sentence to get 'input_last_hidden_state'
input_last_hidden_state = model(**tokenized_input)[0].detach().numpy().squeeze()

# Clip to max_len rows before padding: if the sentence has more tokens than
# the longest song, the pad width below would be negative and np.pad raises.
input_last_hidden_state = input_last_hidden_state[:max_len]

# Pad the first dimension to the same length as the hidden states of the lyrics
pad_rows = max_len - np.shape(input_last_hidden_state)[0]
input_last_hidden_state = np.pad(input_last_hidden_state, ((0, pad_rows), (0, 0)), 'constant', constant_values=0)

# Flatten into a 1-D vector, matching the flattened per-song vectors
input_last_hidden_state = input_last_hidden_state.reshape(-1)

In [85]:
# Cosine similarity between the flattened input vector and every flattened
# song vector; the input is wrapped in a list to form a one-row batch.
# NOTE(review): the vectors are zero-padded and flattened, so they are compared
# position by position. The recorded matches include instrumental tracks,
# which may mean short lyrics are favoured; a pooled sentence embedding is
# worth trying. TODO confirm.
query_vectors = [input_last_hidden_state]
cosine_sim = cosine_similarity(query_vectors, last_hidden_states)

# Print the n highest-scoring songs, ordered from lowest to highest score.
n=3
most_similar = cosine_sim[0].argsort()[-n:]
for i in most_similar:
    song = df.iloc[i]
    print(song['title'], 'by', song['artist_names'])

7:77AM (Instrumental) by Nekfeu
30 Minutes with Mathew Saville by Tame Impala
Aerodynamic Beats / Forget About the World by Daft Punk
