# WORDS EMBEDDING - SONGS IN ENGLISH CORPUS
## - PARAMETROS OPTIMIZADOS PARA MEJOR ENTRENAMIENTO
## - PRUEBAS CON DIFERENTES CLUSTERS (KMEANS)

In [1]:
import multiprocessing
import os
import pprint
import re

import gensim.models.word2vec as w2v
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.manifold
import sklearn.preprocessing

Although non-English artists had already been removed, the dataset still contained Lata Mangeshkar's Hindi lyrics transliterated into the Latin alphabet. Therefore, I decided to remove all songs sung by her.

In [2]:
# Load the lyrics dataset; row 0 of the CSV supplies the column names.
songs = pd.read_csv("data/songdata.csv", header=0)

# Drop every Lata Mangeshkar song (Hindi lyrics written with Latin letters)
# so that the corpus stays English-only.
songs = songs[songs["artist"] != 'Lata Mangeshkar']
songs.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


To train the word2vec model, we first need to build its vocabulary. To do that, I iterated over each song and added it to an array that can later be fed to the model.

### VOY A EXTRAER MAS DIMENSIONES (100) PARA QUE SEA MÁS PRECISO Y BAJAR EL CONTEXT_SIZE  A 5 PARA EVITAR SOBRE-ENTRENAMIENTO

### ADEMÁS VOY A USAR EL TOKENIZER PARA QUITAR LA PUNTUACIÓN Y VER MEJOR LAS COMPARACIONES

In [3]:
import nltk

# Build the tokenizer once: it does not depend on the song, so re-creating it
# on every iteration was wasted work.
# r'\w+' keeps runs of word characters, which splits into words and drops
# punctuation.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

# One lower-cased, punctuation-free token list per song; this list of lists is
# what the Word2Vec vocabulary and training steps consume.
text_corpus = [tokenizer.tokenize(song.lower()) for song in songs['text']]


# Hyper-parameters for the Word2Vec model, followed by model construction and
# vocabulary building.

# Dimensionality of the resulting word vectors.
# More dimensions are more computationally expensive to train; the author
# expects them to also be more accurate / more general.
num_features = 100
# Minimum word count threshold, passed to Word2Vec as min_count.
# With 1, words that occur only once are presumably kept as well.
min_word_count = 1

# Number of threads to run in parallel.
# More workers means faster training.
num_workers = multiprocessing.cpu_count()

# Context window length.
context_size = 5


# Down-sampling threshold for frequent words, passed to Word2Vec as `sample`.
# NOTE(review): gensim documents the useful range as (0, 1e-5); 1e-1 is far
# above that, so little or no down-sampling is likely applied - confirm
# that this is intentional.
downsampling = 1e-1

# Seed for the random number generator, to make the results reproducible
# (deterministic runs are easier to debug).
# NOTE(review): gensim states that fully deterministic runs also require a
# single worker thread; with workers=cpu_count() results may still vary.
seed = 1

songs2vec = w2v.Word2Vec(
    sg=1,  # 1 selects the skip-gram training algorithm
    seed=seed,
    workers=num_workers,
    size=num_features,  # NOTE(review): named `vector_size` from gensim 4.0 on
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

# Scan the tokenised songs to build the vocabulary before training.
songs2vec.build_vocab(text_corpus)
# Prints the number of songs (token lists) in the corpus, not the vocabulary size.
print (len(text_corpus))

57618


### AÑADO MAS EPOCHS PARA QUE ENTRENE MEJOR

In [4]:
import time
start_time = time.time()

# Train on the tokenised songs. total_examples is the corpus size the model
# recorded as corpus_count when build_vocab() scanned the same corpus.
songs2vec.train(text_corpus, total_examples=songs2vec.corpus_count, epochs=5)

# exist_ok=True creates the output directory only when it is missing, which
# replaces the separate exists() check and its check-then-create race.
os.makedirs("trained", exist_ok=True)

# Persist the trained model so later cells can reload it without retraining.
songs2vec.save(os.path.join("trained", "songs2vectors.w2v"))

print("--- %s seconds ---" % (time.time() - start_time))

--- 158.34136295318604 seconds ---


In [5]:
songs2vec = w2v.Word2Vec.load(os.path.join("trained", "songs2vectors.w2v"))

#### Let's explore our model

Find similar words

In [6]:
songs2vec.wv.most_similar("love")

[('disarm', 0.7033208608627319),
 ('hereafter', 0.6989443898200989),
 ('passionately', 0.6957560777664185),
 ('ohoo', 0.69503253698349),
 ('anointing', 0.6950174570083618),
 ('upsets', 0.6849844455718994),
 ('unconditional', 0.6818585395812988),
 ('surrendering', 0.6817238330841064),
 ('ensure', 0.6806694269180298),
 ('godspeed', 0.6806439161300659)]

In [7]:
songs2vec.wv.most_similar("fuck")

[('motherfuck', 0.7607970237731934),
 ('nigga', 0.7454285025596619),
 ('fuckin', 0.7432056069374084),
 ('yall', 0.7420084476470947),
 ('awards', 0.7091156840324402),
 ('bitch', 0.708276093006134),
 ('niggaare', 0.698380708694458),
 ('shit', 0.6960699558258057),
 ('juggla', 0.6939520835876465),
 ('sissy', 0.6926167011260986)]

In [8]:
songs2vec.wv.most_similar("song")

[('melody', 0.7953208684921265),
 ('tune', 0.7834556102752686),
 ('sing', 0.7649601697921753),
 ('songs', 0.7643988132476807),
 ('poem', 0.7428910732269287),
 ('chord', 0.7219515442848206),
 ('catchy', 0.712472677230835),
 ('singing', 0.7111738920211792),
 ('damo', 0.7054837942123413),
 ('nightingale', 0.7050484418869019)]

In [9]:
songs2vec.wv.most_similar("sweet")

[('bittersweet', 0.7261318564414978),
 ('chincherinchee', 0.7038120031356812),
 ('augusta', 0.6957032680511475),
 ('embraceable', 0.6935324668884277),
 ('ellie', 0.6808336973190308),
 ('sugar', 0.6772453784942627),
 ('cordelia', 0.6713442802429199),
 ('ohm', 0.6562048196792603),
 ('honeycomb', 0.6559844017028809),
 ('sweetest', 0.653556227684021)]

In [10]:
songs2vec.wv.most_similar("angel")

[('guardian', 0.7270585894584656),
 ('orphan', 0.6333507895469666),
 ('starlight', 0.6308083534240723),
 ('angels', 0.6297202110290527),
 ('heaven', 0.6272406578063965),
 ('idol', 0.6212413311004639),
 ('contacting', 0.6100413799285889),
 ('aeroplane', 0.6070394515991211),
 ('wings', 0.6032897233963013),
 ('skyway', 0.603134036064148)]

### TODAS LAS ANTERIORES LAS HA HECHO MUY BIEN PORQUE ESTÁN RELACIONADAS CON CANCIONES! LAS SIGUIENTES LE VAN A COSTAR UN POCO MÁS

In [11]:
songs2vec.wv.most_similar("espresso")

[('passwords', 0.9013238549232483),
 ('sublurr', 0.899702787399292),
 ('hanes', 0.8993903398513794),
 ('shishkabob', 0.895525336265564),
 ('frisby', 0.8951556086540222),
 ('asee', 0.8944345116615295),
 ('toyin', 0.8942302465438843),
 ('levity', 0.8934356570243835),
 ('seewhat', 0.8930048942565918),
 ('exert', 0.8926528692245483)]

In [12]:
songs2vec.wv.most_similar("computer")

[('monopoly', 0.6785389184951782),
 ('outlet', 0.676584005355835),
 ('scientists', 0.6485586166381836),
 ('assassination', 0.648101270198822),
 ('technology', 0.632548451423645),
 ('101', 0.6320898532867432),
 ('techno', 0.6310076713562012),
 ('britannia', 0.6253201961517334),
 ('bracket', 0.6245629787445068),
 ('age', 0.6225196123123169)]

In [13]:
songs2vec.wv.most_similar("data")

[('chiggie', 0.6926277875900269),
 ('chronics', 0.6763949394226074),
 ('embellishments', 0.675311267375946),
 ('processin', 0.6720625162124634),
 ('checkbooks', 0.6718614101409912),
 ('vape', 0.6707996726036072),
 ('199', 0.6700356602668762),
 ('scoopin', 0.6635674834251404),
 ('testifiers', 0.6619908213615417),
 ('processed', 0.6619383096694946)]

### LO MISMO VA A PASAR CON LAS WORDS OUT OF CONTEXT

Words out of context

In [14]:
songs2vec.wv.doesnt_match("happiness love joy hate".split())

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'hate'

In [15]:
songs2vec.wv.doesnt_match("breakfast milk lunch dinner".split())

'milk'

In [16]:
# Analogy check: woman + king - man is expected to rank "queen" first.
# Query through .wv like every other cell; calling most_similar() directly on
# the model object is deprecated in gensim 3.x (it emits a warning) and is no
# longer available in gensim 4.
songs2vec.wv.most_similar(positive=['woman', 'king'], negative=['man'])

  """Entry point for launching an IPython kernel.


[('queen', 0.64943528175354),
 ('kings', 0.629342257976532),
 ('newborn', 0.6213217973709106),
 ('homecoming', 0.6022508144378662),
 ('myrrh', 0.5890592336654663),
 ('shepards', 0.5875892639160156),
 ('frankincense', 0.5826840996742249),
 ('alleluia', 0.580947756767273),
 ('princess', 0.580915629863739),
 ('redeeming', 0.578170895576477)]

Semantic distance between words

In [17]:
def nearest_similarity_cosmul(start1, end1, end2):
    """Print the analogy "start1 is to end1 as X is to end2".

    X is the top-ranked result of the model's ``most_similar_cosmul`` query
    with ``end2`` and ``start1`` as positive words and ``end1`` as the
    negative word.

    Args:
        start1: First word of the known pair (e.g. "paris").
        end1: Second word of the known pair (e.g. "france").
        end2: Word whose counterpart is looked up (e.g. "alabama").

    Returns:
        None. The analogy sentence is printed (in Spanish).
    """
    ranked = songs2vec.wv.most_similar_cosmul(
        negative=[end1],
        positive=[end2, start1],
    )
    # The first entry is the best match; its first field is the word itself.
    best_match = ranked[0][0]
    print("{0} es a {1}, lo que {2} es a {3}".format(start1, end1, best_match, end2))

In [18]:
nearest_similarity_cosmul("paris", "france", "alabama")

paris es a france, lo que tennessee es a alabama


In [19]:
nearest_similarity_cosmul("paris", "france", "london")

paris es a france, lo que stumblin es a london


### Con las diferentes palabras que hemos probado podemos ver que, para palabras similares a las que aparecen en una canción, el modelo lo hace muy bien, pero con palabras extrañas musicalmente hablando (como países y capitales) le cuesta BASTANTE

## CALCULO DE NORMALIZED SUM VECTOR

#### PRIMERO CREAMOS UNA COLUMNA EN EL DATAFRAME QUE CONTENGA LAS LETRAS LIMPIAS SIN PUNTUACION

In [20]:
# Re-join each song's tokens with single spaces so the cleaned,
# punctuation-free lyrics can be stored as a DataFrame column.
lyrics_clean = [' '.join(tokens) for tokens in text_corpus]

songs['lyrics_clean'] = lyrics_clean

With the word vector embeddings in place, it is now time to calculate the normalised vector sum of each song. This process can take some time since it has to be done for each of 57,000 songs.

In [None]:
def songVector(row):
    """Return the L2-normalised sum of the word vectors of one song.

    Args:
        row: Cleaned lyrics of a single song as one whitespace-separated
            string.

    Returns:
        A 2-D array of shape (1, vector_size). Lyrics with no words give an
        all-zero row instead of failing on ``int.reshape``.

    Raises:
        KeyError: presumably raised by the lookup when a word is not in the
            model vocabulary - confirm against the gensim version in use.
    """
    # Look words up through .wv, consistent with the rest of the notebook;
    # indexing the model object directly is deprecated in gensim.
    word_vectors = songs2vec.wv

    # Start from a zero vector rather than the integer 0 so that empty lyrics
    # still produce an array with the expected shape.
    vector_sum = np.zeros(word_vectors.vector_size, dtype=np.float32)
    for word in row.lower().split():
        vector_sum = vector_sum + word_vectors[word]

    # normalize() expects a 2-D (n_samples, n_features) input.
    vector_sum = vector_sum.reshape(1, -1)
    normalised_vector_sum = sklearn.preprocessing.normalize(vector_sum)
    return normalised_vector_sum


import time
start_time = time.time()

# Compute one normalised sum vector per song from its cleaned lyrics.
songs['song_vector'] = songs['lyrics_clean'].apply(songVector)

# Report the elapsed time in the same format as the other timed cells; the
# timer used to be started here but was never read.
print("--- %s seconds ---" % (time.time() - start_time))




## CLUSTERING

**t-sne and random song selection** 

Each song vector has 100 dimensions (the `num_features` used to train the model). Applying t-SNE is memory intensive, so it is easier on the computer to use a random sample of the roughly 57,000 songs.

In [22]:
from sklearn.model_selection import train_test_split

# t-SNE is memory intensive, so only a random 10% of the songs (`train`) is
# kept for the visualisation; the remaining 90% (`test`) is not used.
# random_state pins the sample so the plots and clusters derived from it are
# reproducible between runs, in line with the seeded model, t-SNE and KMeans.
train, test = train_test_split(songs, test_size=0.9, random_state=42)

# Per-song vectors of the sample, in the same order as the rows of `train`.
song_vectors = list(train['song_vector'])

train.head(10)

Unnamed: 0,artist,song,link,text,lyrics_clean,song_vector
51938,Squeeze,Bonkers,/s/squeeze/bonkers_20129121.html,I know that I'm bonkers \nStupidity conquers ...,i know that i m bonkers stupidity conquers for...,"[[0.013346004, -0.078903735, 0.08808995, 0.059..."
3014,Chuck Berry,Bound To Lose,/c/chuck+berry/bound+to+lose_20514597.html,Looks like I'll go on through my life bound in...,looks like i ll go on through my life bound in...,"[[0.03512863, -0.09137119, 0.113696, 0.0800223..."
51718,Slayer,Seven Faces,/s/slayer/seven+faces_10217152.html,I saw them all around today \nThey don't stop...,i saw them all around today they don t stop th...,"[[0.03794212, -0.10772548, 0.08929828, 0.06925..."
30743,Ed Sheeran,Bloodstream,/e/ed+sheeran/bloodstream_21084969.html,[Verse 1] \nI've been spinning now for time ...,verse 1 i ve been spinning now for time couple...,"[[0.01696406, -0.08807455, 0.08102738, 0.07745..."
43618,Michael Bolton,The Best Of Love,/m/michael+bolton/the+best+of+love_20092316.html,I've got this somethin' to tell you \nThere a...,i ve got this somethin to tell you there ain t...,"[[-0.0065999404, -0.07798735, 0.099585906, 0.0..."
7860,Hillsong United,Kiss Of Heaven,/h/hillsong+united/kiss+of+heaven_20626583.html,I'm walking a new walk \nI'll never be the sa...,i m walking a new walk i ll never be the same ...,"[[0.010762967, -0.07980471, 0.10405661, 0.0553..."
47584,Paul McCartney,Daytime Nightime Suffering,/p/paul+mccartney/daytime+nightime+suffering_2...,What does she get for all the love she gave yo...,what does she get for all the love she gave yo...,"[[0.016002879, -0.07511213, 0.09460866, 0.0388..."
19806,Twenty One Pilots,The Run And Go,/t/twenty+one+pilots/the+run+and+go_21052895.html,"I can't take them on my own, my own \nOh, I'm...",i can t take them on my own my own oh i m not ...,"[[0.03876056, -0.11771952, 0.09810154, 0.10224..."
54544,Uriah Heep,Illusion,/u/uriah+heep/illusion_20142391.html,In a forest known as heartbreak \nIn a cleari...,in a forest known as heartbreak in a clearing ...,"[[0.013075363, -0.07886132, 0.06867011, 0.0591..."
22331,Z-Ro,Hey Lil Mama,/z/z+ro/hey+lil+mama_20223142.html,[Hook] \nHey there little mama \nWhy don't y...,hook hey there little mama why don t you come ...,"[[0.03921167, -0.08669282, 0.07312894, 0.08442..."


I had a fairly measly 4gb machine and wasn't able to generate a more accurate model. However, one can play around with the number of iterations, learning rate and other factors to fit the model better. If you have too many dimensions (~300+), it might make sense to use PCA first and then t-sne.

In [23]:
# Stack the per-song (1, n_features) vectors into one (n_songs, n_features)
# matrix. vstack takes both sizes from the data instead of the hard-coded
# (5761, 100), which failed whenever the sample size or the vector
# dimensionality changed.
X = np.vstack(song_vectors)

start_time = time.time()
# NOTE(review): n_iter=250 is very low (scikit-learn's default is 1000);
# consider raising it if the embedding looks unconverged and the machine's
# memory/time budget allows.
tsne = sklearn.manifold.TSNE(n_components=2, n_iter=250, random_state=0, verbose=2)

# Despite the variable name, the rows are songs (one 2-D point per song).
all_word_vectors_matrix_2d = tsne.fit_transform(X)

print("--- %s seconds ---" % (time.time() - start_time))

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 5761 samples in 0.047s...
[t-SNE] Computed neighbors for 5761 samples in 9.610s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5761
[t-SNE] Computed conditional probabilities for sample 2000 / 5761
[t-SNE] Computed conditional probabilities for sample 3000 / 5761
[t-SNE] Computed conditional probabilities for sample 4000 / 5761
[t-SNE] Computed conditional probabilities for sample 5000 / 5761
[t-SNE] Computed conditional probabilities for sample 5761 / 5761
[t-SNE] Mean sigma: 0.055802
[t-SNE] Computed conditional probabilities in 0.333s
[t-SNE] Iteration 50: error = 87.8986588, gradient norm = 0.0669048 (50 iterations in 10.104s)
[t-SNE] Iteration 100: error = 87.8547974, gradient norm = 0.0857636 (50 iterations in 7.647s)
[t-SNE] Iteration 150: error = 87.5769730, gradient norm = 0.0904535 (50 iterations in 6.177s)
[t-SNE] Iteration 200: error = 88.0242767, gradient norm = 0.0597690 (50 iterations in 5.852s)
[

In [24]:
# Wrap the 2-D t-SNE output in a DataFrame with named coordinate columns.
# (The former mid-cell df.head(10) / train.head() expressions were removed:
# only a cell's last expression is displayed, so they had no effect.)
df = pd.DataFrame(all_word_vectors_matrix_2d, columns=['X', 'Y'])

# Give both frames a fresh 0..n-1 index so they can be joined row by row:
# `train` still carries the row labels of the full dataset.
df.reset_index(drop=True, inplace=True)
train.reset_index(drop=True, inplace=True)

Joining two dataframes to obtain each song's corresponding X,Y co-ordinate.

In [25]:
# Column-wise join (axis=1): rows are matched by index label, so this relies
# on `train` and `df` both having been re-indexed to 0..n-1 beforehand.
two_dimensional_songs = pd.concat([train, df], axis=1)

# Preview the combined frame (song metadata plus X/Y coordinates).
two_dimensional_songs.head()


Unnamed: 0,artist,song,link,text,lyrics_clean,song_vector,X,Y
0,Squeeze,Bonkers,/s/squeeze/bonkers_20129121.html,I know that I'm bonkers \nStupidity conquers ...,i know that i m bonkers stupidity conquers for...,"[[0.013346004, -0.078903735, 0.08808995, 0.059...",0.017184,0.042948
1,Chuck Berry,Bound To Lose,/c/chuck+berry/bound+to+lose_20514597.html,Looks like I'll go on through my life bound in...,looks like i ll go on through my life bound in...,"[[0.03512863, -0.09137119, 0.113696, 0.0800223...",-0.008499,-0.040928
2,Slayer,Seven Faces,/s/slayer/seven+faces_10217152.html,I saw them all around today \nThey don't stop...,i saw them all around today they don t stop th...,"[[0.03794212, -0.10772548, 0.08929828, 0.06925...",-0.000178,0.009316
3,Ed Sheeran,Bloodstream,/e/ed+sheeran/bloodstream_21084969.html,[Verse 1] \nI've been spinning now for time ...,verse 1 i ve been spinning now for time couple...,"[[0.01696406, -0.08807455, 0.08102738, 0.07745...",0.009535,0.034423
4,Michael Bolton,The Best Of Love,/m/michael+bolton/the+best+of+love_20092316.html,I've got this somethin' to tell you \nThere a...,i ve got this somethin to tell you there ain t...,"[[-0.0065999404, -0.07798735, 0.099585906, 0.0...",0.000493,-0.01064


**Plotting the results**

Using plotly, I plotted the results so that it becomes easier to explore similar songs based on their colors and clusters.

In [26]:
import plotly
plotly.offline.init_notebook_mode(connected=True) 

In [27]:
import plotly.express as px
fig=px.scatter(two_dimensional_songs, x='X', y='Y', color='artist')
fig.show()

In [28]:
import plotly.express as px
fig = px.scatter_3d(two_dimensional_songs, x='X', y='Y', z='song',
                color='artist')
fig.show()

# CLUSTERING CON KMEANS

In [29]:
from sklearn import cluster

# Stack the per-song (1, n_features) vectors into one (n_songs, n_features)
# matrix; vstack takes the sizes from the data instead of the hard-coded
# (5761, 100), so the cell keeps working if the sample or the vector
# dimensionality changes.
X = np.vstack(song_vectors)

# Cluster the full-dimensional song vectors (not the 2-D t-SNE projection)
# into 3 groups; random_state keeps the assignment reproducible.
kmeans = cluster.KMeans(n_clusters=3, random_state=42).fit(X)

In [30]:
# Scatter plot of the songs at their 2-D coordinates, coloured by KMeans label.
import plotly.express as px
# kmeans.labels_ is passed positionally, so label i is assumed to belong to
# row i of two_dimensional_songs (both come from the same sample, in order).
# The labels are numeric; artist and song title are shown on hover.
fig = px.scatter(two_dimensional_songs, x="X", y="Y",
                 hover_data=['artist', 'song'],
                color=kmeans.labels_)
fig.show()

In [31]:
import plotly.express as px
fig = px.scatter_3d(two_dimensional_songs, x='X', y='Y', z='artist',
                color=kmeans.labels_)
fig.show()

### PINTANDO LOS DISTINTOS CLUSTERS DE KMEANS PODEMOS ENCONTRAR QUE CANCIONES SIMILARES EN TEMATICA SE AGRUPAN JUNTAS
- Por ejemplo, las canciones de amor suelen estar en el cluster amarillo (Adam Sandler - Best Friend, Whitney Houston - For the Love of You)
- En el azul hay música más "independiente" que quizá, creando más clusters, podríamos categorizar mejor (probamos a continuación)
- En el rosa parece haber música más energética, como rap y rock

# KMEANS CON 15 CLUSTERS

In [32]:
kmeans = cluster.KMeans(n_clusters=15, 
                        random_state=42).fit(X)

In [33]:
import plotly.express as px
fig = px.scatter(two_dimensional_songs, x="X", y="Y",
                 hover_data=['artist', 'song'],
                color=kmeans.labels_)
fig.show()

### AQUI PODEMOS VER QUE LA AGRUPACION TIENE SENTIDO CON RESPECTO A LAS CARACTERISTICAS X E Y. EL COLOR PASA DE CLARO A OSCURO FORMANDO DISTINTOS CLUSTERS

## ANALIZANDO LAS LETRAS DE LAS CANCIONES DE CADA CLUSTER SE PODRIA VER QUE MUCHAS PALABRAS SE REPITEN PARA LAS CANCIONES DE UN MISMO CLUSTER