In [1]:
import nltk

import matplotlib.pyplot as plt
import numpy as np

from collections import Counter
import pathlib
import re
import random
from gensim.models import Word2Vec

%matplotlib inline

## About The Experiment
Just an item2vec for generating embeddings for each song.

### Why Random Walk?
Since the actual data set is quite small, we generate our own training set by walking randomly through the song-transition graph. The following steps loosely translate to the algorithm:

**Load and Create Graph**
- load the log
- encode each unique song with a unique id
- generate a trie data structure with each node as a single song from data 
- we have a dictionary-based graph structure

**Walk Randomly**
- select a random node from the graph
- get next nodes (states)
- sample one of them according to its transition probability
- repeat this for a fixed number of times
- in the end, we have a sequence of nodes

Note: The number of steps **n** is itself chosen at random for each walk.

**Generate Data**
For a certain number of generations (say ngen=15000), we keep applying the random walk. In the end we get a list of lists, with each row representing a sequence of nodes. Eg:
```bash
['142', '455', '109', '443'],
['912', '538', '545', '536', '537'],
['730', '732', '734', '736', '738'],
['397', '248', '864', '674', '396', '395'],
['703', '697', '115', '709', '89', '511']
 ```
 
**Apply Word2Vec**
Finally, generate word embeddings using word2vec using the generated data.

In [2]:
## the lines of code here are experimental and aren't of proper standards! 
## :/

In [3]:
# Location of the playx play log; "~" is expanded to the current user's home directory.
path = pathlib.Path("~/.playx/logs/log.cat").expanduser()

### Pre-process

In [4]:
def remove_punct(string):
    """
        Drop apostrophes entirely, then replace every run of the listed
        punctuation characters with a single space.
    """
    apostrophes = re.compile(r"[']+")
    punctuation = re.compile(r"[-:_!,/\[\].()#?;&]+")
    without_apostrophes = apostrophes.sub('', string)
    return punctuation.sub(' ', without_apostrophes)

def remove_multiple_spaces(string):
    """
        Replace every run of whitespace characters with one space.
        Leading and trailing whitespace is collapsed, not removed.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', string)

### Data Loader
This piece of code is taken from a module in `playx`. :/

In [5]:
def get_timeseries_data(logpath):
    """
        Extract (timestamp, song) pairs from a playx log file.

        Only lines that contain 'playing' (after lower-casing) are
        considered. Such a line must carry at least three bracketed
        fields: the second is used as the timestamp and the third as the
        song title. The title is stripped of brackets and punctuation and
        has its whitespace collapsed; entries whose title ends up empty
        are dropped.

        Returns a list of (timestamp, song) tuples in file order.
    """
    data = []
    with open(logpath) as f:
        for line in f:
            line = line.strip().lower()
            if 'playing' not in line:
                continue
            matches = re.findall(r"\[.*?\]", line)
            # A line with fewer than three bracketed fields cannot be
            # parsed; skip it rather than letting an IndexError escape.
            if len(matches) < 3:
                continue
            timestamp, song = matches[1], matches[2]
            ts = re.sub(r"[\[\]]+", '', timestamp).strip()
            song = re.sub(r"[\[\]]+", '', song)
            song = remove_punct(song)
            song = remove_multiple_spaces(song).strip()
            if song:
                data.append((ts, song))
    return data

In [6]:
# Parse the log into a list of (timestamp, song) tuples, in log order.
data = get_timeseries_data(path)

In [7]:
data[:5]

[('2018-12-31 21:36:09,683', 'nirvana smells like teen spirit'),
 ('2019-01-01 08:05:06,277',
  'nirvana the man who sold the world mtv unplugged'),
 ('2019-01-01 08:07:56,794', 'leonard cohen famous blue raincoat audio'),
 ('2019-01-01 08:08:26,593', 'gary jules mad world song + lyrics'),
 ('2019-01-01 08:08:26,712',
  'the sound of silence original version from 1964')]

In [8]:
# Split the (timestamp, song) pairs into two parallel tuples.
ts, songs = zip(*data)

In [9]:
len(songs)

2357

### Frequency Test

In [10]:
# Play count per song title.
counter = Counter(songs)

In [11]:
counter.most_common(10)

[('bimbaakash khai lyrical video', 75),
 ('timecop1983 girl feat seawaves official video', 72),
 ('bimbaakash najeek lyrical video', 53),
 ('nirvana the man who sold the world mtv unplugged', 36),
 ('guthrie govan erotic cakes full album', 28),
 ('leonard cohen famous blue raincoat audio', 27),
 ('queen bohemian rhapsody official lyric video', 25),
 ('bimbaakash timi ra ma lyrical video', 22),
 ('the sound of silence original version from 1964', 21),
 ('jim croce time in a bottle 1973', 21)]

In [12]:
# De-duplicate the titles. Set iteration order is arbitrary, so the order of
# this list may differ between kernel sessions.
songs_unique = list(set(songs))

In [13]:
len(songs_unique)

916

In [14]:
songs_unique[:5]

['ac dc happy new year have a drink on me',
 'the pineapple thief the one you left to die mp3',
 'aerosmith rag doll',
 'fight song rachel platten | lyrics',
 'rishloo dark charade']

### Create Song-ID map

In [15]:
def map_song_to_id(unique):
    """
        Give every song a string ID equal to its position in the
        sorted list of songs.
    """
    return {song: str(position) for position, song in enumerate(sorted(unique))}

In [16]:
# Title -> string ID lookup for every distinct song.
song_to_idx = map_song_to_id(songs_unique)
# Peek at the first few (song, id) entries.
list(song_to_idx.items())[:5]

[('"the cave" mumford sons official lyrics', '0'),
 ('01 dirty deeds done dirt cheap', '1'),
 ('10cc art for arts sake', '2'),
 ('22 the wall pink floyd run like hell', '3'),
 ('3 doors down kryptonite lyrics', '4')]

In [17]:
def map_id_to_song(song_to_idx):
    """
        Invert the song_to_idx map so that IDs look up songs.
    """
    inverse = {}
    for song in song_to_idx:
        inverse[song_to_idx[song]] = song
    return inverse

In [18]:
# Reverse lookup: string ID -> song title.
idx_to_song = map_id_to_song(song_to_idx)

In [19]:
list(idx_to_song.items())[:5]

[('0', '"the cave" mumford sons official lyrics'),
 ('1', '01 dirty deeds done dirt cheap'),
 ('2', '10cc art for arts sake'),
 ('3', '22 the wall pink floyd run like hell'),
 ('4', '3 doors down kryptonite lyrics')]

### Encode
Encode the songs in the log as IDs. This is done for convenience.

In [20]:
def encode(song_to_idx, songs):
    """
        Translate a sequence of songs into their IDs, keeping the order.
        A song missing from song_to_idx raises KeyError.
    """
    return [song_to_idx[title] for title in songs]

In [21]:
# The full play history as a list of song IDs, in log order.
sequence_encoded = encode(song_to_idx, songs)

In [22]:
len(sequence_encoded), len(songs)

(2357, 2357)

In [23]:
sequence_encoded[:5], [idx_to_song[idx] for idx in sequence_encoded[:5] ]

(['445', '446', '398', '249', '865'],
 ['nirvana smells like teen spirit',
  'nirvana the man who sold the world mtv unplugged',
  'leonard cohen famous blue raincoat audio',
  'gary jules mad world song + lyrics',
  'the sound of silence original version from 1964'])

### Build Trie

In [24]:
# Every consecutive (current, next) pair of the encoded play history.
pairs = list(zip(sequence_encoded, sequence_encoded[1:]))

In [25]:
pairs[:5]

[('445', '446'),
 ('446', '398'),
 ('398', '249'),
 ('249', '865'),
 ('865', '675')]

In [26]:
def build_trie(pairs):
    """
        Count transitions: trie[a][b] is the number of times the pair
        (a, b) occurs in pairs.
    """
    trie = {}
    for current, following in pairs:
        successors = trie.setdefault(current, {})
        successors[following] = successors.get(following, 0) + 1
    return trie

In [27]:
# Transition counts: trie[a][b] = number of times b was played right after a.
trie = build_trie(pairs)

In [28]:
def build_probabilities(trie):
    """
        Turn the transition counts of every node into probabilities by
        dividing each count by the node's total.

        The nested dicts are modified in place; the same trie object is
        returned.
    """
    for successors in trie.values():
        norm = sum(successors.values())
        for nxt in successors:
            successors[nxt] = successors[nxt] / norm
    return trie

In [29]:
# Normalise counts to probabilities. build_probabilities mutates its argument
# and returns the same dict, so `trie` now holds probabilities instead of counts.
trie = build_probabilities(trie)

In [30]:
list(trie.items())[:5]

[('445', {'446': 0.5, '350': 0.25, '266': 0.25}),
 ('446',
  {'398': 0.3888888888888889,
   '446': 0.16666666666666666,
   '340': 0.05555555555555555,
   '402': 0.027777777777777776,
   '868': 0.1388888888888889,
   '674': 0.1111111111111111,
   '549': 0.027777777777777776,
   '300': 0.027777777777777776,
   '282': 0.027777777777777776,
   '9': 0.027777777777777776}),
 ('398',
  {'249': 0.6296296296296297,
   '398': 0.1111111111111111,
   '117': 0.037037037037037035,
   '868': 0.037037037037037035,
   '561': 0.037037037037037035,
   '250': 0.037037037037037035,
   '394': 0.037037037037037035,
   '129': 0.037037037037037035,
   '408': 0.037037037037037035}),
 ('249', {'865': 0.8, '249': 0.15, '398': 0.05}),
 ('865',
  {'675': 0.3333333333333333,
   '865': 0.14285714285714285,
   '507': 0.047619047619047616,
   '117': 0.047619047619047616,
   '868': 0.047619047619047616,
   '363': 0.3333333333333333,
   '119': 0.047619047619047616})]

### Walk Randomly :/

In [31]:
import random

In [32]:
# NOTE(review): nseq is not referenced by any cell visible in this notebook;
# confirm it is unused before removing it.
nseq = 10

In [33]:
def walk_randomly(trie, sequence_encoded):
    """
        Be fully drunk and just walk within the graph.

        The start node is drawn uniformly from sequence_encoded, so nodes
        that occur more often in that sequence are more likely starts.
        The walk then takes between 2 and 6 steps; each step samples the
        next node according to the transition probabilities stored in
        trie[node]. It stops early at a node with no outgoing transitions.

        Returns the list of visited nodes, start node first. Sampled
        nodes are converted to plain str.
    """
    node = random.choice(sequence_encoded)
    res = [node]
    n = random.choice(range(2, 7))
    for _ in range(n):
        next_nodes = trie.get(node, {})
        if not next_nodes:
            break
        # Candidates stay ordered by descending probability so a given
        # numpy RNG state still maps to the same pick.
        next_nodes = sorted(next_nodes.items(), key=lambda x: x[1], reverse=True)
        nodes, probs = zip(*next_nodes)
        # np.random.choice returns a numpy scalar (np.str_); convert it so
        # the walk does not mix str and numpy types.
        node = str(np.random.choice(nodes, p=probs))
        res.append(node)
    return res

In [34]:
walk_randomly(trie, sequence_encoded)

['205', '209', '207', '211']

In [35]:
# Number of random walks to generate as the Word2Vec training corpus.
N = 20000
data_walked = [walk_randomly(trie, sequence_encoded) for _ in range(N)]

In [36]:
data_walked[:5]

[['763', '413', '729', '731'],
 ['399', '394', '398'],
 ['342', '349', '338', '353', '354', '348', '445'],
 ['595', '610', '602', '592', '607', '606'],
 ['763', '205', '209', '207', '211', '205']]

## Generate Embeddings

In [37]:
# Train skip-gram (sg=1) embeddings on the generated walks; min_count=1 keeps
# songs that appear only once.
# NOTE(review): `size` was renamed to `vector_size` in gensim 4 — confirm the
# installed gensim version before re-running.
model = Word2Vec(data_walked, min_count=1, size=100, window=3, sg=1)

In [38]:
print(model)

Word2Vec(vocab=916, size=100, alpha=0.025)


In [39]:
# Sanity check: the model vocabulary should have one entry per distinct song.
# NOTE(review): `wv.vocab` is the gensim < 4 API; gensim 4 exposes
# `wv.key_to_index` instead — confirm the installed version.
items = list(model.wv.vocab)
len(items), len(songs_unique)

(916, 916)

## Test Similarity

In [43]:
# generate random song
test_song = random.choice(counter.most_common(n=300))[0]
test_id = song_to_idx[test_song]
# test_id = random.randint(0, len(items))
song = idx_to_song[test_id]
test_id, song, counter[song]

('567', 'quot white rabbit quot jefferson airplane lyrics mp3', 2)

In [44]:
# The 10 songs whose embeddings are most similar to the test song's,
# as (id, similarity) pairs.
similar = model.wv.most_similar(positive=[test_id], topn=10)

In [45]:
# Print the title of each neighbour; the similarity score (val) is not shown.
for idx, val in similar:
    print(idx_to_song[idx])

quot white rabbit quot jefferson airplane lyrics
karen dalton something on your mind
gary julesmad world song + lyrics mp3
karen dalton it hurts me too sub
hope sandoval amp the warm inventions on the low
leonard cohen famous blue raincoat audio mp3
hope sandoval and the warm inventions trouble official music video
gary julesmad world song + lyrics
leonard cohen a thousand kisses deep lyrics hd
hope sandoval amp the warm inventions let me get there ft kurt vile
