In [28]:
import nltk

import matplotlib.pyplot as plt
import numpy as np

from collections import Counter
import pathlib
import re
import random
from gensim.models import Word2Vec

%matplotlib inline

## About The Experiment
The idea is to make use of sequence of played songs for embeddings. Techinically, it's same (or similar) to Word Embeddings. But here, we treat each song as a single word. Hence, it's more profound to say it is **Item2Vec** instead of **Word2Vec**.

When training is done, we will have songs that are played together to occupy similar point in the n-dimensional space. This technique is often used in recommendation system. So, here the process is merely an experiment for that.

On a serious note, this process requires a large amount of data. For now, the data used it from log file at `~/.playx/logs/log.cat`. If data is not sufficient, the embeddings are not good enough for actual recommendations.

In [29]:
## the lines of code here are experimental and aren't of proper standards! 
## :/

In [30]:
path = pathlib.Path("~/.playx/logs/log.cat").expanduser()

### Pre-process

In [31]:
def remove_punct(string):
    string = re.sub(r"[']+", '', string)
    return re.sub(r"[-:_!,/\[\].()#?;&]+", ' ', string)

def remove_multiple_spaces(string):
    return re.sub(r'\s+', ' ', string)

### Data Loader
This piece of code is taken from a module in `playx`. :/

In [32]:
def get_timeseries_data(logpath):
    data = []
    with open(logpath) as f:
        for line in f:
            line = line.strip().lower()
            if 'playing' not in line:
                continue
            matches = re.findall(r"\[.*?\]", line)
            module, timestamp = matches[0], matches[1]
            try:
                song = matches[2]
                ts = re.sub(r"[\[\]]+", '', timestamp).strip()
                song = re.sub(r"[\[\]]+", '', song)
                song = remove_punct(song)
                song = remove_multiple_spaces(song).strip()
                if song:
                    data.append((ts, song))
            except IndexError:
                continue
    return data

In [33]:
data = get_timeseries_data(path)

In [34]:
data[:5]

[('2018-12-31 21:36:09,683', 'nirvana smells like teen spirit'),
 ('2019-01-01 08:05:06,277',
  'nirvana the man who sold the world mtv unplugged'),
 ('2019-01-01 08:07:56,794', 'leonard cohen famous blue raincoat audio'),
 ('2019-01-01 08:08:26,593', 'gary jules mad world song + lyrics'),
 ('2019-01-01 08:08:26,712',
  'the sound of silence original version from 1964')]

In [35]:
ts, songs = zip(*data)

In [36]:
len(songs)

2212

### Frequeny Test

In [37]:
counter = Counter(songs)

In [38]:
counter.most_common(10)

[('bimbaakash khai lyrical video', 75),
 ('timecop1983 girl feat seawaves official video', 72),
 ('bimbaakash najeek lyrical video', 53),
 ('nirvana the man who sold the world mtv unplugged', 36),
 ('guthrie govan erotic cakes full album', 28),
 ('leonard cohen famous blue raincoat audio', 27),
 ('queen bohemian rhapsody official lyric video', 25),
 ('bimbaakash timi ra ma lyrical video', 22),
 ('the sound of silence original version from 1964', 21),
 ('jim croce time in a bottle 1973', 21)]

In [39]:
songs_unique = list(set(songs))

In [40]:
len(songs_unique)

830

In [41]:
songs_unique[:5]

['pure narcotic porcupine tree',
 '6 the wall pink floyd mother',
 'the paper kites holes',
 '10cc art for arts sake',
 'guthrie govan erotic cakes full album']

### Create Song-ID map

In [42]:
def generate_song_to_id(unique):
    res = {}
    for i, song in enumerate(unique):
        res[song] = i
    return res

In [43]:
song_to_id = generate_song_to_id(songs_unique)
list(song_to_id.items())[:5]

[('pure narcotic porcupine tree', 0),
 ('6 the wall pink floyd mother', 1),
 ('the paper kites holes', 2),
 ('10cc art for arts sake', 3),
 ('guthrie govan erotic cakes full album', 4)]

In [44]:
def generate_id_sequence(song_to_id, songs):
    res = []
    for song in songs:
        res.append(song_to_id[song])
    return res

In [45]:
song_ids = generate_id_sequence(song_to_id, songs)

In [46]:
def generate_id_to_song(song_to_id):
    return {
        song_to_id[id]:id
        for id in song_to_id
    }

In [47]:
id_to_song = generate_id_to_song(song_to_id)

In [48]:
list(id_to_song.items())[:5]

[(0, 'pure narcotic porcupine tree'),
 (1, '6 the wall pink floyd mother'),
 (2, 'the paper kites holes'),
 (3, '10cc art for arts sake'),
 (4, 'guthrie govan erotic cakes full album')]

### Convert Song Names to IDs

In [49]:
song_ids_sequence = [ str(sid) for sid in song_ids ]

In [50]:
song_ids_sequence[:5]

['711', '541', '224', '103', '36']

### Train Word2Vec Model

In [53]:
model = Word2Vec([song_ids_sequence], min_count=1, size=75, window=3, sg=1)

In [54]:
print(model)

Word2Vec(vocab=830, size=75, alpha=0.025)


In [55]:
items = list(model.wv.vocab)

In [64]:
# test
# generate random song
test_song = random.choice(counter.most_common(n=75))[0]
test_id = song_to_id[test_song]
# test_id = random.randint(0, len(items))
song = id_to_song[test_id]
test_id, song, counter[song]

(629, 'the endless river | 14 talkin 39 hawkin 39 pink floyd mp3', 15)

In [65]:
similar = model.wv.most_similar(positive=[str(test_id)], topn=10)

In [66]:
for idx, val in similar:
    print(id_to_song[int(idx)])

the endless river | 09 on noodle street pink floyd mp3
the endless river | 16 eyes to pearls pink floyd mp3
the endless river | 10 night light pink floyd mp3
iron maiden phantom of the opera studio version
the endless river | 06 unsung pink floyd mp3
bring me the horizon mantra official audio
the endless river | 11 allonsy 1 pink floyd mp3
gary jules mad world song + lyrics
iron maidendeja vu
devin townsend project failure album track
