In [1]:
# Mount Google Drive at /content/drive; the data archive and the model
# checkpoints used below live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip /content/drive/MyDrive/music_recommendation/Data.zip

Archive:  /content/drive/MyDrive/music_recommendation/Data.zip
  inflating: test_data.pkl           
  inflating: test_label.pkl          
  inflating: train_data.pkl          
  inflating: train_label.pkl         


In [3]:
!pip install transformers  # text processing

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 8.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 7.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 52.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 50.9 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 54.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

In [4]:
from tensorflow import keras
from transformers import TFAutoModel, AutoTokenizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pickle


def get_survey_model(hidden_dim=256, out_dim=128):
    """Build the MLP that maps a survey feature vector to an embedding.

    The network is three ReLU ``Dense`` layers of width ``hidden_dim``
    followed by one ReLU ``Dense`` layer of width ``out_dim``. No input shape
    is declared here, so the feature width is taken from the first batch the
    model is called on.

    Shapes for a batch of N surveys with K features each:
        input          (N, K)
        hidden 1..3    (N, hidden_dim)
        output         (N, out_dim)

    Args:
        hidden_dim: width of each of the three hidden layers.
        out_dim: width of the returned embedding.

    Returns:
        A ``keras.Sequential`` model made of the four layers above.
    """
    hidden_layers = [
        keras.layers.Dense(hidden_dim, activation='relu') for _ in range(3)
    ]
    # NOTE(review): the output layer is also ReLU, so every component of the
    # survey embedding is non-negative -- confirm this is intended.
    output_layer = keras.layers.Dense(out_dim, activation='relu')
    return keras.Sequential(hidden_layers + [output_layer])


def get_text_model():
    """Load the pretrained ``distilbert-base-uncased`` encoder.

    Returns:
        The ``TFAutoModel`` instance. Callers read the embedding width from
        its ``config.hidden_size``.
    """
    return TFAutoModel.from_pretrained('distilbert-base-uncased')

def get_tokenizer():
    """Load the tokenizer that pairs with the ``distilbert-base-uncased`` model.

    The tokenizer splits raw text into tokens and maps them to integer ids,
    e.g. "I like to eat hamburgers" -> [i, like, to, eat, ...] -> [id, id, ...].
    """
    return AutoTokenizer.from_pretrained('distilbert-base-uncased')


class MusicRecommendationModel(keras.Model):
    """Scores how well a song's lyrics match a listener's survey answers.

    Two encoders feed one cosine-similarity head:
      * ``survey_model`` embeds the survey vector (see ``get_survey_model``);
      * ``text_model`` embeds the lyrics (see ``get_text_model``); the hidden
        state at sequence position 0 is used as the lyric vector.
    The survey embedding width is set to ``text_model.config.hidden_size`` so
    both vectors share the same dimension E.
    """

    def __init__(self, hidden_dim, database=None):
        """Create both encoders, the tokenizer and the similarity layer.

        Args:
            hidden_dim: width of the survey model's hidden layers.
            database: optional ``Database`` used by ``recommend``.
        """
        super().__init__()
        self.text_model = get_text_model()
        self.tokenizer = get_tokenizer()
        embedding_dim = self.text_model.config.hidden_size
        self.survey_model = get_survey_model(hidden_dim, embedding_dim)
        # Dot over the last axis with normalize=True L2-normalizes both inputs
        # before the dot product, i.e. it computes cosine similarity:
        #   a: [N, E], b: [N, E]  ->  [N, 1] with values in [-1, 1].
        self.similarity = keras.layers.Dot(-1, normalize=True)
        self.database = database

    def call(self, inputs):
        """Return the cosine similarity of survey and lyric embeddings.

        Args:
            inputs: mapping with a 'survey' entry (survey features, [N, K]);
                every other entry is forwarded to the text model as a
                keyword argument.

        Returns:
            Tensor of shape [N, 1] with values in [-1, 1].
        """
        # Everything except the survey belongs to the text model.
        text_inputs = {key: value for key, value in inputs.items() if key != 'survey'}

        # 1. survey answers -> survey vector
        survey_vector = self.survey_model(inputs['survey'])  # [N, E]

        # 2. tokenized lyrics -> per-token states -> state at position 0
        token_states = self.text_model(**text_inputs).last_hidden_state
        lyric_vector = token_states[:, 0, :]  # [N, E]

        # 3. cosine similarity between the two vectors
        return self.similarity([lyric_vector, survey_vector])

    def recommend(self, survey, recommendations=5):
        """Look up the songs whose lyric embeddings best match ``survey``.

        Args:
            survey: survey features for one listener (expected [1, K]).
            recommendations: number of songs to return.

        Returns:
            Whatever ``Database.search`` returns, or ``None`` (after printing
            a notice) when no database has been attached yet.
        """
        if self.database is not None:
            query = self.survey_model(survey).numpy()  # [1, E]
            return self.database.search(query, topk=recommendations)
        print('database not initialized yet')
        return None

    def cache_database(self, all_songs):
        """Embed every song's lyrics and keep the result as ``self.database``.

        Args:
            all_songs: pandas DataFrame with 'lyrics', 'uri' and 'name' columns.

        Side effects:
            Sets ``self.database`` and pickles it to 'database.pkl'.
        """
        song_vectors = []
        song_info = []
        for row in range(len(all_songs)):  # one song per iteration
            song = all_songs.iloc[row]
            lyric_text = song['lyrics']
            info = song[['uri', 'name']]  # song uri, song name
            encoded = self.tokenizer(lyric_text,
                                     padding=True,
                                     truncation=True,
                                     max_length=128,  # cap on the tokenized length
                                     return_tensors='tf')
            first_token_state = self.text_model(**encoded).last_hidden_state[:, 0, :]
            # TensorFlow tensor -> numpy, then drop the size-1 batch axis.
            song_vectors.append(first_token_state.numpy().squeeze())
            song_info.append(info)

        stacked = np.stack(song_vectors, 0)  # [N, E]

        self.database = Database(song_info, stacked)
        self.database.save('database.pkl')  # save to file



class Database:
    """In-memory store of song metadata and the matching lyric embeddings.

    ``metadata[i]`` describes the song whose embedding is ``embeddings[i]``.
    """

    def __init__(self, metadata, embeddings):
        """
        Args:
            metadata: sequence with one entry per song.
            embeddings: array of shape [N, E], one row per song.
        """
        self.metadata = metadata
        self.embeddings = embeddings

    def search(self, vector, topk=5):
        """Return the metadata of the ``topk`` songs most similar to ``vector``.

        Similarity is the cosine similarity between ``vector`` and every
        stored embedding, computed in one vectorized pass.

        Args:
            vector: query embedding with E values (any shape, e.g. [1, E]).
            topk: maximum number of results.

        Returns:
            List of metadata entries ordered from most to least similar. It is
            shorter than ``topk`` when the database holds fewer songs and
            empty when the database is empty.

        Raises:
            ValueError: if ``vector`` does not have E values.
        """
        embeddings = np.asarray(self.embeddings)
        if embeddings.size == 0:
            return []
        embeddings = embeddings.reshape(len(embeddings), -1)  # [N, E]
        query = np.asarray(vector).reshape(-1)  # [E]

        # cos(a, b) = a.b / (|a| |b|). A zero-length vector would divide by
        # zero; its dot product is 0 anyway, so dividing by 1 yields 0.
        norms = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query)
        similarities = (embeddings @ query) / np.where(norms == 0, 1.0, norms)  # [N]

        # The scores must be a flat [N] array before sorting: argsort over a
        # trailing length-1 axis would return only zeros.
        ranking = np.argsort(-similarities, kind='stable')  # highest sim first

        return [self.metadata[int(idx)] for idx in ranking[:topk]]

    def save(self, filename):
        """Pickle this database (metadata and embeddings) to ``filename``."""
        with open(filename, 'wb') as f:
            pickle.dump(self, f)
      #save the database into somewhere


## Load Data

In [5]:
import pickle


def _read_pickle(path):
    """Unpickle and return the object stored at ``path``."""
    # NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Same four files, read in the same order as they are extracted from Data.zip.
test_data = _read_pickle('test_data.pkl')
train_data = _read_pickle('train_data.pkl')
test_label = _read_pickle('test_label.pkl')
train_label = _read_pickle('train_label.pkl')


In [6]:
# Each record is indexed positionally: [0] is the survey feature array and
# [1] is the lyric. Surveys are stacked into one 2-D array; lyrics stay a
# plain Python list.
train_surveys = np.stack([record[0] for record in train_data], axis=0)
train_lyrics = [record[1] for record in train_data]

test_surveys = np.stack([record[0] for record in test_data], axis=0)
test_lyrics = [record[1] for record in test_data]

In [7]:
# Convert the unpickled labels to NumPy arrays so they can be indexed with an
# array of indices (Dataloader.__getitem__ does self.labels[indices]).
train_label = np.array(train_label)
test_label = np.array(test_label)

In [8]:
# Sanity check: surveys, labels and lyrics should all report the same number of training examples.
train_surveys.shape, train_label.shape, len(train_lyrics)

((41942, 65), (41942,), 41942)

In [9]:
# Sanity check: surveys, labels and lyrics should all report the same number of test examples.
test_surveys.shape, test_label.shape, len(test_lyrics)

((10486, 65), (10486,), 10486)

In [10]:
import math
from tensorflow.keras.utils import Sequence

class Dataloader(Sequence):
    """Keras ``Sequence`` that serves ``(inputs, labels)`` batches.

    ``inputs`` is a dict holding the batch's survey rows under 'survey' plus
    every field the tokenizer returns for the batch's lyrics; this is the
    layout ``MusicRecommendationModel.call`` consumes.

    Args:
        surveys: array of survey features, indexable with an index array.
        lyrics: list of lyric strings, parallel to ``surveys``.
        labels: array of targets, parallel to ``surveys``.
        tokenizer: callable applied to each batch of lyrics.
        batch_size: number of examples per batch.
        total_size: optional cap on the number of examples served per epoch.
            Values larger than the dataset are clamped to the dataset size.
        shuffle: when true, the example order is reshuffled at construction
            and at the end of every epoch.
    """

    def __init__(self, surveys, lyrics, labels, tokenizer, batch_size, total_size=None, shuffle=False):
        self.surveys, self.lyrics, self.labels = surveys, lyrics, labels
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.total_size = total_size
        self.on_epoch_end()  # creates self.indices (shuffled if requested)

    def _epoch_size(self):
        """Number of examples one epoch covers, never more than the dataset."""
        available = len(self.surveys)
        if self.total_size is None:
            return available
        # Without the clamp an oversized total_size would advertise batches
        # past the end of the data, and those batches would be empty.
        return min(self.total_size, available)

    def __len__(self):
        """Number of batches per epoch."""
        return math.ceil(self._epoch_size() / self.batch_size)

    def __getitem__(self, idx):
        """Build batch number ``idx``.

        Returns:
            Tuple ``(inputs, labels)`` where ``inputs`` is the dict described
            in the class docstring and ``labels`` is a NumPy array.
        """
        # Acts as the sampler: picks the batch_size example indices for this batch.
        indices = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]

        batch_surveys = self.surveys[indices]
        batch_labels = self.labels[indices]

        # lyrics is a plain list, so it cannot be indexed with an array.
        batch_lyrics = [self.lyrics[i] for i in indices]
        batch_lyrics = self.tokenizer(batch_lyrics,
                                      padding=True,
                                      truncation=True,
                                      max_length=128,  # cap on tokens per lyric
                                      return_tensors='tf')
        return ({'survey': np.array(batch_surveys), **batch_lyrics},
                np.array(batch_labels))

    def on_epoch_end(self):
        """Reset the example order, shuffling it when ``shuffle`` is set."""
        self.indices = np.arange(len(self.surveys))
        if self.shuffle:
            np.random.shuffle(self.indices)


## Train

In [11]:
# Hyperparameter grid: the training cell below loops over every
# (hidden_dim, learning_rate, batch_size) combination.
hidden_dims = [64, 128, 256, 512, 1024]
# Earlier trial: learning_rates = [1e-7, 0.001]. Rule of thumb when searching:
# step by powers of 10, or roughly double each time (0.5 -> 1 -> 5 -> 10).
learning_rates = [1e-5,5e-5]
batch_sizes = [8, 16, 32]

In [17]:
import os

# Grid search: train one model per (hidden_dim, learning_rate, batch_size)
# combination, checkpoint it to Drive and print its loss on the test subset.
for h in hidden_dims:
    for lr in learning_rates:
        for bs in batch_sizes:
            # Checkpoint path prefix; the hyperparameters are encoded in the name.
            save_name = '/content/drive/MyDrive/music_recommendation/checkpoints/ckpt2' + f'hidden-{h}_lr-{lr}_bs-{bs}'

            # Skip configurations that already have a checkpoint so the sweep
            # can be resumed after an interruption.
            # NOTE(review): a run interrupted after its first checkpoint write
            # will also be skipped here, even if it did not finish training --
            # confirm this is acceptable.
            if os.path.exists(save_name + '.index'):
                continue

            model = MusicRecommendationModel(h)

            # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
            model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr),
                          loss=keras.losses.MeanSquaredError(),
                          metrics=None)

            train_loader = Dataloader(train_surveys, train_lyrics, train_label, model.tokenizer, bs, total_size=10000, shuffle=True)
            test_loader = Dataloader(test_surveys, test_lyrics, test_label, model.tokenizer, bs, total_size=2000)

            callback = keras.callbacks.ModelCheckpoint(save_name, save_weights_only=True)

            hist = model.fit(train_loader, epochs=2, callbacks=[callback])

            # test the model
            print(h, lr, bs, model.evaluate(test_loader))


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_layer_norm', 'vocab_projector', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


KeyboardInterrupt: ignored

In [None]:
# Scratch cell: quick check of f-string interpolation. `a` is not referenced
# by any other cell visible in this notebook.
a = 23
f'asdfa sdfasdfasdfa sdfa sdfa sdf {a}'

'asdfa sdfasdfasdfa sdfa sdfa sdf 23'

## Prepare Database

In [20]:
# 1. Build the model whose trained weights will be restored from a checkpoint.
# hidden_dim=1024 has to match the checkpoint loaded in the next cell
# (its file name contains 'hidden-1024').

model = MusicRecommendationModel(1024)


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_layer_norm', 'vocab_projector', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [21]:

# Restore the weights written by the training sweep for
# hidden=1024, lr=5e-05, bs=8.
model.load_weights('/content/drive/MyDrive/music_recommendation/checkpoints/ckpt2hidden-1024_lr-5e-05_bs-8'
) # checkpoint path prefix, i.e. the save_name used during training (no file extension)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fcef0118d10>

In [22]:
# 2. Load all songs from all_songs.csv.
# cache_database reads the 'lyrics', 'uri' and 'name' columns of this frame.

import pandas as pd

all_songs = pd.read_csv('/content/drive/MyDrive/music_recommendation/all_songs.csv')

In [23]:
# 3. Cache the database: embed every song's lyrics with the text model, keep
# the result on model.database and pickle it to 'database.pkl'.

model.cache_database(all_songs)

In [24]:
# Run one forward pass on the first test batch before saving.
# NOTE(review): presumably this is what creates the survey model's variables
# (get_survey_model declares no input shape, and cache_database only calls the
# text model) -- confirm.
test_loader = Dataloader(test_surveys, test_lyrics, test_label, model.tokenizer, 32, total_size=2000)
model(test_loader[0][0])
# Save only the survey encoder's weights.
model.survey_model.save_weights('/content/drive/MyDrive/music_recommendation/checkpoints/survey_best')