## This notebook creates the album embeddings and trains the corresponding KNN model

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab VM at a fixed mount point.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
%%capture
!pip install pyrdf2vec rdflib

In [None]:
# Drive folder the input files are read from; it ends with '/' so file names
# can be appended to it directly.
path = '/content/gdrive/MyDrive/SWT/'
# Prefix for every artifact this notebook writes. It is combined with file names by
# plain string concatenation, so outputs end up as '.../final_data/final' + name,
# e.g. '.../final_data/finalalbum_embeddings.npy'.
# NOTE(review): there is no trailing '/' or '_' here — confirm this naming is intended.
save_path = '/content/gdrive/MyDrive/SWT/final_data/final'

In [None]:
%%capture
import pandas as pd
import rdflib
from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker

### Load the KG and create embeddings

In [None]:
# Seed for the walking strategy so that repeated runs sample the same walks.
# NOTE: per the pyRDF2Vec README, fully deterministic embeddings additionally require
# single-threaded Word2Vec training (workers=1) and a fixed PYTHONHASHSEED; those are
# left unchanged here to keep the training time as it was.
RANDOM_STATE = 42

# RDF2Vec pipeline: random walks over the graph, embedded with Word2Vec.
transformer = RDF2VecTransformer(
    # Word2Vec embedder trained for 10 epochs.
    Word2Vec(epochs=10),
    walkers=[
        # Walk depth 4, at most 10 walks per entity, no reverse edges, 2 worker processes.
        RandomWalker(
            4,
            10,
            with_reverse=False,
            n_jobs=2,
            random_state=RANDOM_STATE,
        )
    ],
    verbose=1,
)

In [None]:
# Predicates handed to the KG as `skip_predicates` (judging by their names:
# links to external pages and catalogue identifiers).
album_skip_predicates = {
    "http://ns.inria.fr/wasabi/ontology/iTunes_page",
    "http://purl.org/ontology/mo/musicbrainz_guid",
    "http://purl.org/ontology/mo/discogs",
    "http://ns.inria.fr/wasabi/ontology/amazon_page",
    "http://purl.org/ontology/mo/musicbrainz",
    "http://schema.org/barcode",
    "http://ns.inria.fr/wasabi/ontology/discogs_id",
    "http://ns.inria.fr/wasabi/ontology/allMusic_page",
    "http://ns.inria.fr/wasabi/ontology/deezer_page",
    "http://ns.inria.fr/wasabi/ontology/deezer_album_id",
    "http://purl.org/ontology/mo/upc",
    "http://purl.org/ontology/mo/homepage",
    "http://purl.org/ontology/mo/uuid",
    "http://purl.org/ontology/mo/wikipedia",
    "http://ns.inria.fr/wasabi/ontology/spotify_page",
}

# Predicate chains handed to the KG as `literals`; each chain is a single predicate.
album_literal_chains = [
    ["http://purl.org/dc/terms/title"],
    ["http://purl.org/ontology/mo/genre"],
    ["http://purl.org/ontology/mo/performer"],
    ["http://purl.org/dc/terms/language"],
    ["http://schema.org/location"],
    ["http://ns.inria.fr/wasabi/ontology/has_explicit_lyrics"],
]

# Knowledge graph read from the Turtle file in the input folder.
kg = KG(
    path + "album.ttl",
    fmt='turtle',
    skip_predicates=album_skip_predicates,
    literals=album_literal_chains,
)

In [None]:
# The entities to embed are the values of the CSV's first column, whatever its header is.
data = pd.read_csv(path + "album_entities.csv")
entities = data.iloc[:, 0].values.tolist()

In [None]:
# Fit the transformer on the graph for the listed entities; the call returns a pair
# that is unpacked into the embeddings and the extracted literals.
fit_result = transformer.fit_transform(kg, entities)
embeddings, literals = fit_result

100%|██████████| 208742/208742 [04:23<00:00, 793.48it/s]


Extracted 1497945 walks for 208742 entities (263.7932s)
Fitted 1497945 walks (55.5794s)


100%|██████████| 208742/208742 [00:21<00:00, 9716.55it/s]

Extracted 208742 literals for 208742 entities (21.4883s)





### Save embeddings, literals, and entities as .npy files

In [None]:
import numpy as np

In [None]:
import pickle

In [None]:
# Write the embeddings to disk as one NumPy array.
output_file_path = f"{save_path}album_embeddings.npy"
embeddings_array = np.array(embeddings)
np.save(output_file_path, embeddings_array)

In [None]:
# Write the extracted literals to disk as one NumPy array.
# NOTE(review): this presumably relies on every entity's literal list having the same
# regular shape; confirm that no literal is multi-valued, since recent NumPy versions
# reject ragged input here.
output_file_path = f"{save_path}album_literals.npy"
literals_array = np.array(literals)
np.save(output_file_path, literals_array)

In [None]:
# Write the entity list to disk in the same order it was passed to the transformer.
entities_file_path = save_path + 'album_entities.npy'
np_ents = np.array(entities)
np.save(entities_file_path, np_ents)

### Train and save the KNN model

In [None]:
from sklearn.neighbors import NearestNeighbors

# Reload the vectors from the file written earlier; this rebinds `embeddings`
# to the array returned by np.load.
embeddings = np.load(save_path + 'album_embeddings.npy')

# Brute-force nearest-neighbour index using cosine distance, 5 neighbours per query.
n_neighbors = 5
knn_model = NearestNeighbors(
    metric='cosine',
    algorithm='brute',
    n_neighbors=n_neighbors,
)
knn_model.fit(embeddings)

In [None]:
%%capture
!pip install joblib

In [None]:
import joblib

# Serialize the fitted NearestNeighbors model to disk with joblib.
model_filename = f"{save_path}album_knn_model.pkl"
joblib.dump(knn_model, model_filename)