In [21]:
# Mount Google Drive at /content/gdrive so later cells can read and write
# files below that directory.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
%%capture
!pip install pyrdf2vec rdflib

In [16]:
# Project folder on the mounted Drive; input files are read from here.
path = '/content/gdrive/MyDrive/SWT/'
# Output folder for the generated arrays, located directly under `path`.
save_path = path + 'final_data/'

In [3]:
%%capture
import pandas as pd
import rdflib
from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker

In [4]:
# Seed for the random walks so the same walks are sampled on every run.
RANDOM_STATE = 42

# RDF2Vec pipeline: random walks are extracted from the graph and fed to a
# Word2Vec model trained for 10 epochs.
# RandomWalker(4, 10, ...) -> per the pyrdf2vec API the positional arguments
# are the maximum walk depth and the maximum number of walks per entity;
# reverse walks are disabled and extraction uses 2 processes.
# NOTE(review): seeding the walker fixes the walks only. Fully deterministic
# embeddings presumably also need Word2Vec(workers=1) and a fixed
# PYTHONHASHSEED — confirm against the pyrdf2vec documentation before relying
# on bit-identical vectors.
transformer = RDF2VecTransformer(
    Word2Vec(epochs=10),
    walkers=[
        RandomWalker(
            4,
            10,
            with_reverse=False,
            n_jobs=2,
            random_state=RANDOM_STATE,
        )
    ],
    verbose=1,
)

In [None]:
# Predicates removed from the graph before walks are extracted. Judging by
# their names these are mostly external-page links, identifiers and other
# per-album metadata.
skipped_predicates = {
    "http://ns.inria.fr/wasabi/ontology/iTunes_page",
    "http://purl.org/ontology/mo/musicbrainz_guid",
    "http://purl.org/ontology/mo/discogs",
    "http://ns.inria.fr/wasabi/ontology/amazon_page",
    "http://schema.org/disambiguatingDescription",
    "http://purl.org/ontology/mo/musicbrainz",
    "http://schema.org/datePublished",
    "http://schema.org/barcode",
    "http://ns.inria.fr/wasabi/ontology/discogs_id",
    "http://ns.inria.fr/wasabi/ontology/allMusic_page",
    "http://ns.inria.fr/wasabi/ontology/deezer_page",
    "http://ns.inria.fr/wasabi/ontology/deezer_album_id",
    "http://purl.org/ontology/mo/upc",
    "http://ns.inria.fr/wasabi/ontology/album_length",
    "http://purl.org/ontology/mo/homepage",
    "http://purl.org/ontology/mo/uuid",
    "http://purl.org/ontology/mo/wikipedia",
    "http://ns.inria.fr/wasabi/ontology/spotify_page",
}

# One single-predicate chain per literal that should be collected for every
# entity.
literal_chains = [
    ["http://purl.org/dc/terms/title"],
    ["http://purl.org/ontology/mo/genre"],
    ["http://schema.org/releaseDate"],
    ["http://purl.org/ontology/mo/performer"],
    ["http://purl.org/dc/terms/language"],
    ["http://schema.org/location"],
    ["http://ns.inria.fr/wasabi/ontology/deezer_fans"],
    ["http://ns.inria.fr/wasabi/ontology/has_explicit_lyrics"],
]

# Knowledge graph built from the Turtle file stored under `path`.
kg = KG(
    path + "album.ttl",
    fmt='turtle',
    skip_predicates=skipped_predicates,
    literals=literal_chains,
)

In [22]:
# Read the entity table and keep its first column as a plain Python list;
# these are the entities handed to the transformer.
data = pd.read_csv(path + "album_entities.csv")
entities = data.iloc[:, 0].values.tolist()

In [7]:
# Fit the transformer on the graph and obtain, for the listed entities, both
# the embeddings and the extracted literals.
embeddings, literals = transformer.fit_transform(kg, entities)

100%|██████████| 208742/208742 [03:52<00:00, 897.24it/s]


Extracted 1339461 walks for 208742 entities (233.2740s)
Fitted 1339461 walks (48.0241s)


100%|██████████| 208742/208742 [00:22<00:00, 9479.28it/s]

Extracted 208742 literals for 208742 entities (22.0251s)





In [10]:
import numpy as np

In [11]:
import pickle

In [17]:
# Write the embeddings to the output folder as a NumPy .npy file.
output_file_path = save_path + "album_embeddings.npy"
embeddings_array = np.array(embeddings)
np.save(output_file_path, embeddings_array)

In [18]:
# Write the extracted literals to the output folder as a NumPy .npy file.
output_file_path = save_path + "album_literals.npy"
literals_array = np.array(literals)
np.save(output_file_path, literals_array)

In [70]:
from sklearn.neighbors import NearestNeighbors

# NOTE(review): this loads `path + 'embeddings.npy'`, while the embeddings
# produced above are written to `save_path + "album_embeddings.npy"`.
# Confirm which file is intended; the path is left unchanged here.
embeddings = np.load(path + 'embeddings.npy')

# Brute-force cosine-distance index over all embedding rows.
n_neighbors = 5
knn_model = NearestNeighbors(n_neighbors=n_neighbors, algorithm='brute', metric='cosine')
knn_model.fit(embeddings)

# Query the index so that `indices` exists for the cells below; without this
# call the notebook fails on a fresh kernel.
# NOTE(review): the query row (3454) is reconstructed from the stored cell
# output, where index 3454 is its own nearest neighbour at distance ~0 —
# confirm it is the intended example.
query_index = 3454
distances, indices = knn_model.kneighbors(embeddings[query_index].reshape(1, -1))

print("Indices of Nearest Neighbors:", indices)
print("Distances to Nearest Neighbors:", distances)
print(type(indices))

Indices of Nearest Neighbors: [[  3454  99478 192706 192958   9579]]
Distances to Nearest Neighbors: [[1.1920929e-07 3.2163680e-02 3.2301426e-02 3.6226273e-02 3.6351860e-02]]
<class 'numpy.ndarray'>


In [None]:
import numpy as np

In [72]:
# Print the entity stored at each position listed in the first row of
# `indices`.
for neighbour_idx in indices[0]:
    print(entities[neighbour_idx])

http://ns.inria.fr/wasabi/album/5714debb25ac0d8aee35383e
http://ns.inria.fr/wasabi/album/5714debb25ac0d8aee36085b
http://ns.inria.fr/wasabi/album/5714debb25ac0d8aee35618b
http://ns.inria.fr/wasabi/album/5714debb25ac0d8aee356f2c
http://ns.inria.fr/wasabi/album/5714debe25ac0d8aee37119c


In [23]:
# Save the entity list as a NumPy array in the output folder.
# NOTE(review): presumably row i of the saved embeddings corresponds to
# entity i of this array — confirm before joining the two files.
np_ents = np.array(entities)
np.save(save_path + 'album_entities.npy', np_ents)