In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [6]:
%%capture
!pip install pyrdf2vec rdflib

In [7]:
path = '/content/gdrive/MyDrive/SWT/'
save_path = '/content/gdrive/MyDrive/SWT/final_data/'

In [8]:
%%capture
import pandas as pd
import rdflib
from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker

In [9]:
transformer = RDF2VecTransformer(
    Word2Vec(epochs=10),
    walkers=[RandomWalker(4, 10, with_reverse=False, n_jobs=2)],
    verbose=1
)

In [10]:
kg = KG(
        path + "artist.ttl",
        fmt='turtle',
        skip_predicates = {
            "http://xmlns.com/foaf/0.1/name",
            "http://ns.inria.fr/wasabi/ontology/iTunes_page",
            "http://purl.org/ontology/mo/musicbrainz_guid",
            "http://purl.org/ontology/mo/discogs",
            "http://ns.inria.fr/wasabi/ontology/secondHandSongs_page",
            "http://ns.inria.fr/wasabi/ontology/amazon_page",
            "http://ns.inria.fr/wasabi/ontology/name_without_accent",
            "http://schema.org/disambiguatingDescription",
            "http://ns.inria.fr/wasabi/ontology/wikidata_page",
            "http://purl.org/ontology/mo/musicbrainz",
            "http://ns.inria.fr/wasabi/ontology/musicbrainz_id",
            "http://purl.org/dc/terms/abstract",
            "http://ns.inria.fr/wasabi/ontology/BBC_page",
            "http://ns.inria.fr/wasabi/ontology/instagram_page",
            "http://ns.inria.fr/wasabi/ontology/discogs_id",
            "http://purl.org/ontology/mo/myspace",
            "http://ns.inria.fr/wasabi/ontology/twitter_page",
            "http://ns.inria.fr/wasabi/ontology/allMusic_page",
            "http://ns.inria.fr/wasabi/ontology/deezer_page",
            "http://ns.inria.fr/wasabi/ontology/deezer_artist_id",
            "http://ns.inria.fr/wasabi/ontology/soundCloud_page",
            "http://schema.org/genre",
            "http://ns.inria.fr/wasabi/ontology/pureVolume_page",
            "http://www.w3.org/2002/07/owl#sameAs",
            "http://purl.org/ontology/mo/homepage",
            "http://ns.inria.fr/wasabi/ontology/lastFm_page",
            "http://ns.inria.fr/wasabi/ontology/googlePlus_page",
            "http://ns.inria.fr/wasabi/ontology/youTube_page",
            "http://purl.org/ontology/mo/uuid",
            "http://ns.inria.fr/wasabi/ontology/rateYourMusic_page",
            "http://purl.org/ontology/mo/wikipedia",
            "http://ns.inria.fr/wasabi/ontology/wikia_page",
            "http://ns.inria.fr/wasabi/ontology/spotify_page",
            "http://ns.inria.fr/wasabi/ontology/facebook_page"
        },
        literals = [
        ['http://www.w3.org/2000/01/rdf-schema#label'],
        # ['http://dbpedia.org/ontology/abstract'],
        ['http://dbpedia.org/ontology/genre'],
        ['http://ns.inria.fr/wasabi/ontology/location'],
        ['http://ns.inria.fr/wasabi/ontology/record_label'],
        ['http://purl.org/dc/terms/subject'],
        ['http://schema.org/birthDate'],
        ['http://xmlns.com/foaf/0.1/gender'],
        ['http://ns.inria.fr/wasabi/ontology/city'],
        ['http://ns.inria.fr/wasabi/ontology/deezer_fans'],
        ['http://dbpedia.org/ontology/associatedMusicalArtist'],
        ['http://schema.org/foundingDate'],
        ['http://schema.org/dissolutionDate'],
        ['http://ns.inria.fr/wasabi/ontology/country']
    ],
)

In [11]:
data = pd.read_csv(path + "artist_entities.csv")
entities = data[data.columns[0]].values.tolist()

In [12]:
embeddings, literals = transformer.fit_transform(
    kg,
    entities
)

100%|██████████| 77491/77491 [01:55<00:00, 672.44it/s]


Extracted 557491 walks for 77491 entities (115.6691s)
Fitted 557491 walks (23.4363s)


100%|██████████| 77491/77491 [00:13<00:00, 5654.30it/s]

Extracted 77491 literals for 77491 entities (13.7110s)





In [13]:
import numpy as np

In [14]:
import pickle

In [15]:
embeddings_array = np.array(embeddings)
output_file_path = save_path + "artist_embeddings_noab.npy"
np.save(output_file_path, embeddings_array)

In [16]:
literals_array = np.array(literals)
output_file_path = save_path + "artist_literals_noab.npy"
np.save(output_file_path, literals_array)

  literals_array = np.array(literals)


In [20]:
from sklearn.neighbors import NearestNeighbors

embeddings = np.load(save_path + 'artist_embeddings_noab.npy')

n_neighbors = 5
knn_model = NearestNeighbors(n_neighbors=n_neighbors, algorithm='brute', metric='cosine')
knn_model.fit(embeddings)

Indices of Nearest Neighbors: [[47083 59491 42117 11522 73101]]
Distances to Nearest Neighbors: [[0.         0.05574459 0.05666602 0.07660633 0.08123285]]
<class 'numpy.ndarray'>


In [21]:
for i in indices[0]:
  print(entities[i])

http://ns.inria.fr/wasabi/artist/56d8258553a7ddfc01f941f7
http://ns.inria.fr/wasabi/artist/56d850dd53a7ddfc01f97f7f
http://ns.inria.fr/wasabi/artist/56d80f2b53a7ddfc01f92234
http://ns.inria.fr/wasabi/artist/56d81f1c53a7ddfc01f93867
http://ns.inria.fr/wasabi/artist/56d9659dcc2ddd0c0f6ba833


In [17]:
%%capture
!pip install joblib

In [22]:
import joblib

# Save the NearestNeighbors model to a file
model_filename = save_path + 'knn_model.pkl'
joblib.dump(knn_model, model_filename)

In [25]:
np_ents = np.array(entities)

In [26]:
np.save(save_path + 'artist_entities.npy', np_ents)