## This notebook creates the artist embeddings and trains the corresponding KNN model

In [None]:
# Mount Google Drive at /content/gdrive; the input and output paths used by
# this notebook live under this mount point. Only works inside Google Colab.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
%%capture
!pip install pyrdf2vec rdflib

In [None]:
# Project folder on the mounted Drive; input files are read from here.
path = "/content/gdrive/MyDrive/SWT/"
# Folder that receives every artefact written by this notebook.
save_path = "/content/gdrive/MyDrive/SWT/output/"

In [None]:
%%capture
import pandas as pd
import rdflib
from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker

### Load the KG and create embeddings

In [None]:
# Seed handed to the random walker so that repeated runs sample the same walks.
# NOTE(review): per the pyRDF2Vec documentation, fully identical embeddings
# additionally require a fixed PYTHONHASHSEED and a single-worker Word2Vec;
# those are left unchanged here to keep the training speed as it was.
RANDOM_STATE = 42

transformer = RDF2VecTransformer(
    # Embedder: Word2Vec trained for 10 epochs.
    Word2Vec(epochs=10),
    # One random-walk strategy; the positional arguments 4 and 10 are the
    # walker's first two parameters (max depth and max walks in pyRDF2Vec's
    # RandomWalker signature -- confirm against the installed version).
    walkers=[
        RandomWalker(
            4,
            10,
            with_reverse=False,
            n_jobs=2,
            random_state=RANDOM_STATE,
        )
    ],
    verbose=1,
)

In [None]:
# Predicates handed to KG as `skip_predicates`, grouped by vocabulary.
# The value is a set, so the ordering below carries no meaning.
skipped_predicates = {
    # FOAF and schema.org
    "http://xmlns.com/foaf/0.1/name",
    "http://schema.org/disambiguatingDescription",
    # Music Ontology
    "http://purl.org/ontology/mo/musicbrainz_guid",
    "http://purl.org/ontology/mo/discogs",
    "http://purl.org/ontology/mo/musicbrainz",
    "http://purl.org/ontology/mo/myspace",
    "http://purl.org/ontology/mo/homepage",
    "http://purl.org/ontology/mo/uuid",
    "http://purl.org/ontology/mo/wikipedia",
    # WASABI ontology
    "http://ns.inria.fr/wasabi/ontology/iTunes_page",
    "http://ns.inria.fr/wasabi/ontology/secondHandSongs_page",
    "http://ns.inria.fr/wasabi/ontology/amazon_page",
    "http://ns.inria.fr/wasabi/ontology/name_without_accent",
    "http://ns.inria.fr/wasabi/ontology/wikidata_page",
    "http://ns.inria.fr/wasabi/ontology/musicbrainz_id",
    "http://ns.inria.fr/wasabi/ontology/BBC_page",
    "http://ns.inria.fr/wasabi/ontology/instagram_page",
    "http://ns.inria.fr/wasabi/ontology/discogs_id",
    "http://ns.inria.fr/wasabi/ontology/twitter_page",
    "http://ns.inria.fr/wasabi/ontology/allMusic_page",
    "http://ns.inria.fr/wasabi/ontology/deezer_page",
    "http://ns.inria.fr/wasabi/ontology/deezer_artist_id",
    "http://ns.inria.fr/wasabi/ontology/soundCloud_page",
    "http://ns.inria.fr/wasabi/ontology/pureVolume_page",
    "http://ns.inria.fr/wasabi/ontology/lastFm_page",
    "http://ns.inria.fr/wasabi/ontology/googlePlus_page",
    "http://ns.inria.fr/wasabi/ontology/youTube_page",
    "http://ns.inria.fr/wasabi/ontology/rateYourMusic_page",
    "http://ns.inria.fr/wasabi/ontology/wikia_page",
    "http://ns.inria.fr/wasabi/ontology/spotify_page",
    "http://ns.inria.fr/wasabi/ontology/facebook_page",
}

# Single-predicate chains handed to KG as `literals`. The order is kept as
# written because it is a list and may determine the order of returned values.
literal_chains = [
    ["http://www.w3.org/2000/01/rdf-schema#label"],
    ["http://dbpedia.org/ontology/abstract"],
    ["http://dbpedia.org/ontology/genre"],
    ["http://ns.inria.fr/wasabi/ontology/location"],
    ["http://ns.inria.fr/wasabi/ontology/record_label"],
    ["http://purl.org/dc/terms/subject"],
    ["http://xmlns.com/foaf/0.1/gender"],
    ["http://ns.inria.fr/wasabi/ontology/city"],
    ["http://dbpedia.org/ontology/associatedMusicalArtist"],
    ["http://ns.inria.fr/wasabi/ontology/country"],
]

# Build the knowledge graph from the Turtle file stored under `path`.
kg = KG(
    path + "artist.ttl",
    fmt="turtle",
    skip_predicates=skipped_predicates,
    literals=literal_chains,
)

In [None]:
# Read the entity table and keep its first column as a plain Python list;
# this list is what gets passed to the transformer as the entities to embed.
entities_csv = path + "artist_entities.csv"
data = pd.read_csv(entities_csv)
first_column = data.columns[0]
entities = data[first_column].values.tolist()

In [None]:
# Fit the transformer on the graph and get, for the given entities, the
# embeddings together with the extracted literals.
embeddings, literals = transformer.fit_transform(kg, entities)

100%|██████████| 77491/77491 [02:05<00:00, 615.16it/s]


Extracted 584390 walks for 77491 entities (126.4480s)
Fitted 584390 walks (26.8101s)


100%|██████████| 77491/77491 [00:14<00:00, 5513.72it/s]

Extracted 77491 literals for 77491 entities (14.0602s)





### Save embeddings

In [None]:
import numpy as np

In [None]:
import pickle

In [None]:
# Convert the embeddings to a NumPy array and write them under save_path.
output_file_path = save_path + "artist_embeddings.npy"
embeddings_array = np.array(embeddings)
np.save(output_file_path, embeddings_array)

In [None]:
# The original `np.array(literals)` call emitted a NumPy warning pointing at
# that line -- presumably the ragged-nested-sequence deprecation, which raises
# ValueError on NumPy >= 1.24. Requesting an object array explicitly keeps the
# nested structure and works on both old and new NumPy versions.
literals_array = np.array(literals, dtype=object)
output_file_path = save_path + "artist_literals.npy"
# Object arrays are stored via pickle, so readers must call
# np.load(output_file_path, allow_pickle=True).
np.save(output_file_path, literals_array)

  literals_array = np.array(literals)


### Train and save the KNN model

In [None]:
from sklearn.neighbors import NearestNeighbors

# Number of neighbours the model returns by default.
n_neighbors = 5

# Reload the embeddings written earlier so the model is fitted on exactly
# what is stored on disk.
embeddings = np.load(save_path + 'artist_embeddings.npy')

# Brute-force search with the cosine metric.
knn_model = NearestNeighbors(
    metric='cosine',
    algorithm='brute',
    n_neighbors=n_neighbors,
)
knn_model.fit(embeddings)

In [None]:
%%capture
!pip install joblib

In [None]:
import joblib

# Serialise the fitted NearestNeighbors model next to the other outputs.
model_filename = save_path + 'artist_knn_model.pkl'
joblib.dump(value=knn_model, filename=model_filename)

In [None]:
# Array form of the entity list that was passed to fit_transform; presumably
# stored so positions in the embeddings can be mapped back to entities --
# confirm with the consumer of these files.
np_ents = np.array(entities)

In [None]:
# Write the entity array alongside the embeddings and the KNN model.
entities_file = save_path + 'artist_entities.npy'
np.save(entities_file, np_ents)