In [10]:
import ampligraph
from ampligraph.datasets import load_from_ntriples, load_from_rdf
from ampligraph.evaluation import train_test_split_no_unseen
from ampligraph.latent_features import ComplEx, DistMult, TransE, save_model, restore_model
from ampligraph.evaluation import evaluate_performance, mrr_score, hits_at_n_score, mr_score

from rdflib.util import guess_format
from rdflib import Graph, URIRef, ConjunctiveGraph, Literal, BNode
from rdflib.namespace import RDF, OWL, RDFS
import rdflib

import requests
import urllib.request
import os

import numpy as np
import pandas as pd
from scipy import stats

import pickle

np.set_printoptions(threshold=100)

In [11]:
all_graphs = pd.read_csv("lov_uri-domain-version - lov_uri-domain-version.csv", sep=",")
all_graphs.head()

Unnamed: 0,vocabPrefix,domain,uri version
0,SAN,IoT,https://lov.linkeddata.es/dataset/lov/vocabs/S...
1,SAN,Support,https://lov.linkeddata.es/dataset/lov/vocabs/S...
2,acco,eBusiness,https://lov.linkeddata.es/dataset/lov/vocabs/a...
3,acl,API,https://lov.linkeddata.es/dataset/lov/vocabs/a...
4,acm,Catalogs,https://lov.linkeddata.es/dataset/lov/vocabs/a...


In [12]:
grouped = all_graphs.groupby(["vocabPrefix", "uri version"])["domain"].apply(lambda x: ','.join(x)).reset_index()
grouped.describe()

Unnamed: 0,vocabPrefix,uri version,domain
count,661,661,661
unique,661,660,121
top,rami,https://lov.linkeddata.es/dataset/lov/vocabs/l...,Metadata
freq,1,2,40


In [13]:
Y = grouped.domain.values
vocabs = grouped.vocabPrefix.values
Urls = grouped.iloc[:,1].values
print(len(Urls))
#print(Urls)
vocabs

661


array(['SAN', 'acco', 'acl', ..., 'xhv', 'xkos', 'zbwext'], dtype=object)

In [14]:
# The triplets containing these predicates have to be removed (they are for metadata or are not relevant information)
TEXT_SUFFIXES = ["comment", "description", "label", "definition"]
#TEXT_SUFFIXES = ["comment", "description", "definition"]
#TEXT_SUFFIXES = ["comment", "description"]

# From a rdflib graph, concatenate all rdfs:comments in a string
def process(rdflib_graph):
    full_text = ""
    
    for s,p,o in rdflib_graph:
        # remove literals
        #if type(o) != rdflib.term.Literal:
        suf = rdflib_graph.compute_qname(p)[2]
        if suf in TEXT_SUFFIXES:
            text = str(o)
            if len(text) > 0 and text[-1] != ".":
                text += "."
            full_text += text
            full_text += " "
        
        #print(str(o))
        #print(text)
            
    return full_text

In [15]:
all_texts = []
inds_to_remove = []

for i,(url,vocab) in enumerate(zip(Urls, vocabs)):
    if i % 100 == 0:
        print(i)
    # Load graphs from url
    try:
#         Download the file from `url`, save it in a temporary directory and get the
#         path to it (e.g. '/tmp/tmpb48zma.txt') in the `file_name` variable:
#         file_name, headers = urllib.request.urlretrieve(url)
        
        # Download the graph file from url, if the file does not exist
        path = "vocabs/" + vocab
        if not(os.path.isfile(path)):
            urllib.request.urlretrieve(url, path)
            
        
        # Load rdflib graph from n3 file
        g = Graph()
        data = open(path, "rb")
        g.parse(data, format="n3")
        
        # Extract all text from graph
        text = process(g)
        all_texts.append(text)
    except Exception as e:
        # If graph cannot be loaded, refresh arrays of vocabs and tags
        inds_to_remove.append(i)
        print(e, url)

0
at line 2179 of <>:
Bad syntax (objectList expected) at ^ in:
"...b'lGlassBottle ,\n    container:640mlPlasticBottle ,\n    contai'^b'ner:1_8lPlasticBottle ;\n  rdfs:seeAlso <http://ko.wikipedia.'..." https://lov.linkeddata.es/dataset/lov/vocabs/bevon/versions/2015-07-23.n3
at line 39 of <>:
Bad syntax (objectList expected) at ^ in:
"...b's4:Concept;\n    ns4:prefLabel "Regio"@nl;\n    ns4:definition'^b" '''De gemeenten in Nederland zijn onderverdeeld in wijken e"..." https://lov.linkeddata.es/dataset/lov/vocabs/cbs/versions/2018-05-02.n3
100
200
300
400
500
600


In [16]:
# Remove unloaded graphs in labels arrays
vocabs2 = np.delete(vocabs,inds_to_remove,0)
Y2 = np.delete(Y,inds_to_remove,0)
assert len(all_texts) == len(vocabs2) == len(Y2)

In [17]:
all_texts_length = [len(st) for st in all_texts]
stats.describe(all_texts_length)

DescribeResult(nobs=659, minmax=(0, 1002586), mean=15896.468892261002, variance=3533422744.006249, skewness=12.460220705192233, kurtosis=186.43158015783698)

In [18]:
inds_len0 = np.where(np.array(all_texts_length) == 0)
inds_len0

(array([201, 248, 269]),)

In [19]:
# 3 vocabs n'ont pas de textes dans leur n3. inno est un fichier vide et gci et keys n'ont pas de commentaires/labels
print(vocabs2[201],vocabs2[248],vocabs2[269])

gci inno keys


In [20]:
# Delete vocabs with no text at all
for ind in inds_len0[0]:
    del all_texts[ind]
vocabs3 = np.delete(vocabs2,inds_len0[0],0)
Y3 = np.delete(Y2,inds_len0[0],0)

assert len(all_texts) == len(vocabs3) == len(Y3)

In [21]:
def process_multilabel(Y):
    Y_out = []
    for y in Y:
        labels = y.split(",")
        Y_out.append(labels)
    return Y_out

In [22]:
Y_final = process_multilabel(Y3)

In [32]:
print(vocabs3[18])
print(all_texts[18])

aos
Simple Skin Color Albino. Gender (Generic). Blue Gray Iris (5). Has Thigh Measurement in Meters. Blond. Von Luschan Skin Color 22. Caribbean. Light Blue Iris (1c). Unknown. White. Frau. Mediterranean European/Hispanic. Dark Blond (Human Hair Atlas). Von Luschan Skin Color 10. Has Thigh Measurement in Inches. Has Sex. Not Available. Blue. Freckles. White. Gender Unknown. Medium to Dark Red Brown (Human Hair Atlas). htmlColor. Blue. Yellow Brown and Brown Green (8). Has Over Arm Measurement in Inches. Von Luschan Skin Color 31. Von Luschan Skin Color 7. Fair. Medium to Dark Gray Brown, Black (Human Hair Atlas). Dark to Opaque Brown (Human Hair Atlas). Medium. Grey. Sandy. Medium to Dark Red (Human Hair Atlas). Gray (6). Has Pant Length Measurement. Brown (ICCS #4). Light Brown with Yellow Specks (ICCS #3). Dyed Hair Color. Green. White and Black Caribbean. Natural Hair Color. Simple Eye Color Reference. Pétrequin Eye Color Reference. Grey/Gray. Has Inseam Measurement in Meters. Brown

In [14]:
import pickle

# Save data for classification
with open('DATA.pkl', 'wb') as handle:
    pickle.dump((all_texts, Y_final, vocabs3), handle, protocol=pickle.HIGHEST_PROTOCOL)

In [1]:
############# PROCESS NEW VOCABS ################

In [6]:
import os
dir_path = "./new_vocabs"
files = os.listdir(dir_path)
files

['UsabilityOntology.rdf',
 'cultural-event.owl',
 'munc.owl',
 'catalogue.owl',
 'terms.ttl',
 'denotative-description.owl',
 'context-description.owl',
 'ontology.ttl',
 'location.owl',
 'core.owl',
 'vir.ttl',
 'arco.owl']

In [7]:
texts = []
vocabNames = []
for f in files:
    try:
        path = dir_path + "/" + f

        g = Graph()
        data = open(path, "rb")
        if f[-3:] == "ttl":
            g.parse(file=data, format="n3")
        else:
            g.parse(file=data)
        
        # Extract all text from graph
        text = process(g)
        texts.append(text)
        vocabNames.append(f)
        
    except Exception as e:
        print(e, f)

In [8]:
vocabNames
#texts[0]

['UsabilityOntology.rdf',
 'cultural-event.owl',
 'munc.owl',
 'catalogue.owl',
 'terms.ttl',
 'denotative-description.owl',
 'context-description.owl',
 'ontology.ttl',
 'location.owl',
 'core.owl',
 'vir.ttl',
 'arco.owl']

In [9]:
# Save data for classification
with open('newDATA.pkl', 'wb') as handle:
    pickle.dump((texts, vocabNames), handle, protocol=pickle.HIGHEST_PROTOCOL)