In [58]:
import pandas as pd
from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker
import rdflib
from pykeen.pipeline import pipeline
from pyrdf2vec.samplers import ( 
    ObjFreqSampler,
    ObjPredFreqSampler,
    PageRankSampler,
    PredFreqSampler,
    UniformSampler
)

In [60]:
# Load the tab-separated entity table(s) and the knowledge graph used for the
# RDF2Vec embeddings.
traindata = pd.read_csv("../data/trainingSet.tsv", sep="\t")  # train and test together
# NOTE(review): testdata is read from the same file as traindata, so `data`
# below contains every row twice -- confirm whether a separate test file was
# intended here.
testdata = pd.read_csv("../data/trainingSet.tsv", sep="\t")
kg = KG("../data/aifb_witho_complete.ttl")
# DataFrame.append is deprecated (and removed in pandas 2.0); pd.concat with
# default arguments stacks the frames the same way, keeping the original index.
data = pd.concat([traindata, testdata])
# Entities (values of the "person" column) for which embeddings are computed.
entities = traindata["person"].tolist()


  data = traindata.append(testdata)


In [12]:
# Walk-sampling strategies to compare, as an ordered list of
# (human-readable label, sampler instance) pairs. The dict literal keeps
# insertion order, so the resulting list has the same order as written here.
samplers = list(
    {
        "Uniform": UniformSampler(),
        # Object-frequency based strategies.
        "Object Frequency": ObjFreqSampler(),
        "Inverse Object Frequency": ObjFreqSampler(inverse=True),
        "Inverse Object Frequency Split": ObjFreqSampler(inverse=True, split=True),
        # Predicate-frequency based strategies.
        "Predicate Frequency": PredFreqSampler(),
        "Inverse Predicate Frequency": PredFreqSampler(inverse=True),
        # Combined predicate + object frequency strategies.
        "Predicate + Object Frequency": ObjPredFreqSampler(),
        "Inverse Predicate + Object Frequency": ObjPredFreqSampler(inverse=True),
        # PageRank based strategies.
        "PageRank": PageRankSampler(),
        "Inverse PageRank": PageRankSampler(inverse=True),
        "PageRank Split": PageRankSampler(split=True),
        "Inverse PageRank Split": PageRankSampler(inverse=True, split=True),
    }.items()
)

In [62]:
# Disabled experiment, kept for reference: one RDF2Vec run per sampling
# strategy in `samplers`.
# for _, sampler in samplers:
#     embeddings, _ = RDF2VecTransformer(  # type:ignore
#         # Use a single worker thread for Word2Vec to ensure determinism.
#         # Must be used together with PYTHONHASHSEED.
#         Word2Vec(workers=1),
#         # Extract a maximum of 10 walks of a maximum depth of 4 for each
#         # entity using two processes, with a fixed random state so that the
#         # same walks are generated for the entities.
#         walkers=[
#             RandomWalker(4, 10, sampler, n_jobs=2, random_state=42)
#         ],
#     ).fit_transform(
#         KG("../data/aifb_oaff_complete.nt",
#         skip_predicates={"http://swrc.ontoware.org/ontology#affiliation"}),  # --> nodewalk
#         entities
#     )


# Active configuration: random walks (depth 4, at most 10 walks per entity,
# two processes) embedded with Word2Vec trained for 10 epochs.
transformer = RDF2VecTransformer(
    Word2Vec(epochs=10),
    # random_state pins the sampled walks so a re-run extracts the same walks
    # (same seed as the disabled experiment above). Word2Vec training itself is
    # only fully deterministic with workers=1 and a fixed PYTHONHASHSEED.
    walkers=[
        RandomWalker(4, 10, with_reverse=False, n_jobs=2, random_state=42)
    ],
    # verbose=1
)
# Get our embeddings; the walks are extracted from the entire graph `kg`.
embeddings, literals = transformer.fit_transform(kg, entities)
# The fit_transform output is what provides the train and test embeddings.
# Display only the first few vectors instead of dumping the whole list.
embeddings[:5]

[array([-0.02353984,  0.02543868, -0.00223127,  0.01260456,  0.00375865,
        -0.03044857,  0.01910145,  0.06088711, -0.01295398, -0.03205185,
         0.00908604, -0.02456785, -0.00014253,  0.01590557,  0.0075167 ,
        -0.02206913,  0.03325885, -0.00932579,  0.00200992, -0.04980784,
         0.02023073,  0.00429499,  0.03045468, -0.0110614 ,  0.01613753,
        -0.00124752, -0.02043858,  0.00294   , -0.02767158, -0.00292862,
         0.01697357, -0.0011718 ,  0.01041992, -0.03315608, -0.00845358,
         0.02903185,  0.01714531, -0.01632143, -0.02377857, -0.01428216,
        -0.00781045, -0.01788268, -0.00518214, -0.00071436,  0.02362747,
        -0.01585401, -0.01059584, -0.00403889,  0.0196328 ,  0.02115961,
         0.01963896, -0.01418552, -0.01363702,  0.00482336, -0.01006017,
        -0.00815564,  0.02284884, -0.0154633 , -0.02288561, -0.00283124,
         0.00120681, -0.00186682,  0.02748417, -0.01442275, -0.04402406,
         0.03825335,  0.00780424,  0.02396793, -0.0

from pykeen.triples import TriplesFactory

# Load the training file into a PyKEEN triples factory.
# NOTE(review): this is the same file that is read as a labelled table with
# pandas above -- confirm it really has a (head, relation, tail) layout.
training = TriplesFactory.from_path("../data/trainingSet.tsv")

In [33]:
# Train and evaluate a TransE model with the PyKEEN pipeline, reading the
# training and testing triples directly from the given file paths.
pipeline_result = pipeline(
    # dataset='../data/aifb_oaff_complete.nt',
    model='TransE',
    training='../data/trainingSet.tsv',  # --> use the entire dataset; literals have to be removed
    testing='../data/pykeenTest.tsv',
    # Fixed seed so that training and evaluation are reproducible across runs;
    # without it PyKEEN picks a random seed each time.
    random_seed=42,
)
# NOTE: despite the ".pkl" suffix this target is a directory, not a pickle file.
pipeline_result.save_to_directory('../data/aifb_transE.pkl')



INFO:pykeen.pipeline.api:Using device: None
Training epochs on cpu: 100%|██████████| 5/5 [00:02<00:00,  2.01epoch/s, loss=0.964, prev_loss=0.981]
INFO:pykeen.evaluation.evaluator:Currently automatic memory optimization only supports GPUs, but you're using a CPU. Therefore, the batch_size will be set to the default value.
INFO:pykeen.evaluation.evaluator:No evaluation batch_size provided. Setting batch_size to '32'.
Evaluating on cpu: 100%|██████████| 1.00/1.00 [00:00<00:00, 6.16triple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 0.18s seconds
INFO:pykeen.triples.triples_factory:Stored TriplesFactory(num_entities=146, num_relations=141, create_inverse_triples=False, num_triples=141, path="C:\Users\luisa\Projekte\Masterthesis\AIFB\data\trainingSet.tsv") to file:///C:/Users/luisa/Projekte/Masterthesis/AIFB/data/aifb_transE.pkl/training_triples
INFO:pykeen.pipeline.api:Saved to directory: file:///C:/Users/luisa/Projekte/Masterthesis/AIFB/data/aifb_transE.pkl


In [29]:
from sklearn import svm
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics

In [50]:
# Linear SVM that predicts the affiliation label from the one-hot encoded
# person identifier.
# NOTE(review): the RDF2Vec / TransE embeddings computed above are not used as
# features here -- confirm that one-hot person IDs are the intended input.

# handle_unknown='ignore' encodes persons that were not seen during fitting as
# an all-zero row instead of raising at transform time.
encoder = OneHotEncoder(handle_unknown='ignore')

# Select the column as a 2-D frame ([['person']]); the encoder expects 2-D input.
X_train = encoder.fit_transform(traindata[['person']])
y_train = traindata['label_affiliation']
# The test features must go through the *fitted* encoder so they have the same
# columns as the training matrix; passing the raw 1-D string Series to
# predict() fails.
X_test = encoder.transform(testdata[['person']])

clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)