In [1]:
# Install the notebook's third-party dependencies into the kernel's environment.
# NOTE(review): versions are unpinned; the saved output of this cell shows
# pykeen 1.10.2, scikit-learn 1.5.0 and matplotlib 3.9.0 being resolved —
# consider pinning so a fresh run reproduces the same environment.
%pip install pandas tabula-py rdflib pydotplus pyshacl pykeen scikit-learn matplotlib

Collecting pykeen
  Downloading pykeen-1.10.2-py3-none-any.whl.metadata (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.8/83.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting scikit-learn
  Downloading scikit_learn-1.5.0-cp312-cp312-macosx_10_9_x86_64.whl.metadata (11 kB)
Collecting matplotlib
  Using cached matplotlib-3.9.0-cp312-cp312-macosx_10_12_x86_64.whl.metadata (11 kB)
Collecting dataclasses-json (from pykeen)
  Using cached dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting scipy>=1.7.0 (from pykeen)
  Downloading scipy-1.14.0-cp312-cp312-macosx_14_0_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting click (from pykeen)
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting click-default-group (from pykeen)
  Downloading click_default_group-1.2.4-py2.py3-none-any.whl.meta

In [6]:
import os
import io
import numpy as np
import pandas as pd
from datetime import datetime
from IPython.display import display, Image
from rdflib import Graph, Namespace, Literal, RDF, RDFS, XSD, URIRef
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline
from pykeen.predict import predict_target
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from rdflib.tools.rdf2dot import rdf2dot
import pydotplus


Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [7]:
# Build an empty in-memory graph, then fill it from the Turtle mapping file.
# `parse` is left as the cell's last expression so the graph repr is still shown.
g = Graph()
g.parse(source="data/rdf_mapping.ttl", format="turtle")

<Graph identifier=N57e7660095984e779be155258221e43c (<class 'rdflib.graph.Graph'>)>

In [8]:
from rdflib.plugins.sparql import prepareQuery
# Find the distinct locations (city / state / country) attached to UFO encounters.
# The prefixes are declared inside the query so that it does not depend on
# whatever namespaces rdflib happens to bind by default.
# NOTE(review): the saved notebook shows no output for this cell, and the
# link-prediction cell references rdfs:hasLocation triples — city/state/country
# may be attached to a location node rather than to the encounter itself;
# verify against data/rdf_mapping.ttl.
query1 = prepareQuery("""
PREFIX owl:  <http://www.w3.org/2002/07/owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT DISTINCT ?city ?state ?country
WHERE {
    ?encounter a owl:Encounter ;
               rdfs:hasCity ?city ;
               rdfs:hasState ?state ;
               rdfs:hasCountry ?country .
}
LIMIT 10
""")

# Execute the query and print one line per result row
for row in g.query(query1):
    print(f" Country = {row.country}, State = {row.state}, City = {row.city} ")

In [9]:
from rdflib.plugins.sparql import prepareQuery
# List the distinct UFO shapes reported in the encounters (first 10).
# The prefixes are declared inside the query so that it does not depend on
# whatever namespaces rdflib happens to bind by default.
query2 = prepareQuery("""
PREFIX owl:  <http://www.w3.org/2002/07/owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT DISTINCT ?shape
WHERE {
    ?encounter a owl:Encounter ;
               rdfs:hasUFOShape ?shape .
}
LIMIT 10
""")

# Execute the query and print one line per result row
for row in g.query(query2):
    print(f" Shape = {row.shape}")

 Shape = cylinder
 Shape = light
 Shape = circle
 Shape = sphere
 Shape = disk
 Shape = fireball
 Shape = unknown
 Shape = oval
 Shape = other
 Shape = cigar


In [10]:
from rdflib.plugins.sparql import prepareQuery

# Retrieve details of encounters in Texas (TX).
# The prefixes are declared inside the query so that it does not depend on
# whatever namespaces rdflib happens to bind by default.
# NOTE(review): the state is matched as the exact language-tagged literal
# "tx"@en; a plain or differently-cased literal in the data will not match.
# The saved notebook shows no output for this cell — verify the literal form
# (and whether hasState sits on the encounter) against data/rdf_mapping.ttl.
query3 = prepareQuery("""
PREFIX owl:  <http://www.w3.org/2002/07/owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?city ?date ?time ?description
WHERE {
    ?encounter a owl:Encounter ;
               rdfs:hasCity ?city ;
               rdfs:hasState "tx"@en ;
               rdfs:hasDate ?date ;
               rdfs:hasTime ?time ;
               rdfs:hasDescription ?description .
}
LIMIT 10
""")

# Execute the query and print one line per result row
for row in g.query(query3):
    print(f" City = {row.city}, Date = {row.date}, Time = {row.time}, Description = {row.description}")

In [11]:
from rdflib.plugins.sparql import prepareQuery

# Retrieve details of encounters with date 1949-10-10.
# ?date is projected and printed below, so it has to be bound inside WHERE:
# the VALUES clause pins it to the wanted literal and the triple pattern then
# matches exactly the same encounters as a constant object would, while
# row.date carries the value instead of None.
# The prefixes are declared inside the query so that it does not depend on
# whatever namespaces rdflib happens to bind by default.
# NOTE(review): the date is matched as the exact language-tagged literal
# "1949-10-10"@en; verify that this is how dates are stored in
# data/rdf_mapping.ttl (the saved notebook shows no output for this cell).
query4 = prepareQuery("""
PREFIX owl:  <http://www.w3.org/2002/07/owl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?city ?state ?date ?description
WHERE {
    VALUES ?date { "1949-10-10"@en }
    ?encounter a owl:Encounter ;
               rdfs:hasCity ?city ;
               rdfs:hasState ?state ;
               rdfs:hasDate ?date ;
               rdfs:hasDescription ?description .
}
LIMIT 10
""")

# Execute the query and print one line per result row
for row in g.query(query4):
    print(f" City = {row.city}, Date = {row.date}, state = {row.state}, Description = {row.description}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from rdflib import Graph
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

# Function to abbreviate URIs
def abbr(u):
    """Return the local name of a URI string.

    The local name is the text after the last '#' if the string contains one,
    otherwise the text after the last '/'. Strings containing neither
    separator are returned unchanged.

    If the string ends with its separator (e.g. a namespace or ontology IRI
    such as 'http://example.org/'), the local name would be empty; the full
    string is returned instead so that distinct resources do not all collapse
    into the same empty label.

    Args:
        u: The URI (or any other string) to abbreviate.

    Returns:
        The abbreviated, never-empty label (unless `u` itself is empty).
    """
    # '#' takes precedence over '/', mirroring fragment-style vs path-style IRIs.
    for separator in ("#", "/"):
        if separator in u:
            local_name = u.rsplit(separator, 1)[1]
            return local_name if local_name else u
    return u

# Step 1: Load the RDF data into an RDFlib Graph
g = Graph()
g.parse("data/rdf_mapping.ttl", format="turtle")

# Step 2: Extract triples from the RDF graph and abbreviate URIs.
# Only URI objects are abbreviated: literal objects (dates, free-text
# descriptions, ...) are kept verbatim, because cutting them at their last
# '/' or '#' would truncate the value and merge unrelated literals.
# `URIRef` comes from the notebook's import cell.
triples = [
    (
        abbr(str(s)),
        abbr(str(p)),
        abbr(str(o)) if isinstance(o, URIRef) else str(o),
    )
    for s, p, o in g
]
assert triples, "data/rdf_mapping.ttl produced no triples"

# Step 3: Convert triples to a numpy array of strings for PyKEEN
gdata = np.array(triples)

# Step 4: Create TriplesFactory and split into train, test, validation sets
# (the returned factories follow the order of the ratios: 60% / 20% / 20%).
tf = TriplesFactory.from_labeled_triples(gdata)
train, test, validation = tf.split([0.6, 0.2, 0.2], random_state=42)

# Step 6: Train the knowledge graph embedding model with PyKEEN.
# The embedding size is passed through `model_kwargs`: the pipeline's
# `dimensions` argument is only used together with `interaction=...`, so with
# `model='TransE'` it would leave the model at its default embedding size.
result = pipeline(
    training=train,
    testing=test,
    validation=validation,
    model='TransE',
    model_kwargs=dict(embedding_dim=128),
    epochs=10,
    negative_sampler="basic",
    random_seed=42,
)

# Step 7: Extract embeddings.
# Calling the representation module without indices yields one row per entity
# ID, i.e. in the same order as `entity_labels`. It is evaluated once (not
# once per entity) and moved to the CPU before the NumPy conversion, which
# would otherwise fail when the pipeline trained on an accelerator.
entity_labels = list(train.entity_labeling.all_labels())
embeddings = result.model.entity_representations[0]().detach().cpu().numpy()

# Step 8: Group the entity embeddings into a fixed number of KMeans clusters
num_clusters = 3  # Number of clusters
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
labels = kmeans.fit_predict(embeddings)

# Step 9: Project the embeddings down to two dimensions with t-SNE
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings)

# Step 10: Draw one scatter series per cluster on a single set of axes
fig, ax = plt.subplots(figsize=(10, 8))
for cluster_id in range(num_clusters):
    in_cluster = labels == cluster_id
    ax.scatter(
        embeddings_2d[in_cluster, 0],
        embeddings_2d[in_cluster, 1],
        label=f'Cluster {cluster_id}',
    )
ax.legend()
ax.set_title('UFO Sightings Clustering')
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
plt.show()


In [None]:
# Extract every triple from the RDFLib graph as plain strings (full URIs kept)
triples = [(str(s), str(p), str(o)) for s, p, o in g]

# Convert triples to a NumPy array
triples_array = np.array(triples)

# Create a DataFrame from the triples
triple_columns = ["head", "relation", "tail"]
triples_df = pd.DataFrame(triples, columns=triple_columns)

# Triples that must end up in the training set, so that the entities queried
# in the prediction step are guaranteed to be known to the model.
# NOTE(review): these are appended to the training data whether or not they
# occur in the graph — confirm they are real facts from data/rdf_mapping.ttl.
specific_triples = [
    ('http://webprotege.stanford.edu/1', 'http://www.w3.org/2000/01/rdf-schema#hasLocation', 'http://webprotege.stanford.edu/location1'),
    ('http://webprotege.stanford.edu/2000', 'http://www.w3.org/2000/01/rdf-schema#hasLocation', 'http://webprotege.stanford.edu/location2000'),
]

# Filter out the specific triples from the main dataset so they cannot be
# sampled into the test split
is_specific = triples_df.apply(tuple, axis=1).isin(specific_triples)
remaining_triples_df = triples_df[~is_specific]

# Split the remaining data into training and testing sets
remaining_train_triples, test_triples = train_test_split(
    remaining_triples_df, test_size=0.2, random_state=42
)

# Add the specific triples to the training set
train_triples = pd.concat(
    [pd.DataFrame(specific_triples, columns=triple_columns), remaining_train_triples]
)

# Create TriplesFactory from the training and testing sets.
# The test factory must reuse the training factory's label-to-ID mappings:
# built on its own it would number entities and relations independently, and
# the model would be evaluated against IDs that mean something else.
train_tf = TriplesFactory.from_labeled_triples(train_triples.values)
test_tf = TriplesFactory.from_labeled_triples(
    test_triples.values,
    entity_to_id=train_tf.entity_to_id,
    relation_to_id=train_tf.relation_to_id,
)

# Train a model using PyKEEN pipeline (seeded so reruns are reproducible)
result = pipeline(
    training=train_tf,
    testing=test_tf,
    model='TransE',
    random_seed=42,
)

# Function to safely get entity ID
def get_entity_id(tf, entity):
    """Look up the integer ID of an entity label in a triples factory.

    Args:
        tf: Object exposing an `entity_to_id` mapping from label to ID.
        entity: The entity label to look up.

    Returns:
        The ID stored for `entity`, or None (after printing a notice) when the
        label is not present in the mapping.
    """
    known_entities = tf.entity_to_id
    if entity not in known_entities:
        print(f"Entity {entity} not found in the training data.")
        return None
    return known_entities[entity]

# Function to safely get relation ID
def get_relation_id(tf, relation):
    """Look up the integer ID of a relation label in a triples factory.

    Args:
        tf: Object exposing a `relation_to_id` mapping from label to ID.
        relation: The relation label to look up.

    Returns:
        The ID stored for `relation`, or None (after printing a notice) when
        the label is not present in the mapping.
    """
    known_relations = tf.relation_to_id
    if relation not in known_relations:
        print(f"Relation {relation} not found in the training data.")
        return None
    return known_relations[relation]

# Perform link prediction: score every candidate tail for (head, relation, ?)
head_entity = 'http://webprotege.stanford.edu/1'
relation_label = 'http://www.w3.org/2000/01/rdf-schema#hasLocation'

head_id = get_entity_id(train_tf, head_entity)
relation_id = get_relation_id(train_tf, relation_label)

# Skip the prediction when either label is unknown to the training factory
# (the lookup helpers have already printed which one is missing).
if head_id is not None and relation_id is not None:
    predictions = predict_target(
        model=result.model,
        head=head_id,
        relation=relation_id,
        triples_factory=train_tf,
    )
    print("Link Prediction Results:")
    # Show the scored candidates table rather than the wrapper object's repr,
    # limited to the first rows to keep the cell output readable.
    print(predictions.df.head(10))

# Perform relation prediction: score every candidate relation for (head, ?, tail)
tail_entity = 'http://webprotege.stanford.edu/2'

tail_id = get_entity_id(train_tf, tail_entity)

if head_id is not None and tail_id is not None:
    predictions = predict_target(
        model=result.model,
        head=head_id,
        tail=tail_id,
        triples_factory=train_tf,
    )
    print("\nRelation Prediction Results:")
    print(predictions.df.head(10))