In [1]:
import pandas as pd 
import json

from rdflib import Graph, Literal, RDF, URIRef, Namespace #basic RDF handling
from rdflib.namespace import FOAF , XSD, RDF, RDFS, OWL #most common namespaces
import urllib.parse #for parsing strings to URI's


In [2]:
#load the .json file
#JSON text is UTF-8 by specification; without an explicit encoding open() falls
#back to the platform locale codec, which can mis-decode non-ASCII names.
with open('movie_character.json', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
#collect every distinct name listed under 'main characters' across all movies
characters = set()
for movie_entry in data['movies']:
    #set.update adds each listed name; names already present are ignored
    characters.update(movie_entry['main characters'])
print(characters)

{'Gamora', 'Senator Stern', 'Pepper Potts', 'Black Panther', 'Quicksilver', 'Sif', 'Heinz Kruger', 'Obadiah Stane', 'Carina', 'Paxton', 'Agent Coulson', 'Ant-Man', 'Ned Leeds', 'Howard Stark', 'Luis', 'Nakia', 'Mysterio', 'Iron Man', 'Yon-Rogg', 'Justin Hammer', 'Captain Marvel', 'Dr. Abraham Erskine', 'Erik Killmonger', 'Hulk', 'Nebula', 'Grandmaster', 'Wasp', 'Ego', "General 'Thunderbolt' Ross", 'Ivan Vanko', 'Christine Everhart', 'Malekith', 'General Gabriel', 'Abu Bakaar', 'Dr. Arnim Zola', 'Valkyrie', 'Savin', 'Flash', 'Shuri', 'Hogun', 'Falcon', 'Liz', 'Leonard', 'Odin', 'On-Set Rocket', 'Drax', 'Yellowjacket', 'Raza', 'Jane Foster', 'Major Kathleen Sparr', 'Frigga', 'Captain America', 'Peggy Carter', 'Minn-Erva', 'Happy Hogan', 'Scarlet Witch', 'General Joe Greller', 'Hogan', 'Mantis', 'Zuri', 'Cassie', 'Keller', 'Volstagg', 'Ghost', 'Agent Maria Hill', "James Buchanan 'Bucky' Barnes", 'Okoye', 'Erik Selvig', 'Doctor Strange', 'Winter Soldier', 'Mordo', 'Sonny Burch', 'Nick Fury

In [4]:
#define function to clean the names of the characters
def clean_hero(name):
    """Turn a character name into a lowercase, underscore-separated slug.

    The name is lowercased, every space, slash, comma and hyphen becomes an
    underscore, runs of underscores are collapsed to a single one, and
    leading/trailing underscores are removed. All other characters (dots,
    quotes, ...) are kept as they are.

    Args:
        name: The character name as a string.

    Returns:
        The cleaned name, e.g. "Spider-Man / Peter Parker" ->
        "spider_man_peter_parker". An empty string stays empty.
    """
    separators = str.maketrans({" ": "_", "/": "_", ",": "_", "-": "_"})
    cleaned_name = name.lower().translate(separators)
    # str.replace() matches its argument literally, so the former
    # .replace("[_]+", "_") never collapsed repeated underscores. Splitting on
    # "_" and dropping the empty pieces collapses runs and trims both ends.
    return "_".join(piece for piece in cleaned_name.split("_") if piece)


In [5]:
#create the RDF graph object
g = Graph()

#namespaces of external vocabularies
dbo = Namespace("http://dbpedia.org/ontology/")
schema = Namespace("http://schema.org/")
cbo = Namespace("http://comicmeta.org/cbo/")
#NOTE(review): Wikidata entity IRIs are normally based on
#http://www.wikidata.org/entity/ - confirm that this page-style base is intended
wd = Namespace("https://www.wikidata.org/wiki/")

#namespaces for the resources minted in this notebook
ken = Namespace("https://w3id.org/um/ken4256/")
ken_characters = Namespace("https://w3id.org/um/ken4256/characters/")
ken_movies = Namespace("https://w3id.org/um/ken4256/movies/")

In [6]:
#start from a fresh graph and describe each character with two triples:
#an English rdfs:label holding the raw name and an rdf:type of cbo:Character
g = Graph()
character_class = URIRef(cbo.Character)
for character in characters:
    #percent-encode the whole name (safe='' also escapes '/') to form the URI
    character_uri = URIRef(ken_characters + urllib.parse.quote(character, safe=''))
    g.add((character_uri, RDFS.label, Literal(character, lang="en")))
    g.add((character_uri, RDF.type, character_class))

In [7]:
#register a prefix for each namespace so the serialized output is more readable
prefix_bindings = (
    ("foaf", FOAF),
    ("wd", wd),
    ("schema", schema),
    ("dbo", dbo),
    ("ken", ken),
    ("ken_characters", ken_characters),
    ("ken_movies", ken_movies),
    ("cbo", cbo),
)
for prefix, ns in prefix_bindings:
    g.bind(prefix, ns)

In [8]:
#print(g.serialize(format='turtle').decode('UTF-8'))

In [9]:
#write the graph to disk in Turtle syntax
g.serialize(destination='output_character_label.ttl', format='turtle')