# Populate the Game of Thrones (GOT) Ontology

## Import libraries

In [1]:
# required libraries
import pandas as pd
import os
from pathlib import Path
# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD
# CHECK DATE 
import datetime

import requests
import json

# Regular expression operations Library
import re


## Parameters and URLs

In [2]:
# parameters and URLs

# SPARQL endpoint of the Wikidata query service
wikidataUrl = 'https://query.wikidata.org/sparql'


# Working directory (absolute path of the current directory) and the folder with the raw CSV files
path = str(Path(os.path.abspath(os.getcwd())))
rawPath = f'{path}/rawData'

# Characters table.
# Available at https://www.kaggle.com/datasets/dalmacyali1905/game-of-thrones-classification-decision-tree
mainDataFile = f'{rawPath}/HW2.csv'

# Battles table.
# Available at https://www.kaggle.com/datasets/prashant111/game-of-thrones?select=battles.csv
battlesFile = f'{rawPath}/battles.csv'

# Houses table.
# Available at https://www.kaggle.com/datasets/neelgajare/463-game-of-thrones-houses-dataset
housesFile = f'{rawPath}/gameofthrones.csv'

# Folder the serialized graph is written to
savePath = f'{path}/rdf'

# Titles of the books of the series, in order (index 0 is book 1)
bookTitle = [
    'A Game of Thrones',
    'A Clash of Kings',
    'A Storm of Swords',
    'A Feast for Crows',
    'A Dance with Dragons',
]


# Marker written in place of missing values
nullCharacter = '-1'

## Find Wikidata URIs of Characters

In [3]:
def readDataFromURL(url, query, savePath='result.csv', timeout=60):
    """Run a SPARQL query against an endpoint and return its bindings as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the SPARQL endpoint.
    query : str
        Text of the SPARQL query.
    savePath : str, optional
        CSV file the bindings are written to when the result is not empty.
    timeout : float, optional
        Seconds to wait for the endpoint before ``requests`` gives up.

    Returns
    -------
    pandas.DataFrame or None
        One row per binding with flattened columns (for example ``item.value``),
        or ``None`` when the query produced no bindings. Nothing is written to
        ``savePath`` in the latter case.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with an error status.
    """
    # Ask explicitly for SPARQL JSON results and identify the client; public endpoints
    # may throttle or reject anonymous default clients.
    headers = {
        'Accept': 'application/sparql-results+json',
        'User-Agent': 'GOT-ontology-notebook/1.0 (python-requests)',
    }
    # A timeout keeps the notebook from hanging forever on an unresponsive endpoint
    r = requests.get(url, params={'format': 'json', 'query': query}, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON
    r.raise_for_status()
    data = r.json()
    # Flatten the list of bindings into one column per "<variable>.<field>"
    df = pd.json_normalize(data['results']['bindings'])
    if df.empty:
        return None
    # Save the bindings to a CSV file
    df.to_csv(savePath, index=False)
    return df

In [4]:
# query = '''
# SELECT DISTINCT ?item ?itemLabel ?gender ?father ?mother ?spouse ?family 
# WHERE 
# {
#   ?item wdt:P31 wd:Q20086260.
#   #?item wdt:P31 wd:Q20086263.
#   OPTIONAL{?item wdt:P21 ?gender .}
#   OPTIONAL{?item wdt:P22 ?father .}
#   OPTIONAL{?item wdt:P25 ?mother .}
#   OPTIONAL{?item wdt:P26 ?spouse .}
#   OPTIONAL{?item wdt:P53 ?family .}
  
#   SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". } # Helps get the label in your language, if not, then en language
# }
# '''

In [5]:
# SPARQL query: every item that is an instance of wd:Q20086260, together with its label
query = '''
SELECT DISTINCT ?item ?itemLabel 
WHERE 
{
  ?item wdt:P31 wd:Q20086260.
  #?item wdt:P31 wd:Q20086263.

  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". } # Helps get the label in your language, if not, then en language
}
'''

# Fetch the bindings; they are also cached as a CSV next to the raw data
wikiDataCharacters = readDataFromURL(url=wikidataUrl, query=query, savePath=rawPath + '/wikidataUri.csv')

# readDataFromURL returns None when the query yields no bindings
if wikiDataCharacters is None:
    print('No data found!')
else:
    display(wikiDataCharacters.head())


Unnamed: 0,item.type,item.value,itemLabel.xml:lang,itemLabel.type,itemLabel.value
0,uri,http://www.wikidata.org/entity/Q5800500,en,literal,Davos Seaworth
1,uri,http://www.wikidata.org/entity/Q5959381,en,literal,Khal Drogo
2,uri,http://www.wikidata.org/entity/Q1120793,en,literal,Theon Greyjoy
3,uri,http://www.wikidata.org/entity/Q3472490,en,literal,Sansa Stark
4,uri,http://www.wikidata.org/entity/Q12900933,en,literal,Margaery Tyrell


## Namespaces

In [6]:
# Namespaces that rdflib does not provide out of the box
# GOT: the Game of Thrones ontology populated by this notebook
GOT = Namespace("http://www.dei.unipd.it/database2/gotOntology#")
# WD: Wikidata entities
WD = Namespace("http://www.wikidata.org/entity/")
# CLT: countries vocabulary (not referenced by the cells shown in this notebook)
CLT = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")

## Load and Clean Data

### Characters DataSet

In [7]:
# Read the characters table (indexed by 'S.No') and clean it in one pass
charactersDataset = (
    pd.read_csv(mainDataFile, sep=',', index_col='S.No')
    # Missing cells get the null marker
    .fillna(nullCharacter)
    # Columns that are not loaded into the graph
    .drop(columns=[
        'plod',
        'isAliveMother',
        'isAliveFather',
        'isAliveHeir',
        'isAliveSpouse',
        'isMarried',
        'numDeadRelations',
        'boolDeadRelations',
    ])
    # The gender code is used as a list index: 0 -> 'female', 1 -> 'male'
    .assign(gender=lambda frame: frame['gender'].apply(lambda code: ['female', 'male'][code]))
    # Column names aligned with the property names used when building the graph
    .rename(columns={
        'DateoFdeath': 'dateOfDeath',
        'mother': 'hasMother',
        'father': 'hasFather',
        'spouse': 'hasSpouse',
        'heir': 'hasHeir',
        'house': 'hasHouse',
    })
    # Strip the leading 'House of ' / 'House ' so that only the bare house name remains
    .assign(hasHouse=lambda frame: frame['hasHouse'].apply(
        lambda houseName: houseName.replace('House of ', '').replace('House ', '')))
)

# Show the first 5 rows
charactersDataset.head()

Unnamed: 0_level_0,name,title,gender,culture,dateOfBirth,dateOfDeath,hasMother,hasFather,hasHeir,hasHouse,...,book1,book2,book3,book4,book5,isNoble,age,isPopular,popularity,isAlive
S.No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Viserys II Targaryen,-1,male,-1,-1.0,-1.0,Rhaenyra Targaryen,Daemon Targaryen,Aegon IV Targaryen,-1,...,0,0,0,0,0,0,-1.0,1,0.605351,0
2,Walder Frey,Lord of the Crossing,male,Rivermen,208.0,-1.0,-1,-1,-1,Frey,...,1,1,1,1,1,1,97.0,1,0.896321,1
3,Addison Hill,Ser,male,-1,-1.0,-1.0,-1,-1,-1,Swyft,...,0,0,0,1,0,1,-1.0,0,0.267559,1
4,Aemma Arryn,Queen,female,-1,82.0,105.0,-1,-1,-1,Arryn,...,0,0,0,0,0,1,23.0,0,0.183946,0
5,Sylva Santagar,Greenstone,female,Dornish,276.0,-1.0,-1,-1,-1,Santagar,...,0,0,0,1,0,1,29.0,0,0.043478,1


### Region & House DataSet

In [8]:
# Read the houses table and clean it in one pass
housesDataset = (
    pd.read_csv(housesFile, sep=',')
    # Missing cells get the null marker
    .fillna(nullCharacter)
    # Columns that are not loaded into the graph
    .drop(columns=['Seat', 'Words', 'Notes', 'Ancestral weapon'])
    # Lower-case names; 'Origin' holds the founder of the house
    .rename(columns={
        'Region': 'region',
        'House': 'house',
        'Sigil': 'sigil',
        'Blazon': 'blazon',
        'Origin': 'founder',
    })
    .assign(
        # Strip the leading 'House of ' / 'House ' so that only the bare house name remains
        house=lambda frame: frame['house'].apply(
            lambda houseName: houseName.replace('House of ', '').replace('House ', '')),
        # The word 'Unknown' in a region is replaced by the null marker
        region=lambda frame: frame['region'].apply(
            lambda regionName: regionName.replace('Unknown', nullCharacter)),
        # The word 'Unknown' in a blazon is replaced by the null marker
        blazon=lambda frame: frame['blazon'].apply(
            lambda blazonText: blazonText.replace('Unknown', nullCharacter)),
    )
)

# Show the first 5 rows
display(housesDataset.head())

Unnamed: 0,region,house,sigil,blazon,founder
0,Westerlands,Algood,https://awoiaf.westeros.org/index.php/File:Hou...,"A golden wreath on blue, a gold border",-1
1,Dorne,Allyrion of Godsgrace,https://awoiaf.westeros.org/index.php/File:Hou...,A golden hand on gyronny red and black,-1
2,North,Amber,https://awoiaf.westeros.org/index.php/File:Non...,-1,-1
3,Reach,Ambrose,https://awoiaf.westeros.org/index.php/File:Hou...,A yellow field strewn with red ants,-1
4,Iron Islands (Shield Islands),Andrik of Southshield,https://awoiaf.westeros.org/index.php/File:Non...,-1,Andrik the Unsmiling


### Battles DataSet

In [9]:
# Read the battles table, mark missing cells, drop unused columns and switch to camelCase names
battlesDataset = (
    pd.read_csv(battlesFile, sep=',')
    # Missing cells get the null marker
    .fillna(nullCharacter)
    # Columns that are not loaded into the graph
    .drop(columns=['major_death', 'major_capture', 'note'])
    .rename(columns={
        'battle_number': 'battleNumber',
        'attacker_king': 'attackerKing',
        'defender_king': 'defenderKing',
        'attacker_outcome': 'attackerOutcome',
        'battle_type': 'battleType',
        'attacker_size': 'attackerSize',
        'defender_size': 'defenderSize',
        'attacker_commander': 'attackerCommander',
        'defender_commander': 'defenderCommander',
    })
)


# Expand a '/'-separated entry that shares one surname into comma-separated full names
def makeUriFromName(name):
    """Expand ``'A/B Surname'`` into ``'A Surname,B Surname'``.

    Despite its name, this helper returns a plain string, not a URI.

    Parameters
    ----------
    name : str
        A single name, or several names separated by ``'/'`` where the surname
        is written only once, after the last first name.

    Returns
    -------
    str
        ``name`` unchanged when it contains no ``'/'``; otherwise the full
        names joined by ``','``. Parts that already contain a space are taken
        to be full names and are kept as they are. When no surname follows the
        last part, the parts are joined without one.
    """
    if '/' not in name:
        return name

    parts = [part.strip() for part in name.split('/')]
    # The surname is whatever follows the first name in the last part; it may have several words
    lastFirstName, _, surname = parts[-1].partition(' ')
    surname = surname.strip()

    fullNames = []
    for firstName in parts[:-1] + [lastFirstName]:
        if not firstName:
            # Skip empty parts produced by a leading, trailing or doubled '/'
            continue
        if surname and ' ' not in firstName:
            fullNames.append(firstName + ' ' + surname)
        else:
            fullNames.append(firstName)
    return ','.join(fullNames)

# Expand 'A/B Surname' king entries into comma-separated full names
for kingColumn in ('attackerKing', 'defenderKing'):
    battlesDataset[kingColumn] = battlesDataset[kingColumn].apply(makeUriFromName)

# Remove the word 'The ' from the name of all regions
battlesDataset['region'] = battlesDataset['region'].apply(lambda x: x.replace('The ', ''))

# Split every multi-name column into numbered single-name columns (attackerKing0, attackerKing1, ...)
# and remember how many columns each one produced.
columnSplit = ['attackerKing', 'defenderKing', 'attackerCommander', 'defenderCommander']
columnDict = {}
for col in columnSplit:
    tmp = battlesDataset[col].str.split(",", expand=True)
    # Strip surrounding whitespace so that an entry written as 'A, B' yields 'B' rather than
    # ' B'; otherwise the second name would not match the same character elsewhere.
    tmp = tmp.apply(lambda names: names.str.strip())
    # Rows with fewer names than the longest row, and empty pieces, get the null marker
    tmp = tmp.replace('', nullCharacter).fillna(nullCharacter)
    tmp.columns = [col + str(x) for x in tmp.columns]
    # Swap the original column for its numbered replacements
    battlesDataset = pd.concat([battlesDataset.drop(columns=col), tmp], axis=1)
    columnDict[col] = len(tmp.columns)

# Display the first 5 rows of the dataset
display(battlesDataset.head())


Unnamed: 0,name,year,battleNumber,attacker_1,attacker_2,attacker_3,attacker_4,defender_1,defender_2,defender_3,...,attackerCommander3,attackerCommander4,attackerCommander5,defenderCommander0,defenderCommander1,defenderCommander2,defenderCommander3,defenderCommander4,defenderCommander5,defenderCommander6
0,Battle of the Golden Tooth,298,1,Lannister,-1,-1,-1,Tully,-1,-1,...,-1,-1,-1,Clement Piper,Vance,-1,-1,-1,-1,-1
1,Battle at the Mummer's Ford,298,2,Lannister,-1,-1,-1,Baratheon,-1,-1,...,-1,-1,-1,Beric Dondarrion,-1,-1,-1,-1,-1,-1
2,Battle of Riverrun,298,3,Lannister,-1,-1,-1,Tully,-1,-1,...,-1,-1,-1,Edmure Tully,Tytos Blackwood,-1,-1,-1,-1,-1
3,Battle of the Green Fork,298,4,Stark,-1,-1,-1,Lannister,-1,-1,...,Harrion Karstark,Halys Hornwood,-1,Tywin Lannister,Gregor Clegane,Kevan Lannister,Addam Marbrand,-1,-1,-1
4,Battle of the Whispering Wood,298,5,Stark,Tully,-1,-1,Lannister,-1,-1,...,-1,-1,-1,Jaime Lannister,-1,-1,-1,-1,-1,-1


### List of all Characters, Regions, Houses, Cultures, Battles

In [10]:
def _sortedUnique(values):
    """Return the distinct entries of ``values`` in ascending order, without the null marker."""
    return sorted(set(values) - {nullCharacter})


# Characters: names from the characters table (the character and the relatives / heir named in
# its row), house founders, and every numbered king / commander column of the battles table
characterNames = []
for column in ('name', 'hasFather', 'hasMother', 'hasSpouse', 'hasHeir'):
    characterNames.extend(charactersDataset[column].unique())
characterNames.extend(housesDataset['founder'].unique())
for col in columnDict:
    for i in range(columnDict[col]):
        characterNames.extend(battlesDataset[col + str(i)].unique())
allCharacters = _sortedUnique(characterNames)


# Regions: taken from the houses and the battles tables. The two tables are de-duplicated
# together; otherwise a region present in both would appear twice in the list (and so get two
# nodes), and only one of the two null markers would be removed.
allRegions = _sortedUnique(
    list(housesDataset['region'].unique()) + list(battlesDataset['region'].unique())
)


# Houses: taken from the houses table, the characters table and the attacker_1..4 /
# defender_1..4 columns of the battles table
houseNames = list(housesDataset['house'].unique())
houseNames.extend(charactersDataset['hasHouse'].unique())
for i in range(1, 5):
    houseNames.extend(battlesDataset['attacker_' + str(i)].unique())
    houseNames.extend(battlesDataset['defender_' + str(i)].unique())
allHouses = _sortedUnique(houseNames)


# Cultures: taken from the characters table
allCultures = _sortedUnique(charactersDataset['culture'].unique())


# Battles: taken from the battles table
allBattles = _sortedUnique(battlesDataset['name'].unique())


## Create Graph

In [11]:
# Start from an empty RDF graph
g = Graph()

# Register a short prefix for each namespace so that the serialized output is easier to read
for prefix, namespace in (("foaf", FOAF), ("xsd", XSD), ("wd", WD), ("got", GOT)):
    g.bind(prefix, namespace)

### Add Books

In [12]:
# One GOT.Book node per book, named book1 ... book5, carrying its title as a string literal
# NOTE(review): the predicate is GOT.Title (capital T) while the other nodes use GOT.name —
# confirm it matches the ontology.
for number in range(1, 6):
    book = GOT['book' + str(number)]
    g.add((book, RDF.type, GOT.Book))
    g.add((book, GOT.Title, Literal(bookTitle[number - 1], datatype=XSD.string)))

### Add Regions

In [13]:
# One GOT.Region node per entry of allRegions; the node id is the position in that list
for position, regionName in enumerate(allRegions):
    region = GOT['region' + str(position)]
    g.add((region, RDF.type, GOT.Region))
    g.add((region, GOT.name, Literal(regionName, datatype=XSD.string)))

### Add Houses

In [14]:
# One GOT.House node per entry of allHouses; the node id is the position in that list
for position, houseName in enumerate(allHouses):
    house = GOT['house' + str(position)]
    houseTriples = [
        (house, RDF.type, GOT.House),
        (house, GOT.name, Literal(houseName, datatype=XSD.string)),
    ]
    for triple in houseTriples:
        g.add(triple)

### Add Characters and wikidataProfiles

In [15]:
# Map each Wikidata label to the URI of the first item carrying it. Built once instead of
# scanning the table for every character. It stays empty when the Wikidata query returned
# nothing (readDataFromURL gives None in that case), so the characters are still added.
wikidataUriByLabel = {}
if wikiDataCharacters is not None:
    for label, uri in zip(wikiDataCharacters['itemLabel.value'], wikiDataCharacters['item.value']):
        wikidataUriByLabel.setdefault(label, uri)

# One GOT.Character node per entry of allCharacters; the node id is the position in that list
for i, characterName in enumerate(allCharacters):
    character = GOT['character' + str(i)]
    g.add((character, RDF.type, GOT.Character))
    g.add((character, GOT.name, Literal(characterName, datatype=XSD.string)))

    # Characters whose name equals a Wikidata label also get a wikidataProfile node about them
    wikidataUri = wikidataUriByLabel.get(characterName)
    if wikidataUri is not None:
        wikidataProfile = URIRef(wikidataUri)
        g.add((wikidataProfile, RDF.type, GOT.wikidataProfile))
        g.add((wikidataProfile, GOT['isAbout'], character))

### Add Battles

In [16]:
# One GOT.Battle node per entry of allBattles; the node id is the position in that list
for position, battleName in enumerate(allBattles):
    battle = GOT['battle' + str(position)]
    g.add((battle, RDF.type, GOT.Battle))
    g.add((battle, GOT.name, Literal(battleName, datatype=XSD.string)))

### Add Relations and Properties of Houses

In [17]:
%%time 
#measure execution time


# For every row of the houses table: link the house to its region and founder,
# and attach the sigil and blazon as string literals
for _, houseRow in housesDataset.iterrows():

    # Node ids are positions in the allHouses / allRegions / allCharacters lists
    house = GOT['house' + str(allHouses.index(houseRow['house']))]

    regionName = houseRow['region']
    if regionName != nullCharacter:
        g.add((house, GOT['region'], GOT['region' + str(allRegions.index(regionName))]))

    founderName = houseRow['founder']
    if founderName != nullCharacter:
        founder = GOT['character' + str(allCharacters.index(founderName))]
        # The relation is stated in both directions
        g.add((house, GOT['hasFounder'], founder))
        g.add((founder, GOT['founderOf'], house))

    for literalColumn in ('sigil', 'blazon'):
        literalValue = houseRow[literalColumn]
        if literalValue != nullCharacter:
            g.add((house, GOT[literalColumn], Literal(literalValue, datatype=XSD.string)))

CPU times: total: 203 ms
Wall time: 199 ms


### Add Relations and Properties of Characters

In [18]:
# Columns of the characters table, grouped by how their values are written to the graph

# Links to other characters
characterColumns = ['hasMother', 'hasFather', 'hasHeir', 'hasSpouse']
# String literals
stringColumns = ['title', 'gender', 'culture']
# Integer literals
integerColumns = ['dateOfBirth', 'dateOfDeath', 'age']
# 0/1 flags, one per book
bookColumns = ['book' + str(number) for number in range(1, 6)]
# Boolean literals
booleanColumns = ['isNoble', 'isPopular', 'isAlive']
# Float literals
floatColumns = ['popularity']

In [19]:
%%time 
#measure execution time

def _isNullValue(value):
    """Tell whether a cell holds the null marker, in its string or its numeric form.

    Numeric columns of the characters table show the marker as -1.0, which never
    compares equal to the string marker; a plain ``!= nullCharacter`` test would
    therefore let -1 through as if it were a real value.
    """
    if isinstance(value, str):
        return value.strip() == nullCharacter
    if pd.isna(value):
        return True
    try:
        return float(value) == float(nullCharacter)
    except (TypeError, ValueError):
        return False


# Name -> position lookups, built once instead of calling list.index() for every cell.
# Node ids are positions in the allCharacters / allHouses lists.
characterIndex = {name: position for position, name in enumerate(allCharacters)}
houseIndex = {name: position for position, name in enumerate(allHouses)}

# NOTE: stringColumns and integerColumns are rebound for the battles table further down;
# re-run the cell defining the character column groups before re-running this one.

#iterate over the characters dataframe
for index, row in charactersDataset.iterrows():

    character = GOT['character' + str(characterIndex[row['name']])]

    for column in stringColumns:
        if not _isNullValue(row[column]):
            g.add((character, GOT[column], Literal(row[column], datatype=XSD.string)))

    for column in integerColumns:
        if not _isNullValue(row[column]):
            g.add((character, GOT[column], Literal(int(row[column]), datatype=XSD.integer)))

    # A 1 in column bookN links the character to node bookN
    for column in bookColumns:
        if row[column] == 1:
            g.add((character, GOT['apearIn'], GOT[column]))

    for column in booleanColumns:
        if not _isNullValue(row[column]):
            # bool() gives the literal the lexical form true/false instead of 1/0
            g.add((character, GOT[column], Literal(bool(row[column]), datatype=XSD.boolean)))

    for column in floatColumns:
        if not _isNullValue(row[column]):
            g.add((character, GOT[column], Literal(float(row[column]), datatype=XSD.float)))

    # Mother, father, heir and spouse point to other character nodes
    for column in characterColumns:
        if not _isNullValue(row[column]):
            g.add((character, GOT[column], GOT['character' + str(characterIndex[row[column]])]))

    if not _isNullValue(row['hasHouse']):
        g.add((character, GOT['hasHouse'], GOT['house' + str(houseIndex[row['hasHouse']])]))


CPU times: total: 2.02 s
Wall time: 2.22 s


### Add Relations and Properties of Battles

In [20]:
# Columns of the battles table, grouped by literal type.
# These rebind the names that were used for the characters table earlier in the notebook.
stringColumns = [
    'location',
    'attackerOutcome',
    'battleType',
]
integerColumns = [
    'year',
    'attackerSize',
    'defenderSize',
]

In [21]:
%%time 
#measure execution time


# For every row of the battles table: attach the literals, then link the battle to its
# region, to the attacking / defending houses and to the kings and commanders involved
for _, battleRow in battlesDataset.iterrows():

    # Node ids are positions in the allBattles / allRegions / allHouses / allCharacters lists
    battle = GOT['battle' + str(allBattles.index(battleRow['name']))]

    for column in stringColumns:
        value = battleRow[column]
        if value != nullCharacter:
            g.add((battle, GOT[column], Literal(value, datatype=XSD.string)))

    for column in integerColumns:
        value = battleRow[column]
        if value != nullCharacter:
            g.add((battle, GOT[column], Literal(int(value), datatype=XSD.integer)))

    regionName = battleRow['region']
    if regionName != nullCharacter:
        g.add((battle, GOT['region'], GOT['region' + str(allRegions.index(regionName))]))

    # Up to four houses per side, in columns attacker_1..4 and defender_1..4
    for slot in range(1, 5):
        for side in ('attacker', 'defender'):
            houseName = battleRow[side + '_' + str(slot)]
            if houseName != nullCharacter:
                g.add((battle, GOT[side + 'House'], GOT['house' + str(allHouses.index(houseName))]))

    # columnDict maps each split column (kings, commanders) to its number of numbered columns
    for role, slotCount in columnDict.items():
        for slot in range(slotCount):
            personName = battleRow[role + str(slot)]
            if personName != nullCharacter:
                g.add((battle, GOT[role], GOT['character' + str(allCharacters.index(personName))]))



CPU times: total: 62.5 ms
Wall time: 62 ms


### Save Graph

In [22]:
%%time
# Serialize the graph in the Turtle format and write it to <savePath>/GOT.ttl
print("--- saving serialization ---")

# Create the output folder when it is missing so that open() does not fail on a fresh checkout
os.makedirs(savePath, exist_ok=True)

turtleText = g.serialize(format='turtle')
# Depending on the rdflib release, serialize() may hand back bytes rather than str
if isinstance(turtleText, bytes):
    turtleText = turtleText.decode('utf-8')

# Explicit UTF-8: the platform default encoding may be unable to encode non-ASCII characters
with open(savePath + '/GOT.ttl', 'w', encoding='utf-8') as file:
    file.write(turtleText)


--- saving serialization ---
CPU times: total: 2.41 s
Wall time: 3.04 s
