# Knowledge Graph build and data processing
<br>
1) Build knowledge graph using RDFLib, based on SiriusGeoOnto format
<br>
2) Add classes and individuals to the knowledge graph from geolit_dv.csv
<br>
3) Extract triples from turtle file and remove URI
<br>
4) Knowledge graph metrics
<br>
5) Split data to evaluation into train, validation, test
<br>
6) Extract entites and relations for modelling

## 1) Build knowledge graph using RDFLib, based on SiriusGeoOnto format

In [1]:
from rdflib import Graph, Literal, RDF, URIRef, BNode, Literal, Namespace
from rdflib.namespace import FOAF, XSD, RDF, RDFS, DCTERMS, OWL

# Initialise graph
g = Graph()

In [2]:
#Import namespace from SiriusGeoOnto to use are URI
n = Namespace('http://no.sirius.ontology/geological-ontology#')
g.bind('', n)
g.bind('rdf', RDF)
g.bind('terms', DCTERMS)
g.bind('owl', OWL)

In [3]:
#Object properties

hasNext = n['hasNext']
g.add((hasNext, RDF.type, OWL.ObjectProperty))        

hasDirectNext = n['hasDirectNext']
g.add((hasDirectNext, RDF.type, OWL.ObjectProperty))
g.add((hasDirectNext, RDFS.subPropertyOf, hasNext))

hasPrevious = n['hasPrevious']
g.add((hasPrevious, RDF.type, OWL.ObjectProperty))  
g.add((hasPrevious, OWL.inverseOf, hasNext))  

hasDirectPrevious = n['hasDirectPrevious']
g.add((hasDirectPrevious, RDF.type, OWL.ObjectProperty))
g.add((hasDirectNext, RDFS.subPropertyOf, hasDirectPrevious))
g.add((hasDirectPrevious, OWL.inverseOf, hasDirectNext)) 

isMemberOf = n['isMemberOf']
g.add((isMemberOf, RDF.type, OWL.ObjectProperty))

depositedIn = n['depositedIn']
g.add((depositedIn, RDF.type, OWL.ObjectProperty))

constitutedBy = n['constitutedBy']
g.add((constitutedBy, RDF.type, OWL.ObjectProperty))

hasAge = n['hasAge']
g.add((hasAge, RDF.type, OWL.ObjectProperty))

contains = n['contains']
g.add((contains, RDF.type, OWL.ObjectProperty))

locatedIn= n['locatedIn']
g.add((locatedIn, RDF.type, OWL.ObjectProperty))

In [4]:
#Depositional environment class and subclasses

DepositionalEnvironment = n['DepositionalEnvironment']
g.add((DepositionalEnvironment, RDF.type, OWL.Class))

DepositionalEnvironmentSub = ['ContinentalDepositionalEnvironment', 'GlacialDepositionalEnvironment', 'MarineDepositionalEnvironment', 'TransitionalDepositionalEnvironment', 'VolcanicDepositionalEnvironment']

for className in DepositionalEnvironmentSub:
    className = n[className]
    g.add((className, RDF.type, OWL.Class))
    g.add((className, RDFS.subClassOf, DepositionalEnvironment))

    ContinentalDepositionalEnvironment = n['ContinentalDepositionalEnvironment']
    ContinentalDepositionalEnvironmentSub = ['AeolianDepositionalEnvironment', 'AlluvialDepositionalEnvironment', 'FluvialDepositionalEnvironment', 'LacustrineDepositionalEnvironment']
    for className in ContinentalDepositionalEnvironmentSub:
        className = n[className]
        g.add((className, RDF.type, OWL.Class))
        g.add((className, RDFS.subClassOf, ContinentalDepositionalEnvironment))

        AeolianDepositionalEnvironment = n['AeolianDepositionalEnvironment']
        AeolianDepositionalEnvironmentSub = ['AeolianDepositionalEnv']
        for item in AeolianDepositionalEnvironmentSub:
            item = n[item]
            g.add((item, RDF.type, OWL.NamedIndividual))
            g.add((item, RDF.type, AeolianDepositionalEnvironment))

        AlluvialDepositionalEnvironment = n['AlluvialDepositionalEnvironment']
        AlluvialDepositionalEnvironmentSub = ['AlluvialChannel', 'AlluvialDepositionalEnv', 'AlluvialFan']
        for item in AlluvialDepositionalEnvironmentSub:
            item = n[item]
            g.add((item, RDF.type, OWL.NamedIndividual))
            g.add((item, RDF.type, AlluvialDepositionalEnvironment))

        FluvialDepositionalEnvironment = n['FluvialDepositionalEnvironment']
        FluvialDepositionalEnvironmentSub = ['FluvialChannel', 'FluvialDepositionalEnv', 'FluvioDeltaicDepositionalEnvironment', 'FluvioLacustrineDepositionalEnv']
        for item in FluvialDepositionalEnvironmentSub:
            item = n[item]
            g.add((item, RDF.type, OWL.NamedIndividual))
            g.add((item, RDF.type, FluvialDepositionalEnvironment))

        LacustrineDepositionalEnvironment = n['LacustrineDepositionalEnvironment']
        LacustrineDepositionalEnvironmentSub = ['FluvioLacustrineDepositionalEnvironment', 'LacustrineDepositionalEnv']
        for item in LacustrineDepositionalEnvironmentSub:
            item = n[item]
            g.add((item, RDF.type, OWL.NamedIndividual))
            g.add((item, RDF.type, LacustrineDepositionalEnvironment))
    
## Marine Depositonal Environment
    MarineDepositionalEnvironment = n['MarineDepositionalEnvironment']
    MarineDepositionalEnvSub = ['CoastalToShallowMarineDepositionalEnvironment', 'InnerShelfDepositionalEnvironment', 'MarineDepositionalEnv', 
                                 'OpenMarine', 'OuterShelfDepositionalEnvironment', 'ReefalDepositionalEnvironment', 'ShelfDepositionalEnvironment', 'SlopeDepositionalEnvironment']
    for marineSub in MarineDepositionalEnvSub:
        marineSub = n[marineSub]        
        g.add((marineSub, RDF.type, OWL.NamedIndividual))
        g.add((marineSub, RDF.type, MarineDepositionalEnvironment))

    MarineDepositionalEnvironmentSub = ['DeepMarineDepositionalEnvironment', 'ShallowMarineDepositionalEnvironment']
    for className in MarineDepositionalEnvironmentSub:
        className = n[className]
        g.add((className, RDF.type, OWL.Class))
        g.add((className, RDFS.subClassOf, MarineDepositionalEnvironment))

        DeepMarineDepositionalEnvironment = n['DeepMarineDepositionalEnvironment']
        DeepMarineDepositionalEnvironmentSub = ['AbyssalChannel', 'AbyssalFan', 'DeepMarineDepositionalEnvironment', 'ShallowMarineToDeepMarineDepositionalEnvironment', 'Turbidity-flowDeposit']
        for item in DeepMarineDepositionalEnvironmentSub:
            item = n[item]
            g.add((item, RDF.type, OWL.NamedIndividual))
            g.add((item, RDF.type, DeepMarineDepositionalEnvironment))    
    
        ShallowMarineDepositionalEnvironment = n['ShallowMarineDepositionalEnvironment']
        ShallowMarineDepositionalEnvironmentSub = ['ShallowMarineToDeepMarineDepositionalEnvironment', 'ShallowMarineDepositionalEnv']
        for item in ShallowMarineDepositionalEnvironmentSub:
            item = n[item]
            g.add((item, RDF.type, OWL.NamedIndividual))
            g.add((item, RDF.type, ShallowMarineDepositionalEnvironment))    

##Transitional Environment         
    TransitionalDepositionalEnvironment = n['TransitionalDepositionalEnvironment']
    TransitionalDepositionalEnvSub = ['CoastalPlain', 'CoastalToShallowMarineDepositionalEnvironment', 'LowerCoastalPlain', 'UpperCoastalPlain']
    for transitionalSub in TransitionalDepositionalEnvSub:
        transitionalSub = n[transitionalSub]  
        g.add((transitionalSub, RDF.type, OWL.NamedIndividual))
        g.add((transitionalSub, RDF.type, TransitionalDepositionalEnvironment))

    ##Transitional Environment    
    TransitionalDepositionalEnvironment = n['TransitionalDepositionalEnvironment']    
    TransitionalDepositionalEnvironmentSub = ['BeachDepositionalEnvironment', 'DeltaicDepositionalEnvironment', 'LagoonalDepositionalEnvironment', 'TidalDepositionalEnvironment']
    for className in TransitionalDepositionalEnvironmentSub:
        className = n[className]
        g.add((className, RDF.type, OWL.Class))
        g.add((className, RDFS.subClassOf, TransitionalDepositionalEnvironment))  
    
        BeachDepositionalEnvironment = n['BeachDepositionalEnvironment']
        BeachDepositionalEnvironmentSub = ['BackshoreDepositionalEnvironment', 'BeachDepositionalEnvironment', 'ForeshoreDepositionalEnvironment', 'LowerShorefaceDepositionalEnvironment', 'ShorefaceDepositionalEnvironment', 'UppershorefaceDepositionalEnvironment']
        for item in BeachDepositionalEnvironmentSub:
            item = n[item]
            g.add((item, RDF.type, OWL.NamedIndividual))
            g.add((item, RDF.type, BeachDepositionalEnvironment))   

        DeltaicDepositionalEnvironment = n['DeltaicDepositionalEnvironment']
        DeltaicDepositionalEnvironmentSub = ['Clinoforms', 'Delta', 'DeltaFrontDepositionalEnvironment', 'DeltaPlainDepositionalEnvironment', 'FluvioDeltaicDepositionalEnvironment', 'ProdeltaDepositionalEnvironment']
        for item in DeltaicDepositionalEnvironmentSub:
            item = n[item]
            g.add((item, RDF.type, OWL.NamedIndividual))
            g.add((item, RDF.type, DeltaicDepositionalEnvironment))   

        LagoonalDepositionalEnvironment = n['LagoonalDepositionalEnvironment']
        LagoonalDepositionalEnvironmentSub = ['LagoonalDepositionalEnv']
        for item in LagoonalDepositionalEnvironmentSub:
            item = n[item]
            g.add((item, RDF.type, OWL.NamedIndividual))
            g.add((item, RDF.type, LagoonalDepositionalEnvironment))   

        TidalDepositionalEnvironment = n['TidalDepositionalEnvironment']
        TidalDepositionalEnvironmentSub = ['SubtidalPlatform', 'TidalFlatDepositionalEnv', 'TidalPlatform']
        for item in TidalDepositionalEnvironmentSub:
            item = n[item]
            g.add((item, RDF.type, OWL.NamedIndividual))
            g.add((item, RDF.type, TidalDepositionalEnvironment)) 
    
GlacialDepositionalEnvironment = n['GlacialDepositionalEnvironment']
GlacialDepositionalEnvironmentSub = ['GlacialDepositionalEnv']
for item in GlacialDepositionalEnvironmentSub:
    item = n[item]
    g.add((item, RDF.type, OWL.NamedIndividual))
    g.add((item, RDF.type, GlacialDepositionalEnvironment)) 

VolcanicDepositionalEnvironment= n['VolcanicDepositionalEnvironment']
VolcanicDepositionalEnvironmentSub = ['VolcanicDepositionalEnv']
for item in VolcanicDepositionalEnvironmentSub:
    item = n[item]
    g.add((item, RDF.type, OWL.NamedIndividual))
    g.add((item, RDF.type, VolcanicDepositionalEnvironment)) 

In [5]:
#Matter class, fossil and rock subclasses and individuals

Matter = n['Matter']
g.add((Matter, RDF.type, OWL.Class))

MatterSub = ['Fossil', 'Rock']
for className in MatterSub:
    className = n[className]
    g.add((className, RDF.type, OWL.Class))
    g.add((className, RDFS.subClassOf, Matter))

    Fossil = n['Fossil']
    FossilSub = ['CalcareousFossil', 'AgglutinatedFossil', 'SiliceousFossil', 'PhosphaticFossil', 'OrganicWalledFossil']
    for className in FossilSub:
        className = n[className]
        g.add((className, RDF.type, OWL.Class))
        g.add((className, RDFS.subClassOf, Fossil))

        CalcareousFossil = n['CalcareousFossil']
        CalcareousFossilSub = ['Foraminifera', 'Ostracods', 'CalcareousNannofossils']
        for item in CalcareousFossilSub:
            item = n[item]
            g.add((item, RDF.type, OWL.NamedIndividual))
            g.add((item, RDF.type, CalcareousFossil))

        AgglutinatedFossil = n['AgglutinatedFossil']
        AgglutinatedFossilSub = ['AgglutinatedForaminifera']
        for item in AgglutinatedFossilSub:
            item = n[item]
            g.add((item, RDF.type, OWL.NamedIndividual))
            g.add((item, RDF.type, AgglutinatedFossil))    

        SiliceousFossil = n['SiliceousFossil']
        SiliceousFossilSub = ['Chitinozoans', 'PollenSpores', 'Acritarchs', 'Dinoflagellates', 'Palynomorphs']
        for item in SiliceousFossilSub:
            item = n[item]
            g.add((item, RDF.type, OWL.NamedIndividual))
            g.add((item, RDF.type, SiliceousFossil))  

        PhosphaticFossil = n['PhosphaticFossil']
        PhosphaticFossilSub = ['Conodonts']
        for item in PhosphaticFossilSub:
            item = n[item]
            g.add((item, RDF.type, OWL.NamedIndividual))
            g.add((item, RDF.type, PhosphaticFossil)) 

        OrganicWalledFossil = n['OrganicWalledFossil']
        OrganicWalledFossilSub = ['Radiolarians', 'Diatoms', 'Silicoflagellates']
        for item in OrganicWalledFossilSub:
            item = n[item]
            g.add((item, RDF.type, OWL.NamedIndividual))
            g.add((item, RDF.type, OrganicWalledFossil))   

    Rock = n['Rock']
    RockSub = ['IgneousRock', 'MetamorphicRock', 'SedimentaryRock']
    for className in RockSub:
        className = n[className]
        g.add((className, RDF.type, OWL.Class))
        g.add((className, RDFS.subClassOf, Rock))

        IgneousRock = n['IgneousRock']
        IgneousRockSub = ['IgneousExtrusiveRock', 'IgneousIntrusiveRock']
        for className in IgneousRockSub:
            className = n[className]
            g.add((className, RDF.type, OWL.Class))
            g.add((className, RDFS.subClassOf, IgneousRock))

            IgneousExtrusiveRock = n['IgneousExtrusiveRock']
            IgneousExtrusiveRockSub = ['Andesite', 'AndesiteLava', 'Basalt', 'Dacite', 'Obsidian', 'Pumice', 
                                       'Rhyolite', 'Scoria', 'Tuff', 'VolcanicDeposits', 'VolcanicTuff']
            for individual in IgneousExtrusiveRockSub:
                individual = n[individual]
                g.add((individual, RDF.type, OWL.NamedIndividual))
                g.add((individual, RDF.type, IgneousExtrusiveRock))

            IgneousIntrusiveRock = n['IgneousIntrusiveRock']
            IgneousIntrusiveRockSub = ['Diorite', 'Gabbro', 'Granite', 'Pegmatite', 'Peridotite']
            for individual in IgneousIntrusiveRockSub:
                individual = n[individual]
                g.add((individual, RDF.type, OWL.NamedIndividual))
                g.add((individual, RDF.type, IgneousIntrusiveRock))

        MetamorphicRock =n['MetamorphicRock']
        MetamorphicRockSub = ['MetamorphicFoliatedRock', 'MetamorphicNonFoliatedRock']
        for className in MetamorphicRockSub:
            className = n[className]
            g.add((className, RDF.type, OWL.Class))
            g.add((className, RDFS.subClassOf, MetamorphicRock))

        MetamorphicFoliatedRock =n['MetamorphicFoliatedRock']    
        MetamorphicFoliatedRockSub = ['Gneiss', 'Phyllite', 'Schist', 'Slate']
        for item in MetamorphicFoliatedRockSub:
            item = n[item]
            g.add((item, RDF.type, OWL.NamedIndividual))
            g.add((item, RDF.type, MetamorphicFoliatedRock))

        MetamorphicNonFoliatedRock =n['MetamorphicNonFoliatedRock']
        MetamorphicNonFoliatedRockSub = ['Marble', 'NonFoliatedRocks', 'Quartzite']
        for item in MetamorphicNonFoliatedRockSub:
            item = n[item]
            g.add((item, RDF.type, OWL.NamedIndividual))
            g.add((item, RDF.type, MetamorphicNonFoliatedRock))

        SedimentaryRock = n['SedimentaryRock']    
        SedimentaryRockSub = ['Anhydrite', 'Breccia', 'Carbonate', 'Chalk', 'Chert', 'Coal', 'CoalyShale', 'Conglomerate',
                              'Dolostone', 'Evaporites', 'Halite', 'Limestone', 'Marl', 'Mudstone', 'OrganicRichShale', 
                              'Salt', 'Sandstone', 'SedimentaryRocks', 'Shale', 'Siltstone', 'Spiculite']
        for item in SedimentaryRockSub:
            item = n[item]
            g.add((item, RDF.type, OWL.NamedIndividual))
            g.add((item, RDF.type, SedimentaryRock))

In [6]:
#Location class, GeoArea subclass and individuals

Location = n['Location']
g.add((Location, RDF.type, OWL.Class))

LocationSub = ['GeoArea']
for className in LocationSub:
    className = n[className]
    g.add((className, RDF.type, OWL.Class))
    g.add((className, RDFS.subClassOf, Location))   
    
GeoArea = n['GeoArea']
g.add((GeoArea, RDF.type, OWL.Class))

GeoAreaSub = ['CentralNorthSea', 'SouthernNorthSea', 'NorthernNorthSea']
for className in GeoAreaSub:
    className = n[className]
    g.add((className, RDF.type, OWL.NamedIndividual))
    g.add((className, RDF.type, GeoArea))

In [7]:
#GeochronologicTime class

GeochronologicTime = n['GeochronologicTime']
g.add((GeochronologicTime, RDF.type, OWL.Class))

GeochronologicTimeSub = ['PhanerozoicEon', 'GeologicalObject']
for className in GeochronologicTimeSub:
    className = n[className]
    g.add((className, RDF.type, OWL.Class))
    g.add((className, RDFS.subClassOf, GeochronologicTime)) 

    GeologicalObject = n['GeologicalObject']
    GeologicalObjectSub = ['GeologicalUnit']
    for className in GeologicalObjectSub:
        className = n[className]
        g.add((className, RDF.type, OWL.Class))
        g.add((className, RDFS.subClassOf, GeologicalObject))
        
        GeologicalUnit = n['GeologicalUnit']
        LithostratigraphicUnitSub = ['LithostratigraphicUnit']
        for className in LithostratigraphicUnitSub:
            className = n[className]
            g.add((className, RDF.type, OWL.Class))
            g.add((className, RDFS.subClassOf, GeologicalUnit))
            g.add((depositedIn, OWL.someValuesFrom, DepositionalEnvironment)) 
    
    PhanerozoicEon = n['PhanerozoicEon']
    PhanerozoicEonSub = ['MesozoicEra', 'PaleozoicEra', 'CenozoicEra']
    for className in PhanerozoicEonSub:
        className = n[className]
        g.add((className, RDF.type, OWL.Class))
        g.add((className, RDFS.subClassOf, PhanerozoicEon))
        
        MesozoicEra = n['MesozoicEra']
        MesozoicEraSub = ['CretaceousPeriod', 'JurassicPeriod', 'TriassicPeriod']
        for className in MesozoicEraSub:
            className = n[className]
            g.add((className, RDF.type, OWL.Class))
            g.add((className, RDFS.subClassOf, MesozoicEra))
            
            ##Cretaceous
            CretaceousPeriod = n['CretaceousPeriod']
            CretaceousPeriodSub = ['LowerCretaceousEpoch', 'UpperCretaceousEpoch']
            Cretaceous = n['Cretaceous']
            g.add((Cretaceous, RDF.type, OWL.NamedIndividual))
            g.add((Cretaceous, RDF.type, CretaceousPeriod))
            
            for className in CretaceousPeriodSub:
                className = n[className]
                g.add((className, RDF.type, OWL.Class))
                g.add((className, RDFS.subClassOf, CretaceousPeriod))
                
                LowerCretaceous = n['LowerCretaceous']
                LowerCretaceousEpoch = n['LowerCretaceousEpoch']
                g.add((LowerCretaceous, RDF.type, OWL.NamedIndividual))
                g.add((LowerCretaceous, RDF.type, LowerCretaceousEpoch))
                
                UpperCretaceous = n['UpperCretaceous']
                UpperCretaceousEpoch = n['UpperCretaceousEpoch']
                g.add((UpperCretaceous, RDF.type, OWL.NamedIndividual))
                g.add((UpperCretaceous, RDF.type, UpperCretaceousEpoch))

            JurassicPeriod = n['JurassicPeriod']
            JurassicPeriodSub = ['LowerJurassicEpoch', 'MiddleJurassicEpoch', 'UpperJurassicEpoch']
            Jurassic = n['Jurassic']
            g.add((Jurassic, RDF.type, OWL.NamedIndividual))
            g.add((Jurassic, RDF.type, JurassicPeriod))
            
            for className in JurassicPeriodSub:
                className = n[className]
                g.add((className, RDF.type, OWL.Class))
                g.add((className, RDFS.subClassOf, JurassicPeriod))
                
                LowerJurassic = n['LowerJurassic']
                LowerJurassicEpoch = n['LowerJurassicEpoch']
                g.add((LowerJurassic, RDF.type, OWL.NamedIndividual))
                g.add((LowerJurassic, RDF.type, LowerJurassicEpoch))
                
                MiddleJurassic = n['MiddleJurassic']
                MiddleJurassicEpoch = n['MiddleJurassicEpoch']
                g.add((MiddleJurassic, RDF.type, OWL.NamedIndividual))
                g.add((MiddleJurassic, RDF.type, MiddleJurassicEpoch))
                
                UpperJurassic = n['UpperJurassic']
                UpperJurassicEpoch = n['UpperJurassicEpoch']
                g.add((UpperJurassic, RDF.type, OWL.NamedIndividual))
                g.add((UpperJurassic, RDF.type, UpperJurassicEpoch))

            TriassicPeriod = n['TriassicPeriod']
            Triassic = n['Triassic']
            g.add((Triassic, RDF.type, OWL.NamedIndividual))
            g.add((Triassic, RDF.type, TriassicPeriod))
            
            TriassicPeriodSub = ['LowerTriassicEpoch', 'MiddleTriassicEpoch', 'UpperTriassicEpoch']
            for className in TriassicPeriodSub:
                className = n[className]
                g.add((className, RDF.type, OWL.Class))
                g.add((className, RDFS.subClassOf, TriassicPeriod))

                LowerTriassic = n['LowerTriassic']
                LowerTriassicEpoch = n['LowerTriassicEpoch']
                g.add((LowerTriassic, RDF.type, OWL.NamedIndividual))
                g.add((LowerTriassic, RDF.type, LowerTriassicEpoch))

                MiddleTriassic = n['MiddleTriassic']
                MiddleTriassicEpoch = n['MiddleTriassicEpoch']
                g.add((MiddleTriassic, RDF.type, OWL.NamedIndividual))
                g.add((MiddleTriassic, RDF.type, MiddleTriassicEpoch))

                UpperTriassic = n['UpperTriassic']
                UpperTriassicEpoch = n['UpperTriassicEpoch']
                g.add((UpperTriassic, RDF.type, OWL.NamedIndividual))
                g.add((UpperTriassic, RDF.type, UpperTriassicEpoch))
                
        PaleozoicEra = n['PaleozoicEra']
        PaleozoicEraSub = ['PermianPeriod', 'CarboniferousPeriod']
        for className in PaleozoicEraSub:
            className = n[className]
            g.add((className, RDF.type, OWL.Class))
            g.add((className, RDFS.subClassOf, PaleozoicEra))
            
            PermianPeriodSub = ['LowerPermian', 'UpperPermian', 'Permian']
            PermianPeriod = n['PermianPeriod']
            for indiName in PermianPeriodSub:
                indiName = n[indiName]
                g.add((indiName, RDF.type, OWL.NamedIndividual))
                g.add((indiName, RDF.type, PermianPeriod))
                
            CarboniferousPeriodSub = ['LowerCarboniferous', 'UpperCarboniferous', 'Carboniferous']
            CarboniferousPeriod = n['CarboniferousPeriod']
            for indiName in CarboniferousPeriodSub:
                indiName = n[indiName]
                g.add((indiName, RDF.type, OWL.NamedIndividual))
                g.add((indiName, RDF.type, CarboniferousPeriod))
                
        CenozoicEra = n['CenozoicEra']
        CenozoicEraSub = ['NeogenePeriod', 'PaleogenePeriod', 'QuaternaryPeriod']
        for className in CenozoicEraSub:
            className = n[className]
            g.add((className, RDF.type, OWL.Class))
            g.add((className, RDFS.subClassOf, CenozoicEra))
            
            PaleogenePeriodSub = ['Paleocene', 'Eocene', 'Oligocene']
            PaleogenePeriod = n['PaleogenePeriod']
            for indiName in PaleogenePeriodSub:
                indiName = n[indiName]
                g.add((indiName, RDF.type, OWL.NamedIndividual))
                g.add((indiName, RDF.type, PaleogenePeriod))
                
            NeogenePeriodSub = ['Neogene']
            NeogenePeriod = n['NeogenePeriod']
            for indiName in NeogenePeriodSub:
                indiName = n[indiName]
                g.add((indiName, RDF.type, OWL.NamedIndividual))
                g.add((indiName, RDF.type, NeogenePeriod))
                
            QuaternaryPeriodSub = ['Quaternary']
            QuaternaryPeriod = n['QuaternaryPeriod']
            for indiName in QuaternaryPeriodSub:
                indiName = n[indiName]
                g.add((indiName, RDF.type, OWL.NamedIndividual))
                g.add((indiName, RDF.type, QuaternaryPeriod))
                
           
#Add hasDirectNext relationship
g.add((LowerTriassic, hasDirectNext, MiddleTriassic))
g.add((MiddleTriassic, hasDirectNext, UpperTriassic))
g.add((UpperTriassic, hasDirectNext, LowerJurassic))
g.add((LowerJurassic, hasDirectNext, MiddleJurassic))
g.add((MiddleJurassic, hasDirectNext, UpperJurassic))
g.add((UpperJurassic, hasDirectNext, LowerCretaceous))
g.add((LowerCretaceous, hasDirectNext, UpperCretaceous))

## 2) Add classes and individuals to the knowledge graph from geolit_db.csv

In [8]:
# Import geolit_db.csv
import csv
reader = csv.DictReader(open('geolit_db.csv'))
geodict = []
for line in reader:
    geodict.append(line)

In [9]:
#Function that removed duplicates from a list
def remove_dups_list(items):
    return list(set(items))

#Extracting LithostratographicUnit individuals from dataset
lithoFm = []
lithoGp = []

for key in geodict:
    lithoFm.append(key['lithofm'])
    lithoGp.append(key['lithogp'])

individuals = remove_dups_list(lithoFm) + remove_dups_list(lithoGp)
#print(individuals)

In [10]:
## Create individuals and their relationships

for individual in individuals:
    individual = n[individual]
    g.add((individual, RDF.type, n['LithostratigraphicUnit']))
    g.add((individual, RDF.type, OWL.NamedIndividual))

for key in geodict:
    lithoFm = n[key['lithofm']]
    lithoGp = n[key['lithogp']]
    geoArea = n[key['geoarea1']]
    geoArea2 = n[key['geoarea2']]
    chronoAge = n[key['chrono']]
    rock1 = n[key['rock1']]
    rock2 = n[key['rock2']]
    rock3 = n[key['rock3']]
    fossil1 = n[key['fossils1']]
    fossil2 = n[key['fossils2']]
    depoMain = n[key['depo']]
    depo2 = n[key['depo2']]
    depo3 = n[key['depo3']]
    depo4 = n[key['depo4']]
        
    g.add((lithoFm, isMemberOf, lithoGp))
    g.add((lithoFm, hasAge, chronoAge))
    g.add((lithoFm, locatedIn, geoArea))
    g.add((lithoFm, constitutedBy, rock1))
    g.add((lithoFm, depositedIn, depoMain))
    
    
    if key['rock2'] == '':
        continue
    else:
        g.add((lithoFm, constitutedBy, rock2))
    
    if key['rock3'] == '':
        continue
    else:
        g.add((lithoFm, constitutedBy, rock3))
    
    if key['fossils1'] == '':
        continue
    else:
        g.add((lithoFm, contains, fossil1))
    
    if key['fossils2'] == '':
        continue
    else:
        g.add((lithoFm, contains, fossil2))
    
    if key['depo2'] == '':
        continue
    else:
        g.add((lithoFm, depositedIn, depo2))
    
    if key['depo3'] == '':
        continue
    else:
        g.add((lithoFm, depositedIn, depo3))
    
    if key['depo4'] == '':
        continue
    else:
        g.add((lithoFm, depositedIn, depo4))
        
    if key['geoArea2'] == '':
        continue
    else:
        g.add((lithoFm, locatedIn, geoArea2))
    
    #if key['geoArea2'] == '':
        #continue
    #else:
        #g.add((lithoFm, locatedIn, geoArea2))

In [11]:
# Size of graph
print(f'Graph has {len(g)}  facts')

Graph has 1216  facts


In [None]:
# Remove hash to view turtle formatted data of 1216 facts
#print(g.serialize(format='ttl').decode('u8'))

In [None]:
# Remove hash to view triples
#for index, (sub, pred, obj) in enumerate(g):
    #print(sub, pred, obj)
    #if index == 10:
        #break

In [13]:
# Write to ttl format
#g.serialize('geolit.ttl',format='ttl')

## 3) Extract triples from turtle file and remove URI

In [17]:
# Remove URI from triples using function
# From https://github.com/ernestojimenezruiz/tabular-data-semantics-py/blob/master/TabularSemantics/src/util/utilities.py

def getEntityName(uri):
    
    if "#" in uri:
        splits = uri.split("#")
        if len(splits[1])>0:
            return splits[1]
        else:
            return uri
        
    elif "/" in uri:
        splits = uri.split("/")
        
        i = len(splits)
        
        if len(splits[i-1])>0:
            return splits[i-1]
        else:
            return uri
        
    return uri

In [18]:
import numpy as np
import pandas as pd
geoT = []
for index, (sub, pred, obj) in enumerate(g):
    geoT.append((getEntityName(sub), getEntityName(pred), getEntityName(obj)))
    
#for row in geoTriples:
geoTriples_df = pd.DataFrame(geoT)

In [21]:
#Drop "object, predicate" rows for "type, CLass" and "type, NamedIndividual", along with other descriptors not required for triples data
geoTriples_df = geoTriples_df[(geoTriples_df.iloc[:,0] != 'Class') & (geoTriples_df.iloc[:,2] != 'Class')]
geoTriples_df = geoTriples_df[(geoTriples_df.iloc[:,0] != 'NamedIndividual') & (geoTriples_df.iloc[:,2] != 'NamedIndividual')]
geoTriples_df = geoTriples_df[(geoTriples_df.iloc[:,0] != 'hasNext') & (geoTriples_df.iloc[:,2] != 'hasNext')]
geoTriples_df = geoTriples_df[(geoTriples_df.iloc[:,0] != 'objectProperty') & (geoTriples_df.iloc[:,2] != 'objectProperty')]
geoTriples_df = geoTriples_df[(geoTriples_df.iloc[:,0] != 'isMemberOf') & (geoTriples_df.iloc[:,2] != 'isMemberOf')]
geoTriples_df = geoTriples_df[(geoTriples_df.iloc[:,0] != 'GeologicalObject') & (geoTriples_df.iloc[:,2] != 'GeologicalObject')]
geoTriples_df = geoTriples_df[(geoTriples_df.iloc[:,0] != 'hasDirectPrevious')] 
geoTriples_df = geoTriples_df[(geoTriples_df.iloc[:,0] != 'hasPrevious')] 
geoTriples_df = geoTriples_df[(geoTriples_df.iloc[:,0] != 'constitutedBy')] 
geoTriples_df = geoTriples_df[(geoTriples_df.iloc[:,0] != 'hasDirectNext')] 
geoTriples_df = geoTriples_df[(geoTriples_df.iloc[:,0] != 'depositedIn')] 
geoTriples_df = geoTriples_df[(geoTriples_df.iloc[:,0] != 'hasAge')] 
geoTriples_df = geoTriples_df[(geoTriples_df.iloc[:,0] != 'matter')]
geoTriples_df = geoTriples_df[(geoTriples_df.iloc[:,0] != 'GeoArea') & (geoTriples_df.iloc[:,2] != 'GeoArea')]
geoTriples_df = geoTriples_df[(geoTriples_df.iloc[:,0] != 'contains')]
geoTriples_df = geoTriples_df[(geoTriples_df.iloc[:,0] != 'location')]
geoTriples_df = geoTriples_df[(geoTriples_df.iloc[:,0] != 'locatedIn')]

In [22]:
# Size of new graph
print(f'Graph has {len(geoTriples_df)} facts')

Graph has 893 facts


In [None]:
# Change format of triple as required by BERT
geoTriples_df_formatted = geoTriples_df.replace(to_replace=r"([A-Z])", value= r"_\1", regex=True)
geoTriples_df_formatted[0] = geoTriples_df_formatted[0].str.lower()
geoTriples_df_formatted[1] = geoTriples_df_formatted[1].str.lower()
geoTriples_df_formatted[2] = geoTriples_df_formatted[2].str.lower()
geoTriples_df_formatted[0] = geoTriples_df_formatted[0].map(lambda x: x.lstrip('_'))
geoTriples_df_formatted[1] = geoTriples_df_formatted[1].map(lambda x: x.lstrip('_'))
geoTriples_df_formatted[2] = geoTriples_df_formatted[2].map(lambda x: x.lstrip('_'))

In [None]:
geoTriples_df_formatted[0] = geoTriples_df_formatted[0].str.replace(r"fm", r"formation")
geoTriples_df_formatted[0] = geoTriples_df_formatted[0].str.replace(r"gp", r"group")
geoTriples_df_formatted[0] = geoTriples_df_formatted[0].str.replace(r"mb", r"member")
geoTriples_df_formatted[0] = geoTriples_df_formatted[0].str.replace(r"env", r"environment")
geoTriples_df_formatted[0] = geoTriples_df_formatted[0].str.replace(r"environmentironment", r"environment")
geoTriples_df_formatted[1] = geoTriples_df_formatted[1].str.replace(r"env", r"environment")
geoTriples_df_formatted[1] = geoTriples_df_formatted[1].str.replace(r"environmentironment", r"environment")
geoTriples_df_formatted[2] = geoTriples_df_formatted[2].str.replace(r"fm", r"formation")
geoTriples_df_formatted[2] = geoTriples_df_formatted[2].str.replace(r"gp", r"group")
geoTriples_df_formatted[2] = geoTriples_df_formatted[2].str.replace(r"mb", r"member")
geoTriples_df_formatted[2] = geoTriples_df_formatted[2].str.replace(r"env", r"environment")
geoTriples_df_formatted[2] = geoTriples_df_formatted[2].str.replace(r"environmentironment", r"environment")

## 4) Knowledge graph metrics
Entity and relation density

In [49]:
def density(triples, entities, relations):
    ED = (2*triples) / entities
    RD = triples / relations
    print(f"Entities density: {ED}")
    print(f"Relations density: {RD}")

In [50]:
triples_count = len(geoTriples_df_formatted)

uniqueRelations = len(geoTriples_df_formatted[1].unique())

head_ents = geoTriples_df_formatted[0].unique()
tail_ents = geoTriples_df_formatted[2].unique()
entities = np.concatenate((head_ents, tail_ents))
uniqueEntities = len(np.unique(entities))
                     
print(f"Triples count: {triples_count}")
print(f"Unique entities: {uniqueEntities}")
print(f"Unique realtions: {uniqueRelations}")

Triples count: 893
Unique entities: 289
Unique realtions: 9


In [51]:
density(triples_count, uniqueEntities, uniqueRelations)

Entities density: 6.179930795847751
Relations density: 99.22222222222223


In [52]:
# Density of UMLS dataset used for KG-BERT paper
density(6529,135,46)

Entities density: 96.72592592592592
Relations density: 141.93478260869566


In [53]:
# Density of WN18RR dataset used for KG-BERT paper
density(93003,40943,11)

Entities density: 4.543047651613218
Relations density: 8454.818181818182


In [54]:
# Density of FB15k-237 dataset used for KG-BERT paper
density(310116,14541,237)

Entities density: 42.65401279141737
Relations density: 1308.506329113924


## 5) Split data to be evaluated into train, validation, test
Using ampligraph train_test_split_no_unseen functin

In [None]:
STOP
# This will build new train, valid and test set for modelling, overwriting the original

In [57]:
import ampligraph
from ampligraph.evaluation import train_test_split_no_unseen

test_valid, X_test = train_test_split_no_unseen(geoTriples_df_formatted.values, 179, seed=0) #Train/test split of 80/20 of total dataset

X_train, X_valid = train_test_split_no_unseen(test_valid, 71, seed=0) #Validation set 10% of training set

print('Total triples:', geoTriples_df_formatted.shape)
print('Size of train:', X_train.shape)
print('Size of valid:', X_valid.shape)
print('Size of test:', X_test.shape)

Total triples: (893, 3)
Size of train: (643, 3)
Size of valid: (71, 3)
Size of test: (179, 3)


In [58]:
X_train = pd.DataFrame(X_train)
X_valid = pd.DataFrame(X_valid)
X_test = pd.DataFrame(X_test)

In [59]:
#Create tsv files with tab seperation 
X_train.to_csv('geolit_data/train.tsv', sep='\t', index = False, header = False) 
X_valid.to_csv('geolit_data/dev.tsv', sep='\t', index = False, header = False) 
X_test.to_csv('geolit_data/test.tsv', sep='\t', index = False, header = False) 

## 6) Extract entities and relations for modelling

In [60]:
# Open train, valid and test datasets and extract unique entities list as text file

with open('geolit_data/train.tsv', 'r') as f, open('geolit_data/test.tsv', 'r') as f1, open('geolit_data/dev.tsv', 'r') as f2, open('geolit_data/entities.txt', 'w') as f3:
    lines = f.readlines() + f1.readlines() + f2.readlines()
    entities = set()
    for line in lines:
        line = line.strip()
        temp = line.split('\t')
        entities.add(temp[0])
        entities.add(temp[2])
    entities_str = '\n'.join(list(entities))
    f3.write(entities_str)

In [61]:
# Open train, valid and test datasets and extract unique relations list as text file

with open('geolit_data/train.tsv', 'r') as f, open('geolit_data/test.tsv', 'r') as f1, open('geolit_data/dev.tsv', 'r') as f2, open('geolit_data/relations.txt', 'w') as f3:
    relations = set()
    lines = f.readlines() + f1.readlines() + f2.readlines()
    for line in lines:
        line = line.strip()
        temp = line.split('\t')
        relations.add(temp[1])
    relations_str = '\n'.join(relations)
    f3.write(relations_str)

In [62]:
# Create a text file with entities as space separated text, as required by BERT

import re
with open('geolit_data/entities.txt', 'r') as f, open('geolit_data/entity2text.txt', 'w') as f1:
    lines = f.readlines()
    ent2texts = []
    for line in lines:
        line = line.strip()
        ent2texts.append(line + '\t' + line.replace('_', ' '))
    f1.write('\n'.join(ent2texts))

In [64]:
# Create a text file with relations as space separated text, as required by BERT
with open('geolit_data/relations.txt', 'r') as f, open('geolit_data/relation2text.txt', 'w') as f1:
    lines = f.readlines()
    relation_texts = []
    for line in lines:
        line = line.strip()
        #text = line.replace('/' , ' ')
        text = line.replace('_', ' ').strip()
        relation_texts.append(line + '\t' + text)
    f1.write('\n'.join(relation_texts))