In [55]:
import json
import re
import uuid
import os
import os.path
import pandas as pd

In [56]:
# Declarative mapping between LiPD JSON structures and ontology classes.
#
# Top-level keys are schema (class) names. Inside a schema:
#   * '@id'       : ordered list of template segments for the object's id.
#                   NOTE(review): the template interpreter is not in this cell;
#                   segments are presumably concatenated in list order.
#   * '@fromJson' / '@toJson' : names of object-level hook functions; the
#                   '@fromJson' names match functions defined in this notebook
#                   (e.g. wrapUncertainty, createProxySystem).
#   * any other key is a LiPD property name mapped to its details
#     ('name', 'schema', 'type', 'category', 'multiple', 'fromJson', 'toJson').
#   * 'alternates' lists extra keys under which expandSchema registers the
#     same property details.
#
# Every '@id' is a list: a set literal has no defined order, so the id
# segments could be joined in any order.
SCHEMA = {
    'Dataset': {
        '@id': ['{dataSetName}'],
        '@fromJson': ['addExtraDatasetProperties'],
        '@toJson': ['getVariableArchiveTypes'],
        'dataSetName': {'name': 'name', 'alternates': ['paleoArchiveName']},
        'originalDataURL': {'name': 'hasLink', 'alternates': ['dataURL']},
        'dataContributor': {
            'name': 'author',
            'schema': 'Person',
            'alternates': ['whoEnteredinDB', 'MetadataEnteredByWhom'],
            'fromJson': 'parsePerson'
        },
        'archiveType': {
            'name': 'proxyArchiveType',
            'alternates': ['archive', 'paleoDataArchive', 'paleoData_Archive']
        },
        'changelog': {'name': 'hasChangeLog', 'schema': 'ChangeLog'},
        'investigator': {
            'name': 'contributor',
            'schema': 'Person',
            'multiple': True,
            'fromJson': 'parsePersons'
        },
        'investigators': {
            'name': 'contributor',
            'schema': 'Person',
            'hack': True,
            'fromJson': 'parsePersonsString'
        },
        'funding': {'name': 'fundedBy', 'multiple': True, 'schema': 'Funding'},
        'pub': {'name': 'publishedIn', 'multiple': True, 'schema': 'Publication'},
        'geo': {
            'name': 'collectedFrom',
            'schema': 'Location',
            'fromJson': 'parseLocation',
            'toJson': 'locationToJson'
        },
        'paleoData': {
            'name': 'includesPaleoData',
            'multiple': True,
            'schema': 'PaleoData'
        },
        'chronData': {
            'name': 'includesChronData',
            'multiple': True,
            'schema': 'ChronData'
        },
        'googleSpreadSheetKey': {
            'name': 'hasSpreadsheetLink',
            'fromJson': 'getGoogleSpreadsheetURL',
            'toJson': 'getGoogleSpreadsheetKey'
        },
        'dataSetVersion': {'name': 'datasetVersion'}
    },
    'ChangeLog': {
        '@id': ['{@parent.@id}', '.ChangeLog.', '{@index}'],
        '@category': 'ChangeLog'
    },
    'Funding': {
        '@id': ['{fundingAgency|agency}', '.', '{fundingGrant|grant}'],
        'agency': {'name': 'fundingAgency', 'alternates': ['fundingAgency']},
        'grant': {
            'name': 'grantNumber',
            'multiple': True,
            'alternates': ['fundingGrant']
        },
        'country': {'name': 'fundingCountry', 'alternates': ['fundingCountry']}
    },
    'Publication': {
        # NOTE(review): every other template uses '{@index}'; confirm that
        # '{index}' (no '@') is intended here.
        '@id': [
            'Publication.',
            '{identifier.0.id|@parent.dataSetName}',
            '{index}'
        ],
        '@fromJson': ['setIdentifierProperties'],
        '@toJson': ['createPublicationIdentifier'],
        'title': {'name': 'title'},
        'year': {'name': 'publicationYear', 'alternates': ['pubYear']},
        'citation': {
            'name': 'citation',
            'type': 'string',
            'alternates': ['reference']
        },
        'link': {'name': 'hasLink', 'multiple': True},
        'author': {
            'name': 'author',
            'schema': 'Person',
            'multiple': True,
            'fromJson': 'parsePersons'
        },
        'authors': {
            'name': 'author',
            'schema': 'Person',
            'fromJson': 'parsePersonsString',
            'hack': True
        }
    },
    'PaleoData': {
        '@id': ['{@parent.dataSetName}', '.PaleoData', '{@index}'],
        'paleoDataName': {'name': 'name'},
        'measurementTable': {
            'name': 'foundInMeasurementTable',
            'multiple': True,
            'schema': 'DataTable'
        },
        'paleoModel': {
            'name': 'paleoModeledBy',
            'multiple': True,
            'schema': 'Model',
            'category': 'PaleoModel'
        }
    },
    'ChronData': {
        '@id': ['{@parent.dataSetName}', '.ChronData', '{@index}'],
        'measurementTable': {
            'name': 'foundInMeasurementTable',
            'multiple': True,
            'schema': 'DataTable'
        },
        'chronModel': {
            'name': 'chronModeledBy',
            'multiple': True,
            'schema': 'Model',
            'category': 'ChronModel'
        }
    },
    'Model': {
        '@id': ['{@parent.@id}', '.Model', '{@index}'],
        'method': {'name': 'hasCode', 'schema': 'SoftwareCode'},
        'summaryTable': {
            'name': 'foundInSummaryTable',
            'multiple': True,
            'schema': 'DataTable'
        },
        'ensembleTable': {
            'name': 'foundInEnsembleTable',
            'multiple': True,
            'schema': 'DataTable'
        },
        'distributionTable': {
            'name': 'foundInDistributionTable',
            'multiple': True,
            'schema': 'DataTable'
        }
    },
    'SoftwareCode': {
        '@id': ['{@parent.@id}', '.', '{name|software}'],
        'runCommand': {'name': 'hasExecutionCommand'},
        'runEnv': {'name': 'hasExecutionEnvironment'},
        'parameters': {'type': 'string'},
        'software': {'name': 'name'}
    },
    'DataTable': {
        '@id': ['{filename}', '_trunc(4)'],
        '@fromJson': ['setInterVariableLinks'],
        'filename': {'name': 'hasFileName', 'type': 'File'},
        'columns': {
            'name': 'includesVariable',
            'multiple': True,
            'schema': 'Variable'
        }
    },
    'Variable': {
        '@id': [
            '{foundInTable|@parent.@id}',
            '.',
            '{TSid|tsid}',
            '.',
            '{variableName|name}'
        ],
        '@fromJson': [
            'setVariableCategory',
            'wrapUncertainty',
            'createProxySystem',
            'addFoundInTable',
            'addVariableValues'
        ],
        '@toJson': [
            'setVariableType',
            'unwrapUncertainty',
            'extractFromProxySystem',
            'removeFoundInTable',
            'removeDepthProperty'
        ],
        'number': {'name': 'hasColumnNumber', 'type': 'integer'},
        'TSid': {'name': 'hasVariableID', 'alternates': ['tsid']},
        'variableName': {'name': 'name'},
        'units': {'name': 'hasUnits'},
        'measurementMethod': {'name': 'method'},
        'measurementStandard': {'name': 'standard'},
        'missingValue': {'name': 'hasMissingValue'},
        'hasMaxValue': {'type': 'float'},
        'hasMinValue': {'type': 'float'},
        'hasMeanValue': {'type': 'float'},
        'hasMedianValue': {'type': 'float'},
        'instrument': {
            'name': 'measuredBy',
            'type': 'Individual',
            'category': 'Instrument'
        },
        'calibration': {
            'name': 'calibratedVia',
            'schema': 'CalibrationModel',
            'multiple': True
        },
        'interpretation': {
            'name': 'interpretedAs',
            'schema': 'Interpretation',
            'category': 'Interpretation',
            'multiple': True
        },
        'hasResolution': {
            # The original listed 'alternates' twice; Python keeps only the
            # last value of a repeated key, so 'resolution' was lost. Both
            # values are merged here.
            'alternates': ['resolution', 'hasResolution'],
            'name': 'hasResolution',
            'category': 'Resolution',
            'schema': 'Resolution'
        },
        'inferredFrom': {'schema': 'Variable', 'category': 'MeasuredVariable'},
        'hasUncertainty': {'schema': 'Uncertainty', 'multiple': True},
        'useInGlobalTemperatureAnalysis': {
            'name': 'useInPAGES2kGlobalTemperatureAnalysis'
        },
        'hasValues': {'type': 'string'},
        'foundInTable': {'type': 'Individual'},
        'hasProxySystem': {'type': 'Individual'},
        'takenAtDepth': {'type': 'Individual'}
    },
    'ProxySystemModel': {
        '@id': ['{@parent.@id}', '.ProxySystemModel'],
        'method': {'name': 'hasCode', 'schema': 'SoftwareCode'}
    },
    'PhysicalSample': {
        'hasidentifier': {'name': 'hasIGSN'},
        'hasname': {'name': 'name'},
        'housedat': {'name': 'housedAt'}
    },
    'Resolution': {
        '@id': ['{@parent.@id}', '.Resolution'],
        'hasMaxValue': {'type': 'float'},
        'hasMinValue': {'type': 'float'},
        'hasMeanValue': {'type': 'float'},
        'hasMedianValue': {'type': 'float'},
        # Hooks currently disabled:
        #'@fromJson': ['valuesToString'],
        #'@toJson': ['valuesToArray']
    },
    'Location': {
        '@id': ['{@parent.dataSetName}', '.Location'],
        'siteName': {'name': 'name'},
        'coordinates': {'type': 'Geographic_coordinate'},
        'coordinatesFor': {'type': 'Individual'}
    },
    'Interpretation': {
        '@id': ['{@parent.@id}', '.Interpretation', '{@index}'],
        '@toJson': ['changeSeasonalityType'],
        'interpDirection': {
            'name': 'interpretationDirection',
            'alternates': ['dir', 'interpDir', 'interpdirection', 'direction']
        },
        'variable': {'name': 'name'},
        'variableDetail': {'name': 'detail', 'alternates': ['variabledetail']},
        'rank': {'name': 'hasRank'},
        'basis': {'name': 'relevantQuote'},
        'local': {'name': 'isLocal'}
    },
    'IsotopeInterpretation': {
        '@id': ['{@parent.@id}', '.IsotopeInterpretation', '{@index}'],
        '@fromJson': ['wrapIntegrationTime'],
        '@toJson': ['unwrapIntegrationTime'],
        'integrationTime': {
            'name': 'hasIntegrationTime',
            'type': 'Individual',
            'schema': 'IntegrationTime'
        },
        'independentVariable': {
            'name': 'hasIndependentVariable',
            'schema': 'IndependentVariable',
            'multiple': True
        }
    },
    'IntegrationTime': {
        '@fromJson': ['wrapUncertainty'],
        '@toJson': ['unwrapUncertainty'],
        'basis': {'name': 'relevantQuote'},
        'units': {'name': 'hasUnits'},
        'independentVariable': {
            'name': 'hasIndependentVariable',
            'schema': 'IndependentVariable',
            'multiple': True
        }
    },
    'IndependentVariable': {
        '@id': ['{@parent.@id}', '.', '{name}'],
        'basis': {'name': 'relevantQuote'},
        'direction': {
            'name': 'interpretationDirection',
            'alternates': ['dir', 'interpDir', 'interpDirection']
        },
        'mathematicalRelation': {'name': 'equation'},
        'rank': {'name': 'hasRank'}
    },
    'CalibrationModel': {
        '@id': ['{@parent.@id}', '.Calibration'],
        '@fromJson': ['wrapUncertainty'],
        '@toJson': ['unwrapUncertainty'],
        'reference': {'name': 'relevantQuote'}
    },
    'Person': {'@id': ['{name}']},
    'Uncertainty': {
        '@id': ['{@parent.@id}', '.Uncertainty', '{@index}']
    }
}

In [57]:
# Property names flagged as blacklisted; every key maps to the marker value 1.
# NOTE(review): the consumer is not in this cell — presumably these keys are
# skipped during conversion; confirm at the call site.
BLACKLIST = dict.fromkeys(
    (
        'metadataMD5',
        'paleoData_paleoDataMD5',
        'paleoData_paleoMeasurementTableMD5',
        'paleoDataMD5',
        'paleoMeasurementTableMD5',
        'tagMD5',
        'chronData_chronDataMD5',
        'chronData_chronMeasurementTableMD5',
        'chronDataMD5',
        'chronMeasurementTableMD5',
        'earliestSampleDate',
        'latestSampleDate',
        'inCompilation',
        'inCompilationBeta',
    ),
    1,
)

In [58]:
# Base IRIs used as string prefixes.
# NOTE(review): presumably ONTONS is the ontology (vocabulary) namespace and NS
# the namespace for generated LiPD resources — confirm against the code that
# uses them, which is outside this section.
ONTONS = "http://linked.earth/ontology#"
NS = "http://linked.earth/lipd#"

In [59]:
# Module-level accumulator of RDF triples. unrollValuesListToRDF appends
# [subject, predicate, object] lists of strings to it.
TRIPLES = []

In [60]:
def ucfirst(s):
    """Return ``s`` with its first character upper-cased; the rest is untouched.

    An empty string is returned unchanged (slicing avoids the IndexError that
    ``s[0]`` raises on empty input).
    """
    return s[:1].upper() + s[1:]

def lcfirst(s):
    """Return ``s`` with its first character lower-cased; the rest is untouched.

    An empty string is returned unchanged (slicing avoids the IndexError that
    ``s[0]`` raises on empty input).
    """
    return s[:1].lower() + s[1:]

In [61]:
def addExtraDatasetProperties(obj, objhash) :
    """Copy ``extra_<prop>`` request parameters onto ``obj`` as ``<prop>``.

    Existing properties of ``obj`` are never overwritten. The request mapping
    is a hard-coded empty dict, so as written the loop body never runs and
    ``obj`` comes back unchanged.

    Returns ``[obj, objhash, []]``.
    """
    # Stand-in for request parameters; always empty in this notebook.
    _REQUEST = {}
    for reqkey, reqval in _REQUEST.items():
        match = re.search(r"^extra_(.+)", reqkey)
        if match is None:
            continue
        extraprop = match.groups()[0]
        if extraprop not in obj:
            obj[extraprop] = reqval
    return [obj, objhash, []]

In [62]:
def parsePersonsString(authstring, parent = None) :
    """Parse a free-text author string into a list of ``{"name": ...}`` dicts.

    Handled layouts, checked in this order:

    * a list                      -> delegated to ``parsePersons``;
    * empty / ``None``            -> ``[]``;
    * ``"A; B; C"``               -> split on semicolons, each part passed to
                                     ``parsePerson``;
    * two or more commas          -> split on commas; a token without
                                     whitespace is treated as a last name and
                                     joined with the token after it
                                     (``"Smith, J."`` -> ``"J. Smith"``);
    * exactly one comma           -> ``"Last, First"`` -> ``"First Last"``;
    * anything else               -> the string itself as a single name.

    ``parent`` is accepted for hook-signature compatibility and is unused.
    """
    if isinstance(authstring, list):
        return parsePersons(authstring, None)

    authors = []
    if not authstring:
        return authors

    # Semicolon-separated list. The separator pattern had lost its ';', which
    # made it match the empty string and split the input into characters.
    if re.search(r"\s*;\s*", authstring):
        for auth in re.split(r"\s*;\s*", authstring.strip()):
            if auth:
                authors.append(parsePerson(auth))
        return authors

    # Comma-separated list of several people.
    if re.search(r".*,.*,.*", authstring):
        auths = [tok for tok in re.split(r"\s*,\s*", authstring.strip()) if tok]
        i = 0
        while i < len(auths):
            name = auths[i]
            # A bare token is a last name whose first name/initials follow;
            # only consume the next token if there is one.
            if not re.search(r"\s", name) and i + 1 < len(auths):
                i += 1
                name = auths[i] + " " + name
            authors.append({"name": name})
            i += 1
        return authors

    # Single "Last, First" person, or a plain name.
    m = re.search(r"(.+),(.+)", authstring)
    if m is not None:
        last, first = m.group(1).strip(), m.group(2).strip()
        authors.append({"name": " ".join(part for part in (first, last) if part)})
    else:
        authors.append({"name": authstring})
    return authors


def parsePerson(auth, parent = None) :
    """Normalise one author entry to ``{"name": "First Last"}``.

    ``auth`` is either a name string or a dict with a ``"name"`` key. A name
    written as ``"Last, First"`` is flipped to ``"First Last"``, with
    whitespace around the comma removed; any other name is returned as given.
    Non-string names (e.g. ``None``) are passed through untouched instead of
    crashing the regex. ``parent`` is unused.
    """
    authname = auth["name"] if isinstance(auth, dict) else auth
    if not isinstance(authname, str):
        return {"name": authname}

    m = re.search(r"(.+)\s*,\s*(.+)", authname)
    if m is None:
        return {"name": authname}

    # The greedy first group can end in whitespace ("Smith , John"), so both
    # parts are stripped before being recombined.
    last, first = (part.strip() for part in m.groups())
    return {"name": " ".join(part for part in (first, last) if part)}

    
def parsePersons(auths, parent = None) :
    """Run ``parsePerson`` over every entry of the list ``auths``.

    Returns the list of results, or ``None`` when ``auths`` is not exactly a
    ``list``. ``parent`` is forwarded unchanged to ``parsePerson``.
    """
    if type(auths) is not list:
        return None
    return [parsePerson(person, parent) for person in auths]


In [63]:
def parseLocation(geo, parent = None) :
    """Flatten a GeoJSON-like ``geo`` dict into a Location property dict.

    The result holds:

    * ``locationType``   - ``geo["type"]`` or ``None``;
    * ``coordinatesFor`` - ``parent["@id"]``;
    * when at least two coordinates exist: ``coordinates`` (``"lat,long"``),
      ``Wgs84:Lat``, ``Wgs84:Long``, optionally ``Wgs84:Alt``, and a
      ``Geo:HasGeometry`` object carrying a WKT point;
    * every entry of ``geo["properties"]``, copied last so it can override.

    Coordinates are read as ``[long, lat, (alt)]``. A missing or too-short
    ``geometry.coordinates`` is skipped; the original guard only checked for a
    non-empty list before reading index 1.

    Raises ``TypeError``/``KeyError`` if ``parent`` lacks an ``"@id"``, as
    before.
    """
    parentid = parent["@id"]
    ngeo = {
        "locationType": geo.get("type"),
        "coordinatesFor": parentid,
    }

    coords = (geo.get("geometry") or {}).get("coordinates")
    if coords and len(coords) > 1:
        longitude, latitude = coords[0], coords[1]
        ngeo["coordinates"] = str(latitude) + "," + str(longitude)
        ngeo["Wgs84:Lat"] = latitude
        ngeo["Wgs84:Long"] = longitude
        # FIXME: For now assuming points
        # NOTE(review): WKT normally orders a point as "x y" (long lat); the
        # existing "lat long" order is kept unchanged — confirm with consumers.
        wkt = "POINT(" + str(latitude) + " " + str(longitude)
        if len(coords) > 2:
            ngeo["Wgs84:Alt"] = coords[2]
            wkt += " " + str(coords[2])
        wkt += ")"
        ngeo["Geo:HasGeometry"] = {
            "@id": str(parentid) + ".Geometry",
            "@category": "Geo:Geometry",
            "Geo:AsWKT": wkt,
        }

    if "properties" in geo:
        for key, value in geo["properties"].items():
            ngeo[key] = value
    return ngeo


def locationToJson(geo, parent = None) :
    """Convert a flattened Location dict back into a GeoJSON-like dict.

    Result layout::

        {"type": <locationType, if present>,
         "geometry": {"type": "Point", "coordinates": [long, lat, (alt)]},
         "properties": {<all remaining plain properties>}}

    Coordinates come from the ``"coordinates"`` string (``"lat,long"``) and
    are then overridden by ``wgs84:Long`` / ``wgs84:Lat`` / ``wgs84:Alt``. The
    capitalised spelling ``Wgs84:`` written by ``parseLocation`` is accepted
    too. Keys starting with ``@``, ``coordinates``, ``coordinatesFor`` and any
    ``geo:`` / ``wgs84:`` key (either case) are left out of ``properties``.

    Fixes over the original: coordinates were assigned by index into an empty
    list (IndexError), and ``properties`` sat inside ``geometry`` as a list
    while being written as a top-level dict (KeyError). ``parent`` is unused.
    """
    def _wgs84(suffix):
        # Accept both lower-case and capitalised namespace prefixes.
        for key in ("wgs84:" + suffix, "Wgs84:" + suffix):
            if key in geo:
                return geo[key]
        return None

    longitude = latitude = altitude = None
    if "coordinates" in geo:
        latlong = geo["coordinates"].split(",")
        latitude = float(latlong[0])
        longitude = float(latlong[1])

    override = _wgs84("Long")
    if override is not None:
        longitude = float(override)
    override = _wgs84("Lat")
    if override is not None:
        latitude = float(override)
    override = _wgs84("Alt")
    if override is not None:
        altitude = float(override)

    geometry = {"coordinates": []}
    if longitude is not None or latitude is not None or altitude is not None:
        geometry["coordinates"] = [longitude, latitude]
        if altitude is not None:
            geometry["coordinates"].append(altitude)
    if longitude is not None and latitude is not None:
        geometry["type"] = "Point"

    geojson = {"geometry": geometry, "properties": {}}
    for prop, value in geo.items():
        if prop.startswith("@"):
            continue
        if prop == "locationType":
            geojson["type"] = value
        elif prop in ("coordinates", "coordinatesFor"):
            continue
        elif re.search(r"^(geo|wgs84):", prop, re.IGNORECASE):
            continue
        else:
            geojson["properties"][prop] = value
    return geojson

In [64]:
def getUncertainty(val, parent = None) :
    """Build an uncertainty dict in which every field carries ``val``.

    The same value is stored under ``hasValue``, ``analytical`` and
    ``reproducibility``. ``parent`` is unused.
    """
    return {
        "hasValue": val,
        "analytical": val,
        "reproducibility": val,
    }

In [65]:
def getGoogleSpreadsheetURL(key, parent = None) :
    """Return the Google Sheets URL for spreadsheet ``key``.

    ``key`` is converted with ``str`` and appended to the fixed URL prefix.
    ``parent`` is unused.
    """
    return f"https://docs.google.com/spreadsheets/d/{key!s}"

def getGoogleSpreadsheetKey(url:str, parent = None) :
    """Return ``url`` with the Google Sheets URL prefix removed.

    Every occurrence of the prefix is removed; a string without the prefix is
    returned unchanged. ``parent`` is unused.
    """
    prefix = "https://docs.google.com/spreadsheets/d/"
    return url.replace(prefix, "")

In [66]:
def getParentProperty(obj, prop) :
    """Return the nearest ancestor's value for ``prop``, or ``None``.

    Follows the ``"@parent"`` links starting at ``obj``'s parent (``obj``
    itself is not inspected) and returns the first ``prop`` found. The walk
    stops at the first object whose ``"@parent"`` is missing or falsy; a
    missing key no longer raises ``KeyError``.
    """
    parent = obj.get("@parent")
    while parent:
        if prop in parent:
            return parent[prop]
        parent = parent.get("@parent")
    return None

def getParentWithPropertyValue(obj, prop, val) :
    """Return the nearest ancestor whose ``prop`` equals ``val``, or ``None``.

    Follows the ``"@parent"`` links starting at ``obj``'s parent (``obj``
    itself is not inspected). The walk stops at the first object whose
    ``"@parent"`` is missing or falsy; a missing key no longer raises
    ``KeyError``.
    """
    parent = obj.get("@parent")
    while parent:
        if prop in parent and parent[prop] == val:
            return parent
        parent = parent.get("@parent")
    return None

In [67]:
def setIdentifierProperties(pub, objhash) :
    """Spread ``pub["identifier"]`` entries over typed publication properties.

    For each identifier dict:

    * ``type`` ``doi`` / ``issn`` / ``isbn`` appends its ``id`` to
      ``hasDOI`` / ``hasISSN`` / ``hasISBN``;
    * a ``url`` is appended to ``hasLink`` regardless of type.

    The ``identifier`` key is removed afterwards. Identifiers without a
    ``type`` or ``id`` are tolerated, where the original raised ``KeyError``.
    A single identifier given as a dict is treated as a one-element list, and
    a target property that already holds a scalar is turned into a list first.

    Returns ``[pub, objhash, []]``.
    """
    type_to_prop = {"doi": "hasDOI", "issn": "hasISSN", "isbn": "hasISBN"}

    def _append(prop, value):
        # Append to pub[prop], creating or list-wrapping the value as needed.
        existing = pub.get(prop)
        if existing is None:
            pub[prop] = [value]
        elif isinstance(existing, list):
            existing.append(value)
        else:
            pub[prop] = [existing, value]

    if "identifier" in pub:
        identifiers = pub["identifier"]
        if isinstance(identifiers, dict):
            identifiers = [identifiers]
        for identifier in identifiers or []:
            target = type_to_prop.get(identifier.get("type"))
            if target is not None and "id" in identifier:
                _append(target, identifier["id"])
            if "url" in identifier:
                _append("hasLink", identifier["url"])
        del pub["identifier"]

    return [pub, objhash, []]

In [68]:
def valuesToString(obj, objhash) :
    """Collapse a list under ``obj["values"]`` into a ``", "``-joined string.

    Each element is converted with ``str`` first, so numeric lists work;
    ``str.join`` alone raises ``TypeError`` on non-string items. A ``values``
    entry that is not a list, or no entry at all, is left untouched.

    Returns ``[obj, objhash, []]``.
    """
    values = obj.get("values")
    if type(values) is list:
        obj["values"] = ", ".join(str(item) for item in values)
    return [obj, objhash, []]

def camelCase(id) :
    """Join the whitespace-separated words of ``id``, capitalising each word.

    Only the first letter of each word is upper-cased (via ``ucfirst``); the
    rest of the word is kept as is. ``str.split()`` is used so leading,
    trailing or repeated whitespace never produces an empty word — the
    regex split did, and ``ucfirst("")`` then raised ``IndexError``.
    """
    return "".join(ucfirst(word) for word in id.split())

def unCamelCase(id) :
    """Turn a camel-case identifier into lower-case, space-separated words.

    A space is inserted at every lower-to-upper boundary (``marineSediment``)
    and before the last capital of an acronym followed by a word
    (``XMLFile`` -> ``xml file``); the result is then lower-cased.

    The original pattern had literal spaces around ``|`` and was compiled
    without ``re.VERBOSE``, so it could never match and no words were split.
    ``re.sub`` is used instead of ``re.split`` because splitting on
    zero-width matches only works on Python 3.7+.
    """
    boundary = r"(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])"
    return re.sub(boundary, " ", id).lower()

def fromCamelCase(str) :
    """Return ``ucfirst(str)``: the input with its first character upper-cased.

    Note that the parameter name shadows the builtin ``str`` inside this
    function. Despite the name, no camel-case splitting is done; an earlier
    underscore-inserting variant is kept below, commented out.
    """
    return ucfirst(str)
    # Disabled earlier variant (not valid Python as written):
    #return ucfirst(str.replace(r"([^A-Z])([A-Z])"", "$1_$2", str))

In [69]:
def setVariableCategory(obj, objhash) :
    """Set ``@category`` and ``@schema`` on a variable object.

    ``@schema`` is always ``"Variable"``. ``@category`` is chosen as follows:

    * ``variableType`` present -> ``ucfirst(variableType + "Variable")``, and
      the ``variableType`` key is removed;
    * else ``calibration`` present -> ``"InferredVariable"``;
    * otherwise -> ``"MeasuredVariable"``.

    Returns ``[obj, objhash, []]``.
    """
    if "variableType" in obj:
        category = ucfirst(str(obj.pop("variableType")) + "Variable")
    elif "calibration" in obj:
        category = "InferredVariable"
    else:
        category = "MeasuredVariable"
    obj["@category"] = category
    obj["@schema"] = "Variable"
    return [obj, objhash, []]

In [70]:
def getLiPDArchiveType(archiveType) :
    """Return ``unCamelCase(archiveType)``; a thin alias for that helper."""
    return unCamelCase(archiveType)

def getArchiveType(id, latitude) :
    """Map a LiPD archive name to an archive type name.

    Returns ``None`` for a falsy ``id``. The lower-cased name is mapped as:

    * ``tree``     -> ``Wood``
    * ``bivalve``  -> ``MolluskShell``
    * ``borehole`` -> ``GlacierIce`` when ``latitude`` is above 65 or below
      -65, otherwise ``Rock``
    * anything else -> ``camelCase`` of the lower-cased name.
    """
    if not id:
        return None

    archive = id.lower()
    if archive == "tree":
        return "Wood"
    if archive == "bivalve":
        return "MolluskShell"
    if archive == "borehole":
        is_polar = latitude > 65 or latitude < -65
        return "GlacierIce" if is_polar else "Rock"
    return camelCase(archive)

In [71]:
def guessSensorType(archive, observation, sensor) :
    """Guess the sensor category from archive type and proxy observation.

    * If ``sensor`` carries ``sensorGenus`` or ``sensorSpecies`` the sensor is
      biological: the result depends on ``archive`` only and defaults to
      ``"OrganicSensor"``.
    * Otherwise a specific ``(archive, observation)`` pair is tried first,
      then ``archive`` alone, and finally ``"InorganicSensor"``.

    Fixes over the original if/elif chain: the sediment branches assigned a
    local variable instead of returning, so the function returned ``None``
    for them; and ``("GlacierIce", "Melt")`` was placed after the generic
    ``"GlacierIce"`` branch and could never be reached.
    """
    organic_by_archive = {
        "MarineSediment": "Foraminifera",
        "Coral": "Polyp",
        "Wood": "Vegetation",
        "MolluskShell": "Bivalves",
        "Sclerosponge": "Sponge",
    }
    if ("sensorGenus" in sensor) or ("sensorSpecies" in sensor):
        return organic_by_archive.get(archive, "OrganicSensor")

    # Most specific rules first: archive and observation together.
    by_archive_and_observation = {
        ("MarineSediment", "Uk37"): "Coccolithophores",
        ("MarineSediment", "Alkenone"): "Coccolithophores",
        ("MarineSediment", "TEX86"): "Archea",
        ("MarineSediment", "D18O"): "Foraminifera",
        ("MarineSediment", "Mg/Ca"): "Foraminifera",
        ("LakeSediment", "Uk37"): "Coccolithophores",
        ("LakeSediment", "Alkenone"): "Coccolithophores",
        ("LakeSediment", "TEX86"): "Archea",
        ("LakeSediment", "Midge"): "Chironomids",
        ("LakeSediment", "BSi"): "Diatoms",
        ("LakeSediment", "Chironomid"): "Chironomids",
        ("LakeSediment", "Reflectance"): "PhotosyntheticAlgae",
        ("LakeSediment", "Pollen"): "Watershed",
        ("LakeSediment", "VarveThickness"): "Catchment",
        ("GlacierIce", "Melt"): "IceSurface",
    }
    # Fallback rules keyed on the archive alone.
    by_archive = {
        "Coral": "Polyp",
        "Wood": "Vegetation",
        "MolluskShell": "Bivalves",
        "Sclerosponge": "Sponge",
        "Speleothem": "Karst",
        "GlacierIce": "Snow",
        "Borehole": "Soil",
    }

    specific = by_archive_and_observation.get((archive, observation))
    if specific is not None:
        return specific
    return by_archive.get(archive, "InorganicSensor")

In [72]:
def getObservation(observation) :
    """Normalise a proxy observation name.

    ``None`` stays ``None``; ``"alkenone"`` in any letter case becomes
    ``"Uk37"``; every other name is passed through ``camelCase``.
    """
    if observation is None:
        return None
    is_alkenone = observation.lower() == "alkenone"
    return "Uk37" if is_alkenone else camelCase(observation)


In [73]:
def getVariableId(obj, parentid) :
    """Build a variable id of the form ``<parentid>.<tsid>.<variablename>``.

    Keys of ``obj`` are matched case-insensitively, so ``TSid``/``tsid`` and
    ``variableName``/``variablename`` are equivalent. Raises ``KeyError`` if
    either key is absent; ``tsid`` must be a string (it is concatenated
    directly), while the variable name is converted with ``str``.
    """
    lowered = {key.lower(): val for key, val in obj.items()}
    return parentid + "." + lowered["tsid"] + "." + str(lowered["variablename"])

In [74]:
def setInterVariableLinks(obj, objhash) :
    """Resolve cross-references between the columns of a data table.

    Works on ``obj["columns"]`` in place:

    * an ``inferredFrom`` value naming another column (case-insensitive match
      on ``variableName``) is replaced by that column's variable id;
    * if the table has a column named ``depth``, every other column gets
      ``takenAtDepth`` set to the depth column's id.

    Variable ids come from ``getVariableId(col, obj["@id"])``.

    Returns ``[obj, objhash, []]``.
    """
    # First pass: index the column ids by lower-cased variable name.
    name_to_id = {}
    for column in obj["columns"]:
        column_id = getVariableId(column, obj["@id"])
        name_to_id[column["variableName"].lower()] = column_id

    depth_id = name_to_id.get("depth")

    # Second pass: rewrite references now that every id is known.
    for column in obj["columns"]:
        column_id = getVariableId(column, obj["@id"])
        if "inferredFrom" in column:
            source_name = column["inferredFrom"].lower()
            if source_name in name_to_id:
                column["inferredFrom"] = name_to_id[source_name]
        if depth_id and column_id != depth_id:
            column["takenAtDepth"] = depth_id

    return [obj, objhash, []]

In [75]:
def removeDepthProperty(val, parent = None) :
    """Drop the ``takenAtDepth`` key from ``val`` (if present) and return ``val``.

    ``val`` is modified in place. ``parent`` is unused.
    """
    val.pop("takenAtDepth", None)
    return val

In [76]:
def createProxySystem(obj, hash) :
    """Derive sample, sensor and proxy-system entries for a measured variable.

    The proxy observation is read from ``proxy``, ``OnProxyObservationProperty``
    or ``ProxyObservationType`` (first one present; the first two are removed
    from ``obj``). If ``obj["@category"]`` is not ``"MeasuredVariable"``
    nothing else happens and ``[obj, hash, []]`` is returned.

    For a measured variable the function:

    1. resolves the archive type from the ancestors' ``archiveType`` /
       ``archive`` and the latitude found in the ancestors' ``geo``;
    2. registers a ``PhysicalSample`` in ``hash`` from ``obj["physicalSample"]``
       (or from an ancestor's ``collectionName``), then removes that key;
    3. registers a sensor in ``hash``, named after the genus/species when
       ``archiveGenus``/``sensorGenus`` are present, otherwise
       ``<observation>DefaultSensor``;
    4. registers a ``ProxySystem`` (and, when ``obj`` has ``proxySystemModel``,
       a ``ProxySystemModel`` linked through ``modeledBy``) in ``hash``;
    5. sets ``measuredOn``, ``ProxyObservationType`` and ``hasProxySystem`` on
       ``obj``.

    Existing ``hash`` entries with the same id are never overwritten.

    Fixes over the original: the proxy-model dict overwrote ``proxy`` and an
    undefined ``proxymodel`` was stored (NameError); a missing ``geo``
    ancestor crashed the latitude lookup; unused lookups were removed.

    Returns ``[obj, hash, [sampleid, proxyid, sensorid]]`` for measured
    variables (``sampleid`` may be ``None``).
    """
    # Deal with proxies
    proxyobs = None
    if "proxy" in obj:
        proxyobs = obj.pop("proxy")
    elif "OnProxyObservationProperty" in obj:
        proxyobs = obj.pop("OnProxyObservationProperty")
    elif "ProxyObservationType" in obj:
        # Left in place here; it is overwritten with the normalised value below.
        proxyobs = obj["ProxyObservationType"]

    if obj["@category"] != "MeasuredVariable":
        return [obj, hash, []]

    # Get the archive type. Coordinates are [long, lat, ...]; latitude
    # defaults to 0 when no usable geometry is found.
    geo = getParentProperty(obj, "geo")
    latitude = 0
    coords = ((geo or {}).get("geometry") or {}).get("coordinates") or []
    if len(coords) > 1:
        latitude = coords[1]

    archivetype = getParentProperty(obj, "archiveType")
    if not archivetype:
        archivetype = getParentProperty(obj, "archive")
    archivetype = getArchiveType(archivetype, latitude)

    # Create sample (archive)
    sampleid = None
    if "physicalSample" not in obj:
        cname = getParentProperty(obj, "collectionName")
        if cname:
            obj["physicalSample"] = {"name": cname}

    if "physicalSample" in obj:
        sample = obj["physicalSample"]
        sampleid = sample["hasname"] if ("hasname" in sample) else sample["name"]
        if "hasidentifier" in sample:
            sampleid += "." + str(sample["hasidentifier"])
        elif "identifier" in sample:
            sampleid += "." + str(sample["identifier"])
        if sampleid not in hash:
            sampleobj = {
                "@id": sampleid,
                "@category": "PhysicalSample",
                "@extracats": [archivetype]
            }
            # Sample properties are copied last and may override the defaults.
            sampleobj.update(sample)
            hash[sampleid] = sampleobj
        del obj["physicalSample"]

    observationid = getObservation(proxyobs)

    # Create sensor
    sensorid = (str(observationid) if observationid is not None else "") + "DefaultSensor"
    sensor = {
        "@id": sensorid,
        "@category": "Sensor"
    }
    # archive* keys are handled first; sensor* keys, if also present, win.
    for prefix in ("archive", "sensor"):
        genuskey = prefix + "Genus"
        specieskey = prefix + "Species"
        if genuskey in obj:
            sensor["sensorGenus"] = obj.pop(genuskey)
            sensorid = ucfirst(sensor["sensorGenus"].lower())
            if specieskey in obj:
                sensor["sensorSpecies"] = obj.pop(specieskey)
                sensorid += " " + sensor["sensorSpecies"].lower()

    if sensorid not in hash:
        sensor["@id"] = sensorid
        sensor["@category"] = guessSensorType(archivetype, observationid, sensor)
        hash[sensorid] = sensor

    # Create a proxy
    proxyid = "ProxySystem." + str(archivetype)
    if sensorid:
        proxyid += "." + str(sensorid)
    if observationid:
        proxyid += "." + str(observationid)

    # TODO: also append the chron model to proxyid
    # TODO: also append the paleo model to proxyid
    if proxyid not in hash:
        proxy = {
            "@id": proxyid,
            "@category": "ProxySystem",
            "ProxySensorType": sensorid,
            "ProxyArchiveType": archivetype,
            "ProxyObservationType": observationid
        }
        if "proxySystemModel" in obj:
            proxymodelid = str(proxyid) + ".Model"
            # TODO: Create proxy sensor/archive/observation models
            proxymodel = {
                "@id": proxymodelid,
                "@category": "ProxySystemModel",
                "name": observationid,
                "hasProxySensorModel": str(sensorid) + ".Model",
                "hasProxyArchiveModel": str(archivetype) + ".Model",
                "hasProxyObservationModel": str(observationid) + ".Model"
            }
            proxy["modeledBy"] = proxymodelid
            hash[proxymodelid] = proxymodel
            del obj["proxySystemModel"]
        hash[proxyid] = proxy

    obj["measuredOn"] = sampleid
    obj["ProxyObservationType"] = observationid
    obj["hasProxySystem"] = proxyid
    obj.pop("proxy", None)
    return [obj, hash, [sampleid, proxyid, sensorid]]

In [77]:
def wrapIntegrationTime(obj, objhash) :
    """Move ``integrationTime*`` properties of ``obj`` into a separate object.

    * a key equal to ``integrationTime`` (any letter case) becomes
      ``hasValue``;
    * a key ``integrationTime<Suffix>`` becomes ``<suffix>`` with its first
      letter lower-cased (e.g. ``integrationTimeUnits`` -> ``units``).

    Matching keys are removed from ``obj``. If any were found, a new object
    with id ``<obj @id>.IntegrationTime`` (category and schema
    ``IntegrationTime``) is stored in ``objhash`` and ``obj["integrationTime"]``
    is set to that id.

    Fixes over the original: keys were deleted while iterating the dict
    (RuntimeError), and the exact-match pattern ended in ``\\$`` — a literal
    dollar sign — so a bare ``integrationTime`` key was never recognised.

    Returns ``[obj, objhash, [new_id]]``, or ``[obj, objhash, []]`` when
    nothing matched.
    """
    objid = obj["@id"]
    # Deal with integrationTime
    pvals = {}
    # Iterate over a snapshot because matching keys are removed on the fly.
    for key, value in list(obj.items()):
        if re.search(r"^integrationTime$", key, re.IGNORECASE):
            pvals["hasValue"] = value
            del obj[key]
            continue
        m = re.search(r"^integrationTime(.+)", key)
        if m is not None:
            pvals[lcfirst(m.group(1))] = value
            del obj[key]

    if not pvals:
        return [obj, objhash, []]

    intimeid = objid + '.IntegrationTime'
    obj['integrationTime'] = intimeid
    intime = {
        '@id': intimeid,
        '@category': 'IntegrationTime',
        '@schema': 'IntegrationTime',
    }
    intime.update(pvals)
    objhash[intimeid] = intime
    return [obj, objhash, [intimeid]]


In [78]:
def wrapUncertainty(obj, objhash) :
    """Move ``uncertainty*`` properties of ``obj`` into a separate object.

    Keys are matched case-insensitively:

    * a key equal to ``uncertainty`` becomes ``hasValue``;
    * any other key starting with ``uncertainty`` is copied under its
      original name.

    Matching keys are removed from ``obj``. If any were found, a new object
    with id ``<obj @id>.Uncertainty`` (category ``Uncertainty``) is stored in
    ``objhash`` and ``obj["hasUncertainty"]`` is set to that id.

    Fix over the original: the exact-match pattern ended in ``\\$`` — a
    literal dollar sign — so a bare ``uncertainty`` value was stored under
    ``uncertainty`` instead of ``hasValue``.

    Returns ``[obj, objhash, [new_id]]``, or ``[obj, objhash, []]`` when
    nothing matched.
    """
    objid = obj["@id"]
    # Deal with uncertainty
    pvals = {}
    # Iterate over a snapshot of the keys because matches are removed.
    for key in list(obj):
        if re.search(r"^uncertainty$", key, re.IGNORECASE):
            pvals["hasValue"] = obj.pop(key)
        elif re.search(r"^uncertainty", key, re.IGNORECASE):
            pvals[key] = obj.pop(key)

    if not pvals:
        return [obj, objhash, []]

    uncid = str(objid) + ".Uncertainty"
    obj["hasUncertainty"] = uncid
    uncertainty = {
        "@id": uncid,
        "@category": "Uncertainty"
    }
    uncertainty.update(pvals)
    objhash[uncid] = uncertainty
    return [obj, objhash, [uncid]]

In [79]:
def addFoundInTable(obj, objhash) :
    """Record the id of the parent object on ``obj`` as ``foundInTable``.

    Returns ``[obj, objhash, []]``; no new objects are created.
    """
    parent_table = obj["@parent"]
    obj["foundInTable"] = parent_table["@id"]
    created_ids = []
    return [obj, objhash, created_ids]

In [80]:
# Unroll the list to a rdf first/rest structure
def unrollValuesListToRDF(lst: list, dtype):
    """Append an ``rdf:Seq`` blank node describing ``lst`` to ``TRIPLES``.

    One type triple is emitted for the blank node, followed by one
    ``rdf:_<n>`` membership triple (1-based) per item, each item being written
    as a literal typed with the XSD datatype named by ``dtype``.

    Returns the blank node id.
    """
    rdfns = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    xsdns = "http://www.w3.org/2001/XMLSchema#"
    bnodeid = "_:values" + uniqid()

    TRIPLES.append([bnodeid, f"<{rdfns}type>", f"<{rdfns}Seq>"])

    position = 0
    for item in lst:
        position += 1
        member_predicate = f"<{rdfns}_{position}>"
        typed_literal = f"\"{item}\"^^<{xsdns}{dtype}>"
        TRIPLES.append([bnodeid, member_predicate, typed_literal])
    return bnodeid

In [81]:
LIPD_CSVS = {}
def addVariableValues(obj, objhash) :
    """Attach the CSV column of a variable to ``obj`` as a JSON string.

    The CSV is looked up in ``LIPD_CSVS`` under ``<parent @id>.csv`` and the
    column is selected with ``int(obj["number"]) - 1``. When the CSV is not
    loaded, ``obj`` is left untouched.

    Returns ``[obj, objhash, []]``; no new objects are created.
    """
    table_csv = obj["@parent"]["@id"] + ".csv"
    column_index = int(obj["number"]) - 1

    if table_csv not in LIPD_CSVS:
        return [obj, objhash, []]

    column = LIPD_CSVS[table_csv][column_index]
    values = column.tolist()
    # Only needed by the disabled rdf:Seq path below.
    dtype = "float" if column.dtypes == "float64" else "string"
    # TODO: Dumping to json string for now.
    # rdf:Seq doesn't seem to be importing well in GraphDB
    #bnodeid = unrollValuesListToRDF(values, dtype)
    #obj["hasValues"] = bnodeid
    obj["hasValues"] = json.dumps(values)
    return [obj, objhash, []]

### Object json reverse conversion

In [82]:
def removeFoundInTable(var, parent = None) :
    """Drop the ``foundInTable`` entry from ``var`` if present and return ``var``.

    ``parent`` is accepted but not used.
    """
    if "foundInTable" not in var :
        return var
    del var["foundInTable"]
    return var

### Testing Lipd Json to Ontology

In [83]:
def expandSchema() :
    """Rebuild the module-level ``SCHEMA`` with core variants and alternate keys.

    For every schema ``key`` in ``SCHEMA``:
      * the property table is also registered under ``<key>_<WGCORE>``;
      * every property whose details list ``alternates`` is additionally
        registered under each alternate key, pointing at the same details.

    The result replaces the global ``SCHEMA``; nothing is returned.
    """
    # Without this declaration the assignment at the end makes SCHEMA a local
    # name and the read below fails with UnboundLocalError.
    global SCHEMA
    xschema = {}
    for key,props in SCHEMA.items() :
        # Add core schema too
        corekey = str(key) + "_" + str(WGCORE)
        # Build the expanded table in a copy: inserting alternates into the
        # very dict being iterated would raise RuntimeError.
        expanded = dict(props)
        for lipdkey,pdetails in props.items() :
            if not type(pdetails) is dict:
                continue
            if "alternates" in pdetails :
                for altkey in pdetails["alternates"]:
                    expanded[altkey] = pdetails
        # Both names share one property table.
        xschema[key] = expanded
        xschema[corekey] = expanded
    SCHEMA = xschema

In [84]:
def modifyStructureIfNeeded(obj, objhash, schema) :
    """Run the schema's ``@fromJson`` hooks on ``obj``.

    Each hook name is resolved through ``globals()`` and called as
    ``hook(obj, objhash)``; it returns the updated object, the updated hash and
    the ids of any objects it created. Every created object that is a dict with
    an ``@category`` is then processed recursively with the schema registered
    for that category (an empty schema if none is registered).

    Returns ``[obj, objhash]``.
    """
    if "@fromJson" not in schema :
        return [obj, objhash]

    for hook_name in schema["@fromJson"]:
        hook = globals()[hook_name]
        (obj, objhash, created_ids) = hook(obj, objhash)
        for created_id in created_ids :
            if created_id not in objhash :
                continue
            created = objhash[created_id]
            if not ((type(created) is dict) and ("@category" in created)) :
                continue
            created_category = created["@category"]
            created_schema = SCHEMA[created_category] if (created_category in SCHEMA) else {}
            processed = modifyStructureIfNeeded(created, objhash, created_schema)
            objhash[created_id] = processed[0]
            objhash = processed[1]

    return [obj, objhash]

In [85]:
import time, math, random

def uniqid(prefix='', more_entropy=False):
    """Return a time-based identifier, optionally prefixed.

    The identifier is the current time rendered as hex seconds followed by
    five hex digits of microseconds. With ``more_entropy`` a random number in
    ``[0, 10)`` with eight decimals is appended.
    """
    now = time.time()
    whole_seconds = math.floor(now)
    micros = math.floor(1000000 * (now - whole_seconds))

    if more_entropy:
        jitter = random.random()
        stamp = "%08x%05x%.8F" % (whole_seconds, micros, jitter * 10)
    else:
        stamp = '%8x%05x' % (whole_seconds, micros)

    return (prefix or '') + stamp

In [86]:
def getCompoundKeyId(compound_key, obj) :
    """Follow the key path ``compound_key`` through nested dicts in ``obj``.

    Returns the value reached, or ``None`` when a step is missing, when an
    intermediate value is not a dict, or when the value reached is itself a
    dict.
    """
    cursor = obj
    for part in compound_key :
        if type(cursor) is not dict or part not in cursor :
            return None
        cursor = cursor[part]

    return None if type(cursor) is dict else cursor

In [87]:
def getBindingKeyId(key, obj) :
    """Resolve a binding expression against ``obj``.

    ``key`` is a ``|``-separated list of alternatives, each a ``.``-separated
    path into ``obj``. The first alternative that resolves to a truthy value
    wins; when none does, a fresh ``uniqid()`` is returned.
    """
    for alternative in key.split("|") :
        resolved = getCompoundKeyId(alternative.split("."), obj)
        if resolved :
            return resolved
    return uniqid()

In [88]:
def getFunctionKeyId(fn, arg, curobjid) :
    """Apply an id-pattern function to the id built so far.

    ``trunc`` cuts ``int(arg)`` characters off the end of ``curobjid``;
    ``uniqid`` appends ``uniqid(arg)`` to it; any other function name returns
    ``curobjid`` unchanged.
    """
    if fn == "uniqid" :
        return str(curobjid) + uniqid(arg)
    if fn == "trunc" :
        keep = len(curobjid) - int(arg)
        return curobjid[0:keep]
    return curobjid

In [89]:
def createIdFromPattern(pattern, obj) :
    """Build an object id from the parts of an ``@id`` pattern.

    Each part is handled in order:
      * ``{binding}`` appends the value resolved by ``getBindingKeyId``;
      * ``_name(arg)`` replaces the id built so far with the result of
        ``getFunctionKeyId(name, arg, id)``;
      * anything else is appended literally.
    """
    objid = ""
    for part in pattern :
        binding = re.search(r"{(.+)}", part)
        if binding :
            objid += str(getBindingKeyId(binding.group(1), obj))
            continue

        call = re.search(r"_(.+)\((.*)\)", part)
        if call :
            fn, arg = call.groups()
            objid = str(getFunctionKeyId(fn, arg, objid))
            continue

        objid += str(part)
    return objid

In [90]:
def fixTitle(titleid) :
    """Replace every U+FFFD (Unicode replacement character) in ``titleid`` with ``_``.

    ``str.replace`` works on literal text, so the character itself is passed
    rather than a delimiter-wrapped regex such as ``@\\x{FFFD}@u``, which would
    never occur in an id and would make this function a no-op.
    """
    return titleid.replace("\ufffd", "_")

In [91]:
def getObjectId(obj, category, schema) :
    """Compute the id for ``obj``.

    The default id is ``Unknown.<uniqid(category)>`` for dicts, and the
    upper-cased-first value with spaces turned into underscores otherwise.
    When the schema defines an ``@id`` pattern, the id built from that pattern
    is used instead. The result is passed through ``fixTitle``.
    """
    default_id = ("Unknown." + uniqid(category)) if type(obj) is dict else ucfirst(obj).replace(" ", "_")
    if "@id" not in schema :
        return fixTitle(default_id)
    return fixTitle(createIdFromPattern(schema["@id"], obj))

In [92]:
def mapLipdJson(obj, parent, index, category, schemaname, hash) :
    """Flatten a nested LiPD JSON object into ``hash``, keyed by object id.

    Parameters:
        obj: the JSON value to map. Non-dict values are returned unchanged.
        parent: the enclosing object, stored on ``obj`` as ``@parent``.
        index: position of ``obj`` in its parent list, stored as ``@index``.
        category: default category for the flattened item; overridden by
            ``obj["@category"]`` when the structure hooks set one.
        schemaname: name of the ``SCHEMA`` entry describing ``obj``.
        hash: dict collecting every flattened item by id (mutated in place).

    Returns the id of the flattened item for dicts, or ``obj`` itself for
    non-dict values. If the id is already present in ``hash`` the existing id
    is returned and nothing is re-mapped.

    Nested dicts (and lists of them) are mapped recursively and replaced in
    the flattened item by their ids. Properties flagged ``subobject`` in the
    schema are collected under ``@subobjects``; keys starting with ``@`` and
    keys in ``BLACKLIST`` are skipped.
    """
    schema =  SCHEMA[schemaname] if (schemaname in SCHEMA) else {}
    # Register the (possibly empty) schema under its name.
    SCHEMA[schemaname] = schema

    if not type(obj) is dict:
        return obj

    obj["@parent"] = parent
    obj["@index"] = index
    obj["@schema"] = schemaname

    objid = getObjectId(obj, category, schema)
    if "@id" in obj :
        objid = obj["@id"]
    if objid in hash :
        return objid
    obj["@id"] = objid

    (obj, hash) = modifyStructureIfNeeded(obj, hash, schema)

    if "@category" in obj :
        category = obj["@category"]
    item = {
        "@id": objid,
        "@category" : category,
        "@schema" : schemaname
    }
    # Register before recursing so a nested reference to this id short-circuits.
    hash[objid] = item

    if type(obj) is dict :
        for propkey,value in obj.items() :
            if propkey[0] == "@" :
                continue

            if propkey in BLACKLIST :
                continue

            details = {}
            pname = propkey
            if propkey in schema :
                details = schema[propkey]
                pname =  details["name"] if ("name" in details) else propkey

            dtype =  details["type"] if ("type" in details) else None
            cat =  details["category"] if ("category" in details) else None
            sch =  details["schema"] if ("schema" in details) else None
            fromJson =  details["fromJson"] if ("fromJson" in details) else None
            subobject =  details["subobject"] if ("subobject" in details) else False
            if sch and not cat :
                cat = sch

            if fromJson :
                # Property-level hook: converts the raw value before mapping.
                value = globals()[fromJson](value, obj)
                if not value :
                    continue

                if pname :
                    if type(value) is list :
                        subindex = 1
                        for subvalue in value:
                            # Test each element, not the enclosing list: the
                            # list itself is never a dict, so checking it
                            # would discard every converted element.
                            if type(subvalue) is dict:
                                if propkey not in item:
                                    item[propkey] = []
                                item[propkey].append(mapLipdJson(subvalue, obj, subindex, cat, sch, hash))
                                subindex += 1
                    elif type(value) is dict:
                        item[propkey] = mapLipdJson(value, obj, None, cat, sch, hash)
                    else :
                        item[propkey] = value
                else :
                    # Unnamed property: merge the converted dict into the item.
                    if type(value) is dict:
                        for subpropkey,subvalue in value.items() :
                            item[subpropkey] = subvalue
                continue

            if not pname :
                continue

            if subobject :
                if "@subobjects" not in item:
                    item["@subobjects"] = []
                # Keyed by the LiPD property key, like every other entry of
                # the item (the previous code referenced an undefined name).
                item["@subobjects"].append({ propkey : value })
                continue

            if type(value) is list:
                subindex = 1
                for subvalue in value:
                    if propkey not in item:
                        item[propkey] = []
                    item[propkey].append(mapLipdJson(subvalue, obj, subindex, cat, sch, hash))
                    subindex += 1

            elif type(value) is dict:
                if propkey not in item:
                    item[propkey] = []
                item[propkey].append(mapLipdJson(value, obj, None, cat, sch, hash))

            else :
                item[propkey] = value
                if dtype == "Individual" and not (value in hash) :
                    # Referenced individual that has no object of its own yet:
                    # register a stub so it still gets a class assigned.
                    hash[value] = {
                        "@id" : value,
                        "@category" : cat,
                        "@schema" : sch
                    }

    hash[objid] = item
    return objid

In [93]:
def guessDataValueType(val) :
    """Guess a datatype name from the string form of a scalar value.

    Returns one of ``"float"`` (also used for integers), ``"date"``,
    ``"boolean"``, ``"url"`` or ``"string"`` (the fallback).
    """
    text = str(val)
    # (pattern, regex flags, resulting type) - checked in order, first hit wins.
    rules = (
        (r"^-?\d+$", 0, "float"),  # "integer"
        (r"^-?\d+\.\d+$", 0, "float"),
        (r"^[2][0-9]{3}[-][0-1][0-9][-][0-3][0-9]", 0, "date"),
        (r"^(true|false)$", re.IGNORECASE, "boolean"),
        (r"^http", 0, "url"),
        #(r"^.+@.+\..+", 0, "Email"),
    )
    for pattern, flags, typename in rules :
        if re.search(pattern, text, flags) :
            return typename

    # Quoted and unquoted text alike is a plain string.
    return "string"

In [94]:
def guessValueType(value) :
    """Guess the property type for ``value``.

    Falsy values are ``"string"``; a list takes the type of its first element;
    a dict is an ``"Individual"``; anything else is classified by
    ``guessDataValueType``.
    """
    if not value :
        return "string"
    if type(value) is list :
        # A truthy list has at least one element; only the first one decides.
        return guessValueType(value[0])
    if type(value) is dict :
        return "Individual"
    return guessDataValueType(value)

In [95]:
def getPropertyDetails(key, schema, value) :
    """Return the property description for ``key``, caching it in ``schema``.

    A description already marked ``@@processed`` is returned as is. Otherwise
    one is built from a default name (``fromCamelCase(key)``) overlaid with the
    schema entry for ``key``; a property that has a ``schema`` is typed
    ``Individual``, and a property without a type gets one guessed from
    ``value``. The result is flagged ``@@processed`` and stored back under
    ``schema[key]``.
    """
    details = { "name": fromCamelCase(key) }
    in_schema = key in schema
    if in_schema and ("@@processed" in schema[key]) :
        return schema[key]

    # Get details from schema
    if in_schema :
        details.update(schema[key].items())

    if "schema" in details :
        details["type"] = "Individual"

    # Result only used by the disabled ontology lookup below.
    pname = ucfirst(details["name"])

    # Get more details from the property definition (if it exists)
    # newname = resolveProperty(pname)
    # if (newname) :
    #     details["type"] = getOntPropertyRange(newname)
    #     details["name"] = newname

    if "type" not in details :
        details["type"] = guessValueType(value)

    details["@@processed"] = True
    schema[key] = details
    return details

In [96]:
def sanitizeId(id):
    """Replace every character other than letters, digits, ``-``, ``_`` and ``.`` with ``_``."""
    disallowed = re.compile(r"[^a-zA-Z0-9\-_\.]")
    return disallowed.sub("_", id)

In [97]:
# Create individual
def createIndividual(objid) :
    """Return the URI of an individual: the ``NS`` namespace plus the sanitized id."""
    local_name = sanitizeId(objid)
    return NS + local_name

In [98]:
# Create class
def createClass(category) :
    """Return the URI of a class: the ``ONTONS`` namespace plus the sanitized category."""
    local_name = sanitizeId(category)
    return ONTONS + local_name

In [99]:
# Create property
def createProperty(prop, dtype, cat, icon, multiple) :
    """Return ``[property URI, dtype]`` for ``prop``.

    The URI is the ``ONTONS`` namespace plus the sanitized property name with
    its first letter lower-cased. ``cat``, ``icon`` and ``multiple`` are
    accepted but not used.
    """
    property_uri = ONTONS + lcfirst(sanitizeId(prop))
    return [ property_uri, dtype ]

In [100]:
# Set individual classes
def setIndividualClasses(objid, category, extracats) :
    """Append ``rdf:type`` triples for an individual to ``TRIPLES``.

    ``category`` is used as a class URI as given; each entry of ``extracats``
    is turned into a class URI with ``createClass``. Nothing is emitted for a
    falsy ``objid``, ``category`` or extra category.
    """
    rdftype = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"

    def add_type(class_uri) :
        TRIPLES.append(["<"+objid+">", "<"+rdftype+">", "<"+class_uri+">"])

    if objid and category:
        add_type(category)

    for ecat in extracats:
        if not (objid and ecat):
            continue
        add_type(createClass(ecat))

In [101]:
def escape( str ):
    """Escape a string for use inside a quoted literal.

    ``&``, ``<`` and ``>`` become XML entities, double quotes are
    backslash-escaped, line breaks become spaces and a single trailing
    backslash is removed.
    """
    substitutions = (
        ("&", "&amp;"),
        ("<", "&lt;"),
        (">", "&gt;"),
        ("\"", "\\\""),
        ("\n", " "),
        ("\r", " "),
    )
    # Order matters: "&" must be replaced before the entities are introduced.
    for needle, replacement in substitutions:
        str = str.replace(needle, replacement)
    return re.sub(r"\\$", "", str)

In [102]:
# Set property value

# Matches "nan" / "na" only as a standalone token, i.e. not embedded in a
# longer word. Applied to the lower-cased string form of a value.
MISSING_VALUE_TOKEN = re.compile(r"(?<![a-z])(?:nan|na)(?![a-z])")

def setProperty( objid, prop, value ):
    """Append the triple(s) ``objid prop value`` to ``TRIPLES``.

    Parameters:
        objid: subject URI (without angle brackets).
        prop: ``(property URI, dtype)`` pair as returned by ``createProperty``.
        value: the value; a list emits one triple per element.

    Nothing is emitted when ``objid`` or ``value`` is falsy, or when the value
    contains a standalone ``nan`` / ``na`` token (case-insensitive).

    The value is written according to ``dtype``: ``Individual`` as a URI built
    by ``createIndividual``, ``List`` verbatim, anything else as a literal
    typed with the XSD datatype of the same name. ``boolean``, ``float`` and
    ``integer`` values are normalised first.
    """
    if type(value) is list :
        for subvalue in value :
            setProperty(objid, prop, subvalue)
        return

    (propid, dtype) = prop
    # NOTE(review): the truthiness test also skips 0, 0.0 and False - confirm
    # that dropping zero-valued measurements is intended.
    if not (objid and value):
        return

    # Skip missing-value markers. The token must not be surrounded by letters:
    # a pattern with optional delimiters would match any value that merely
    # contains "na" (e.g. "Fernando", "National") and silently drop it.
    if MISSING_VALUE_TOKEN.search(str(value).lower()):
        return

    if type(value) is str:
        value = escape(value)

    if dtype == "boolean":
        value = str(value).lower()
        if value != "true":
            value = "false"

    elif dtype == "float":
        # Keep the first number found in the value; default to 0.0.
        m = re.search(r"(\-?\d+\.?\d*)", str(value))
        value = m.group(1) if m else 0.0

    elif dtype == "integer":
        # Keep the first integer found in the value; default to 0.
        m = re.search(r"(\-?\d+)", str(value))
        value = m.group(1) if m else 0

    if dtype == "Individual":
        value = "<" + createIndividual(value) + ">"
    elif dtype != "List":
        value = '"' + str(value) + '"' + "^^<http://www.w3.org/2001/XMLSchema#" + dtype + ">"

    TRIPLES.append([
        "<"+objid+">",
        "<"+propid+">",
        value
    ])

In [103]:
# Set subobject propvals
def setSubobjects(objid, subobjid, subpropvals, schema) :
    """Emit the properties of a sub-object.

    ``subpropvals`` is a list of ``{key: value}`` dicts. Every key not starting
    with ``@`` is resolved with ``getPropertyDetails`` and set on the subject
    ``<objid>_<subobjid>`` through ``createProperty`` / ``setProperty``.
    Returns ``None``; nothing happens when ``subpropvals`` is empty.
    """
    if not subpropvals :
        return

    subobjectid = str(objid) + "_" + str(subobjid)
    for pval in subpropvals :
        for key,value in pval.items() :
            if key[0] == "@" :
                continue

            details = getPropertyDetails(key, schema, value)
            prop_name = details["name"]
            prop_type = details["type"]
            icon = details["icon"] if ("icon" in details) else None
            cat = details["category"] if ("category" in details) else None
            multiple = details["multiple"] if ("multiple" in details) else False

            # Create & set Property
            propDI = createProperty(prop_name, prop_type, cat, icon, multiple)
            setProperty(subobjectid, propDI, value)

    return

In [104]:
def createIndividualFull(obj) :
    """Emit the type triples and property triples for one flattened object.

    The object's ``@category`` becomes its class, ``@extracats`` (if any) add
    further classes, and every key not starting with ``@`` is emitted as a
    property using the description from ``getPropertyDetails``. Properties
    typed ``File`` are skipped unless their value is a dict.
    """
    category = obj["@category"]
    extracats = obj["@extracats"] if ("@extracats" in obj) else {}
    schemaname = obj["@schema"] if ("@schema" in obj) else category
    schema = SCHEMA[schemaname] if (schemaname in SCHEMA) else {}
    objid = obj["@id"]
    if not objid :
        return

    # Create category
    if category :
        category = createClass(category)

    objid = createIndividual(objid)

    # Set Individual classes
    setIndividualClasses(objid, category, extracats)

    for key,value in obj.items() :
        if key[0] == "@" :
            continue

        details = getPropertyDetails(key, schema, value)
        prop = details["name"]
        dtype = details["type"]
        icon = details["icon"] if ("icon" in details) else None
        cat = details["category"] if ("category" in details) else None
        sch = details["schema"] if ("schema" in details) else None
        if sch and not cat :
            cat = sch
        multiple = details["multiple"] if ("multiple" in details) else False

        if not prop :
            continue

        # Create Property
        propDI = createProperty(prop, dtype, cat, icon, multiple)

        # Set property value
        links_object = (dtype == "Individual") or (type(value) is dict)
        if dtype == "File" and not links_object :
            # File upload is disabled. Enable this ?
            #   fileid = uploadFile(value)
            #   if (fileid) :
            #       protectIndividual(fileid)
            #       data = setProperty(data, propDI, fileid)
            continue

        setProperty(objid, propDI, value)


In [105]:
def find_files_with_extension(directory, extension):
    """Recursively yield ``(path, name)`` for files ending in ``.<extension>``.

    Parameters:
        directory: directory to scan; sub-directories are scanned too.
        extension: file extension without the leading dot, matched literally
            (it is not interpreted as a regular expression).

    A directory that is missing or cannot be read is reported with ``print``
    and skipped; no exception is propagated for it.
    """
    suffix = '.' + extension
    try:
        # The context manager closes the scandir iterator even when the
        # consumer stops early.
        with os.scandir(directory) as entries:
            for entry in entries:
                if entry.is_file() and entry.path.endswith(suffix):
                    yield entry.path, entry.name
                elif entry.is_dir():   # if its a directory, then repeat process as a nested function
                    yield from find_files_with_extension(entry.path, extension)
    # FileNotFoundError is a subclass of OSError, so it has to be handled
    # first or its branch can never run.
    except FileNotFoundError as fnf:
        print(directory +' not found ', fnf)
    except OSError as ose:
        print('Cannot access ' + directory +'. Probably a permissions error ', ose)

In [106]:
TRIPLES = []
def convertLipdJsonToRDF(jsonpath, rdfpath, url):
    """Convert one LiPD JSON file into an N-Triples file.

    Parameters:
        jsonpath: path of the LiPD JSON(-LD) file to read.
        rdfpath: path of the N-Triples file to write (overwritten).
        url: value stored on the dataset as ``hasUrl``.

    The module-level ``TRIPLES`` list is cleared, filled while the flattened
    objects are emitted, and then written out one triple per line.
    """
    TRIPLES.clear()
    objhash = {}

    # Read with an explicit encoding rather than the platform default, and
    # release the input file before the conversion starts.
    with open(jsonpath, encoding="utf-8") as jsonfile:
        obj = json.load(jsonfile)
    obj["hasUrl"] = url

    mapLipdJson(obj, None, None, "Dataset", "Dataset", objhash)

    for item in objhash.values():
        createIndividualFull(item)

    with open(rdfpath, "w", encoding="utf-8") as rdffile:
        for triple in TRIPLES:
            rdffile.write(" ".join(triple) + " .\n")
        

In [107]:
# Root directory scanned for .jsonld files by the conversion loop.
lipd_unzip_dir = "../data/unzipped"
# Root directory for the generated .nt files (one sub-directory per category).
rdfdir = "../data/rdf"

In [108]:
# Convert every .jsonld file found under lipd_unzip_dir into an N-Triples file.
jsons = find_files_with_extension(lipd_unzip_dir, 'jsonld')

for jsonpath, _ in jsons:
    jsondir = os.path.dirname(jsonpath)
    csvs = find_files_with_extension(jsondir, 'csv')
    # Rebind the module-level CSV cache read by addVariableValues so it only
    # holds the tables found alongside the current file.
    LIPD_CSVS = {}
    for csvpath, _ in csvs:
        csvname = os.path.basename(csvpath)        
        LIPD_CSVS[csvname] = pd.read_csv(csvpath, header=None)

    # The dataset directory is two levels above the directory holding the
    # json file; its parent directory gives the category name.
    lipddir = os.path.dirname(os.path.dirname(jsondir))
    lipdname = os.path.basename(lipddir)
    catdir = os.path.dirname(lipddir)
    catname = os.path.basename(catdir)
    rdfcatdir = os.path.join(rdfdir, catname)
    if not os.path.exists(rdfcatdir):
        os.makedirs(rdfcatdir)
    rdfpath = os.path.join(rdfcatdir, lipdname+".nt")
    url = "https://data.mint.isi.edu/files/lipd/" + catname + "/" + lipdname

    # NS is a module-level global read by createIndividual; it must be set
    # before the conversion call below.
    NS = "http://linked.earth/lipd/" + catname + "#"    
    convertLipdJsonToRDF(jsonpath, rdfpath, url)
    
    # Progress marker: one dot per converted file.
    print(".", flush=True, end='')



........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................