In [1]:
from rdflib import Graph, Namespace, Literal, URIRef
from rdflib.namespace import RDF, RDFS, OWL, XSD

from helper_funcs import *
from constants import *

In [2]:
# Single in-memory RDF graph shared by the whole notebook; the helper functions below add their triples to it.
g = Graph()

In [3]:
# Load the ontology (RDF/XML serialization) into g.
# ontology_fp is not defined in this notebook — presumably it comes from the `constants` star import.
g.parse(ontology_fp, format="xml")

<Graph identifier=Nd986d8740c844731aff12092eed64236 (<class 'rdflib.graph.Graph'>)>

In [4]:
# The ontology IRI is the subject typed owl:Ontology; the ONTO and DATA namespaces are derived from it.
ont_iri = g.value(None, RDF.type, OWL.Ontology)
if ont_iri is None:
    # Fail here with a clear message instead of a TypeError on `ont_iri + "#"` in the next cell.
    raise ValueError(f"No owl:Ontology declaration found in {ontology_fp!r}")
print("\nOntology IRI:", ont_iri)


Ontology IRI: http://www.semanticweb.org/kaushalamancherla/ontologies/2025/4/social_KG


In [5]:
# Namespace for the ontology's own terms (classes and properties): "<ontology IRI>#".
ONTO = Namespace(ont_iri + "#")
g.bind("onto", ONTO)

In [6]:
# Namespace for the instance URIs minted by this notebook (see mint_uri): "<ontology IRI>/data/".
DATA = Namespace(ont_iri + "/data/")
g.bind("data", DATA)

In [7]:
# Sanity check: print the base URI behind every namespace used in this notebook.
# Underscore-prefixed loop names avoid clobbering anything brought in by the star imports.
for _label, _ns in (
    ("XSD base URI:", XSD),
    ("OWL base URI:", OWL),
    ("RDF base URI:", RDF),
    ("DATA base URI:", DATA),
    ("ONTO base URI:", ONTO),
):
    print(_label, str(_ns))

XSD base URI: http://www.w3.org/2001/XMLSchema#
OWL base URI: http://www.w3.org/2002/07/owl#
RDF base URI: http://www.w3.org/1999/02/22-rdf-syntax-ns#
DATA base URI: http://www.semanticweb.org/kaushalamancherla/ontologies/2025/4/social_KG/data/
ONTO base URI: http://www.semanticweb.org/kaushalamancherla/ontologies/2025/4/social_KG#


In [8]:
# Bind the usual prefixes for the standard vocabularies on the shared graph.
for _prefix, _vocab in (("rdf", RDF), ("rdfs", RDFS), ("owl", OWL), ("xsd", XSD)):
    g.bind(_prefix, _vocab)

In [9]:
# Raw OpenAlex dumps read from local JSON (helper and file paths presumably come from the star imports).
# As used further down: the author data is iterated with .items() and each value is an author metadata dict;
# the institution data is indexed by the short institution ID and each entry provides 'inst_type' and 'metadata'.
raw_openalex_author_data = read_json_from_local(person_raw_openalex_data_fp)
raw_openalex_inst_data = read_json_from_local(inst_raw_openalex_data_fp)

In [10]:
###### HELPER FUNCTIONS #######
def mint_uri(*segments: str) -> "URIRef":
    """
    Build an instance URI under the DATA namespace.

    segments: path components in order, e.g. ("person", "A123") or
              ("affiliation", person_id, institution_id, "2017-2019").
              Each one is converted with str() and they are joined with "/";
              a segment may itself already contain "/".

    Returns URIRef(DATA + "seg1/seg2/...").

    NOTE: segments are used verbatim (no percent-encoding), so callers must
    pass URI-safe values.
    """
    # str() lets callers pass numeric parts (e.g. a year) without a TypeError from join.
    path = "/".join(str(segment) for segment in segments)
    return URIRef(DATA + path)

In [11]:
def add_citation_and_work_properties(node_uri,node_id,counts_metadata,node_type):
    """
    Attach per-year citation and works counts to node_uri.

    node_uri:        graph node the yearly metadata nodes are linked from
    node_id:         identifier embedded in the minted URIs
    counts_metadata: iterable of records, each read through the keys
                     "year", "cited_by_count" and "works_count"
    node_type:       leading URI path part, e.g. "person" or "organization"

    For every record two nodes are minted,
    <node_type>/<node_id>/citationCountMetadata/<year> and
    <node_type>/<node_id>/worksCountMetadata/<year>; each gets ONTO.year
    (xsd:gYear) and ONTO.quantity (xsd:integer) and is linked from node_uri.
    """
    # (URI path part, node class, record key holding the count, predicate linking node_uri to the node)
    yearly_kinds = (
        ("citationCountMetadata", ONTO.CitationCountMetadata, "cited_by_count", ONTO.hasCitationData),
        ("worksCountMetadata", ONTO.WorksCountMetadata, "works_count", ONTO.hasWorksData),
    )

    for record in counts_metadata:
        year = record["year"]

        for path_part, node_class, count_key, link_predicate in yearly_kinds:
            year_node = mint_uri(f"{node_type}/{node_id}/{path_part}", str(year))

            g.add((year_node, RDF.type, node_class))
            g.add((year_node, ONTO.year, Literal(year, datatype=XSD.gYear)))
            g.add((year_node, ONTO.quantity, Literal(record[count_key], datatype=XSD.integer)))
            g.add((node_uri, link_predicate, year_node))

In [12]:
###### GLOBAL CACHE FOR INSTITUTION DATA ######
# Keyed by the short institution ID (last path segment of the OpenAlex URL); the value is the URI already
# minted for that institution. process_affiliations consults it so that each institution node and its
# metadata are added to the graph only once.
global_inst_cache = {} #id -> URI for institution

In [13]:
def process_institution(i_id,inst_uri,metadata):
    """
    Attach an institution's OpenAlex metadata to its graph node.

    i_id:     short institution ID, used when minting the yearly metadata URIs
    inst_uri: node of the institution; this function does not type it, the
              caller is expected to have done so
    metadata: institution record. "id", "display_name" and "counts_by_year"
              are required (KeyError otherwise); "cited_by_count",
              "grants_count", "summary_stats" and "roles" are optional.

    Missing (None) values are skipped, while legitimate zeros (e.g. an
    h_index of 0) are written.
    """
    def add_value(predicate, value, datatype):
        # Only an absent value is skipped; 0 / 0.0 is real data and must reach the graph.
        if value is not None:
            g.add((inst_uri, predicate, Literal(value, datatype=datatype)))

    # 1) identity
    # NOTE(review): the full OpenAlex URL is stored here, whereas parse_author_metadata stores only
    # the short ID for persons — confirm which form the ontology expects.
    g.add((inst_uri, ONTO.openalex_id, Literal(metadata["id"], datatype=XSD.string)))
    g.add((inst_uri, ONTO.name, Literal(metadata["display_name"])))

    # 2) flat counters
    add_value(ONTO.citedByCount, metadata.get("cited_by_count"), XSD.int)
    add_value(ONTO.grantsCount, metadata.get("grants_count"), XSD.int)

    # 3) summary_stats
    stats = metadata.get("summary_stats") or {}
    add_value(ONTO.twoYearMeanCitedness, stats.get("2yr_mean_citedness"), XSD.double)
    add_value(ONTO.hIndex, stats.get("h_index"), XSD.int)
    add_value(ONTO.i10Index, stats.get("i10_index"), XSD.int)

    # 4) role-based works counts; roles other than these three are ignored
    role_predicates = {
        "publisher": ONTO.publisher_works_count,
        "funder": ONTO.funder_works_count,
        "institution": ONTO.institution_works_count,
    }
    for role in metadata.get("roles") or []:
        predicate = role_predicates.get(role.get("role"))
        if predicate is not None:
            add_value(predicate, role.get("works_count"), XSD.int)

    # 5) yearly citation / works series
    # NOTE(review): these nodes are always minted under "organization/", even when
    # process_affiliations typed the institution as a Group — confirm that is intended.
    add_citation_and_work_properties(inst_uri, i_id, metadata['counts_by_year'], "organization")

In [14]:
def _contiguous_year_ranges(years):
    """Collapse years into sorted (start, end) runs of consecutive years.

    e.g. [2019, 2015, 2017, 2018] -> [(2015, 2015), (2017, 2019)]; empty input -> [].
    """
    ordered = sorted(set(years))
    if not ordered:
        return []

    ranges = []
    run_start = run_end = ordered[0]
    for year in ordered[1:]:
        if year == run_end + 1:
            # still contiguous
            run_end = year
        else:
            # gap: close the current run and start a new one
            ranges.append((run_start, run_end))
            run_start = run_end = year
    ranges.append((run_start, run_end))
    return ranges


def process_affiliations(person_uri,affiliations,isForLast):
    """
    Link a person to institutions, creating each institution node the first time it is seen.

    person_uri:   URI of the person node; its last path segment is reused in affiliation URIs
    affiliations: list of entries (None is treated as empty).
                  isForLast False: each entry is read as {"institution": {"id": ...}, "years": [...]}
                  isForLast True:  each entry is itself the institution dict {"id": ...}
    isForLast:    True  -> only (person, ONTO.lastKnownInstitution, institution) is added
                  False -> one ONTO.Affiliation node per run of consecutive years, carrying
                           affiliationStartYear / affiliationEndYear and pointing to the institution

    Raises KeyError if an institution ID is missing from raw_openalex_inst_data.
    """
    # Loop-invariant: last path segment of the person URI.
    person_local = str(person_uri).rsplit("/", 1)[-1]

    for affiliation in affiliations or []:
        institution = affiliation if isForLast else affiliation['institution']
        parsed_id = institution['id'].rsplit("/", 1)[-1]

        # 1. Add the institution node (and its metadata) only on first sight.
        inst_uri = global_inst_cache.get(parsed_id)
        if inst_uri is None:
            inst_record = raw_openalex_inst_data[parsed_id]
            is_org = inst_record['inst_type'] == ORG
            inst_uri = mint_uri("organization" if is_org else "group", parsed_id)

            g.add((inst_uri, RDF.type, ONTO.Organization if is_org else ONTO.Group))
            process_institution(parsed_id, inst_uri, inst_record['metadata'])

            global_inst_cache[parsed_id] = inst_uri

        # 2. Last-known institution: a plain edge without temporal information.
        if isForLast:
            g.add((person_uri, ONTO.lastKnownInstitution, inst_uri))
            continue

        # 3. One Affiliation node per contiguous run of years.
        #    No membership pre-checks: an rdflib graph is a set of triples, so re-adding is a no-op.
        for start_year, end_year in _contiguous_year_ranges(affiliation.get("years") or []):
            # URI-safe suffix, e.g. "2017-2019" or "2015"
            period = f"{start_year}-{end_year}" if start_year != end_year else f"{start_year}"
            aff_uri = mint_uri("affiliation", person_local, parsed_id, period)

            g.add((aff_uri, RDF.type, ONTO.Affiliation))
            g.add((aff_uri, ONTO.affiliationStartYear,
                   Literal(start_year, datatype=XSD.gYear)))
            g.add((aff_uri, ONTO.affiliationEndYear,
                   Literal(end_year,   datatype=XSD.gYear)))

            g.add((person_uri, ONTO.hasAffiliation, aff_uri))
            g.add((aff_uri, ONTO.affiliatedOrganization, inst_uri))

In [15]:
def parse_author_metadata(author,metadata_dict):
    """
    Create the Person node for one author record and everything attached to it.

    author:        key of the record in the raw author data (not used in the body)
    metadata_dict: author record; the keys 'id', 'display_name', 'works_count',
                   'cited_by_count', 'summary_stats', 'counts_by_year',
                   'affiliations' and 'last_known_institutions' are all read
                   directly, so a missing one raises KeyError

    Order of work: simple data properties -> optional summary statistics ->
    yearly citation/works nodes -> affiliation and last-known-institution links.
    """
    ####### 1. PERSON NODE AND ITS SIMPLE DATA PROPERTIES ######
    author_id = metadata_dict['id'].rsplit("/", 1)[-1]
    person_uri = mint_uri("person", author_id)
    g.add((person_uri, RDF.type, ONTO.Person))

    # Read the three required fields before writing any of them.
    name = metadata_dict['display_name']
    n_works = metadata_dict['works_count']
    n_citations = metadata_dict['cited_by_count']

    for predicate, literal in (
        (ONTO.openalex_id, Literal(author_id, datatype=XSD.string)),
        (ONTO.name, Literal(name)),
        (ONTO.worksCount, Literal(n_works, datatype=XSD.integer)),
        (ONTO.citedByCount, Literal(n_citations, datatype=XSD.integer)),
    ):
        g.add((person_uri, predicate, literal))

    ####### 2. SUMMARY STATISTICS (written only when the key is present) ######
    stats = metadata_dict["summary_stats"]
    stat_specs = (
        ("2yr_mean_citedness", ONTO["2yr_mean_citedness"], XSD.double),
        ("h_index", ONTO.h_index, XSD.integer),
        ("i10_index", ONTO.i10_index, XSD.integer),
    )
    for key, predicate, datatype in stat_specs:
        if key in stats:
            g.add((person_uri, predicate, Literal(stats[key], datatype=datatype)))

    ####### 3. YEARLY WORKS AND YEARLY CITATIONS ######
    add_citation_and_work_properties(person_uri, author_id, metadata_dict['counts_by_year'], "person")

    ####### 4. INSTITUTION AFFILIATIONS ######
    # process_affiliations creates an Organization/Group node the first time an institution is
    # seen (attaching its metadata once), then links the person to it: Affiliation nodes with
    # start/end years for past affiliations, and a plain lastKnownInstitution edge.
    # Institution-to-institution edges (e.g. subGroupOf) are not created here.
    past_affiliations = metadata_dict['affiliations']
    last_known = metadata_dict['last_known_institutions']

    process_affiliations(person_uri, past_affiliations, False)
    process_affiliations(person_uri, last_known, True)

In [17]:
##### LOAD THE WORKS DATA #####

In [18]:
def parse_works_metadata():
    

In [None]:
for author,metadata_dict in raw_openalex_author_data.items():
    parse_author_metadata(author,metadata_dict)
    parse_works_metadata(