In [1]:
import pandas as pd, pathlib, re, math, pathlib
from slugify import slugify
from rdflib import Graph, Namespace, URIRef, BNode, Literal
from rdflib.namespace import RDF, RDFS, XSD


In [2]:
# Directory holding the CSV exports read by the cells below.
DATA = pathlib.Path("data")

# PUB supplies the properties and classes used as vocabulary terms;
# RES is the base under which individual resources are minted.
PUB = Namespace("http://www.example.org/publication#")
RES = Namespace("http://www.example.org/resource/")

# The single graph that every later cell adds triples to.
g = Graph()
g.bind("pub", PUB)


# IRI builders. Written as named functions rather than lambdas bound to
# names (PEP 8), so they carry docstrings and readable tracebacks.
# Names and parameters are unchanged.

def aid(x):
    """Return the author IRI for identifier ``x`` (stringified, whitespace-stripped)."""
    return RES[f"author/{str(x).strip()}"]


def pid(x):
    """Return the paper IRI for identifier ``x`` (stringified, whitespace-stripped)."""
    return RES[f"paper/{str(x).strip()}"]


def kid(x):
    """Return the keyword IRI built from the slug of ``x``."""
    return RES[f"keyword/{slugify(str(x))}"]


def jid(x):
    """Return the journal IRI built from the slug of the journal name ``x``."""
    return RES[f"journal/{slugify(str(x))}"]


def vid(j, v):
    """Return the IRI of volume ``v`` of journal ``j``.

    ``v`` is passed through ``int`` so that a value such as ``12.0`` yields
    ``.../vol/12``; a value that ``int`` cannot convert raises.
    """
    return RES[f"journal/{slugify(str(j))}/vol/{int(v)}"]


def eid(n):
    """Return the event IRI built from the slug of the event name ``n``."""
    return RES[f"event/{slugify(str(n))}"]


def proc(i):
    """Return the proceeding IRI for identifier ``i`` (stringified, whitespace-stripped)."""
    return RES[f"proceeding/{str(i).strip()}"]


In [3]:
def add_triple(s, p, o):
    """Add the triple ``(s, p, o)`` to the module-level graph ``g`` if it is not already present."""
    triple = (s, p, o)
    if triple in g:
        return
    g.add(triple)

In [4]:
# Author nodes: attach pub:name and pub:email to each author resource.
for _, r in pd.read_csv(DATA/"author_nodes.csv").iterrows():
    a = aid(r["authorId"])
    # Index with r["name"], not r.name: on the row Series produced by
    # iterrows(), the attribute `.name` is the row's index label, so attribute
    # access would emit "0", "1", ... instead of the value in the "name" column.
    if pd.notna(r["name"]):
        add_triple(a, PUB.name, Literal(str(r["name"])))
    # Skip missing e-mails; otherwise a float NaN would be handed to Literal.
    if pd.notna(r["email"]):
        add_triple(a, PUB.email, Literal(r["email"]))

In [5]:
# Paper nodes: attach the datatype properties of each paper.
# Every property is emitted only when its cell is present. A missing cell
# would otherwise either crash int(...), produce the string "nan" (doi), or
# pass a float NaN to Literal.
for _, r in pd.read_csv(DATA/"paper_nodes.csv").iterrows():
    p = pid(r["paperId"])
    if pd.notna(r["title"]):
        add_triple(p, PUB.title, Literal(r["title"]))
    if pd.notna(r["abstract"]):
        add_triple(p, PUB.abstractText, Literal(r["abstract"]))
    if pd.notna(r["pages"]):
        add_triple(p, PUB.pages, Literal(int(r["pages"])))
    if pd.notna(r["doi"]):
        add_triple(p, PUB.doi, Literal(str(r["doi"])))
    if pd.notna(r["url"]):
        add_triple(p, PUB.url, Literal(r["url"], datatype=XSD.anyURI))
    if pd.notna(r["citationCount"]):
        add_triple(p, PUB.citationCount, Literal(int(r["citationCount"])))

In [6]:
# Keyword nodes: each keyword becomes a resource labelled with its original text.
keyword_frame = pd.read_csv(DATA / "keyword_nodes.csv")
keywords = keyword_frame["keyword"].tolist()

for label in keywords:
    keyword_node = kid(label)
    add_triple(keyword_node, RDFS.label, Literal(label))

In [7]:
# Paper -> keyword links (pub:isAbout), one triple per row of the join table.
paper_keyword_links = pd.read_csv(DATA/"paper_has_keyword.csv")
for _, link in paper_keyword_links.iterrows():
    paper = pid(link["paperId"])
    keyword = kid(link["keyword"])
    add_triple(paper, PUB.isAbout, keyword)


In [8]:
# Citation links (pub:cites) from the source paper to the target paper.
citations = pd.read_csv(DATA/"paper_cites_paper.csv")
for _, citation in citations.iterrows():
    citing = pid(citation["sourcePaperId"])
    cited = pid(citation["targetPaperId"])
    add_triple(citing, PUB.cites, cited)

In [9]:
# Authorship links (pub:writes), plus pub:isCorrespondingAuthor when the
# corresponding_author cell reads "true" (case-insensitive, whitespace ignored).
authorship = pd.read_csv(DATA/"author_writes_paper.csv")
for _, row in authorship.iterrows():
    author = aid(row["authorId"])
    paper = pid(row["paperId"])
    add_triple(author, PUB.writes, paper)

    flag = str(row["corresponding_author"]).strip().lower()
    if flag == "true":
        add_triple(author, PUB.isCorrespondingAuthor, paper)

In [10]:
# Review links (pub:reviews) from the reviewing author to the reviewed paper.
reviews = pd.read_csv(DATA/"author_reviews_paper.csv")
for _, row in reviews.iterrows():
    reviewer = aid(row["authorId"])
    paper = pid(row["paperId"])
    add_triple(reviewer, PUB.reviews, paper)

In [11]:
# Journal publications: tie each paper to a journal volume and each volume to
# its journal, recording the volume's year when one can be parsed.
publications = pd.read_csv(DATA / "paper_published_in.csv")
for _, row in publications.iterrows():
    journal = jid(row["journalName"])
    volume = vid(row["journalName"], row["volume"])
    paper = pid(row["paperId"])

    add_triple(volume, PUB.belongsTo, journal)

    # Values that cannot be parsed as numbers coerce to NaN and are skipped.
    year = pd.to_numeric(row["year"], errors="coerce")
    if not pd.isna(year):
        year_literal = Literal(int(year), datatype=XSD.gYear)
        add_triple(volume, PUB.volumeYear, year_literal)

    add_triple(paper, PUB.publishedIn, volume)

In [12]:
# Event nodes: every row names the event a proceeding belongs to. The event is
# typed pub:Workshop when its name contains "workshop" (any case), otherwise
# pub:Conference. proc2event records proceedingId -> event IRI for later lookup.
part_df = pd.read_csv(DATA/"proceeding_part_of.csv")
proc2event = {}
for _, row in part_df.iterrows():
    event_name = row["conferenceName"].strip()
    if re.search(r"workshop", event_name, re.I):
        event_type = PUB.Workshop
    else:
        event_type = PUB.Conference
    event = eid(event_name)
    proc2event[row["proceedingId"]] = event
    add_triple(event, RDF.type, event_type)
    add_triple(event, RDFS.label, Literal(event_name))

In [13]:
# Proceeding nodes: attach year and city, then link each proceeding to its
# event through pub:isPartOf.
for _, r in pd.read_csv(DATA/"proceedings_nodes.csv").iterrows():
    prc = proc(r.proceedingId)

    # Coerce rather than call float() directly: a non-numeric year now yields
    # NaN and is skipped instead of raising ValueError and aborting the load.
    year_val = pd.to_numeric(str(r.year).strip(), errors="coerce")
    if pd.notna(year_val):
        add_triple(prc, PUB.yearLiteral, Literal(int(year_val), datatype=XSD.gYear))

    # str() turns a missing cell into the text 'nan', hence the explicit check.
    city = str(r.city).strip()
    if city and city != 'nan':
        add_triple(prc, PUB.cityLiteral, Literal(city))

    ev = proc2event.get(r.proceedingId)
    if ev is None:
        # No entry recorded in proc2event: create the event from this row.
        # NOTE(review): assumes proceedings_nodes.csv has a conferenceName
        # column holding a string - confirm against the data.
        name = r.conferenceName.strip()
        ev_class = PUB.Workshop if re.search(r"workshop", name, re.I) else PUB.Conference
        ev = eid(name)
        add_triple(ev, RDF.type, ev_class)
        add_triple(ev, RDFS.label, Literal(name))
    add_triple(prc, PUB.isPartOf, ev)

In [14]:
# Presentation links (pub:presentedIn) from a paper to the proceeding it appeared in.
presentations = pd.read_csv(DATA/"paper_presented_in.csv")
for _, row in presentations.iterrows():
    paper = pid(row["paperId"])
    proceeding = proc(row["proceedingId"])
    add_triple(paper, PUB.presentedIn, proceeding)


In [15]:
# Write the populated graph as Turtle (path is relative to the working
# directory) and report where it went.
out = pathlib.Path("publication_ABOX.ttl")
g.serialize(destination=out, format="turtle")
print("A-Box written to", out)

A-Box written to publication_ABOX.ttl
