In [6]:
import pandas as pd, pathlib, re, math, pathlib
from slugify import slugify
from rdflib import Graph, Namespace, URIRef, BNode, Literal
from rdflib.namespace import RDF, RDFS, XSD


In [7]:
DATA = pathlib.Path("data")
PUB = Namespace("http://www.example.org/publication#")
RES = Namespace("http://www.example.org/resource/")
g   = Graph()
g.bind("pub", PUB)


aid  = lambda x: RES[f"author/{str(x).strip()}"]
pid  = lambda x: RES[f"paper/{str(x).strip()}"]
kid  = lambda x: RES[f"keyword/{slugify(str(x))}"]
jid  = lambda x: RES[f"journal/{slugify(str(x))}"]
vid  = lambda j,v: RES[f"journal/{slugify(str(j))}/vol/{int(v)}"]
eid  = lambda n: RES[f"event/{slugify(str(n))}"]
proc = lambda i: RES[f"proceeding/{str(i).strip()}"]


In [None]:
for _,r in pd.read_csv(DATA/"author_nodes.csv").iterrows():
    a = aid(r.authorId)
    g.add((a,RDF.type,PUB.Author))
    g.add((a, PUB.name,  Literal(str(r.name))))
    g.add((a, PUB.email, Literal(r.email)))

In [None]:
for _,r in pd.read_csv(DATA/"paper_nodes.csv").iterrows():
    p = pid(r.paperId)
    g.add((p,RDF.type,PUB.Paper))
    g.add((p,PUB.title,Literal(r.title)))
    g.add((p, PUB.abstractText,Literal(r.abstract)))
    g.add((p,PUB.pages,Literal(int(r.pages))))
    g.add((p,PUB.doi,Literal(str(r.doi))))
    g.add((p,PUB.url,Literal(r.url,datatype=XSD.anyURI)))
    g.add((p,PUB.citationCount,Literal(int(r.citationCount))))

In [None]:
pd.read_csv(DATA/"keyword_nodes.csv").keyword.apply(
    lambda kw: (g.add((kid(kw),RDF.type,PUB.Keyword)),
                g.add((kid(kw),RDFS.label,Literal(kw))))
)

0     (((http://www.example.org/resource/paper/35db1...
1     (((http://www.example.org/resource/paper/35db1...
2     (((http://www.example.org/resource/paper/35db1...
3     (((http://www.example.org/resource/paper/35db1...
4     (((http://www.example.org/resource/paper/35db1...
5     (((http://www.example.org/resource/paper/35db1...
6     (((http://www.example.org/resource/paper/35db1...
7     (((http://www.example.org/resource/paper/35db1...
8     (((http://www.example.org/resource/paper/35db1...
9     (((http://www.example.org/resource/paper/35db1...
10    (((http://www.example.org/resource/paper/35db1...
11    (((http://www.example.org/resource/paper/35db1...
12    (((http://www.example.org/resource/paper/35db1...
13    (((http://www.example.org/resource/paper/35db1...
14    (((http://www.example.org/resource/paper/35db1...
15    (((http://www.example.org/resource/paper/35db1...
16    (((http://www.example.org/resource/paper/35db1...
17    (((http://www.example.org/resource/paper/3

In [None]:
for _,r in pd.read_csv(DATA/"paper_has_keyword.csv").iterrows():
    g.add((pid(r.paperId),PUB.isAbout,kid(r.keyword)))


In [None]:
for _,r in pd.read_csv(DATA/"paper_cites_paper.csv").iterrows():
    g.add((pid(r.sourcePaperId),PUB.cites,pid(r.targetPaperId)))

In [9]:
for _,r in pd.read_csv(DATA/"author_writes_paper.csv").iterrows():
    a,p = aid(r.authorId), pid(r.paperId)
    g.add((a,PUB.writes,p))
    if str(r.corresponding_author).strip().lower()=="true":
        g.add((a,PUB.isCorrespondingAuthor,p))

In [10]:
for _,r in pd.read_csv(DATA/"author_reviews_paper.csv").iterrows():
    a,p = aid(r.authorId), pid(r.paperId)
    g.add((a,RDF.type,PUB.Reviewer))
    g.add((a,PUB.reviews,p))

In [None]:
for _, r in pd.read_csv(DATA / "paper_published_in.csv").iterrows():
    j   = jid(r.journalName)
    vol = vid(r.journalName, r.volume)
    p   = pid(r.paperId)

    g.add((j, RDF.type, PUB.Journal))
    g.add((vol, RDF.type, PUB.Volume))
    g.add((vol, PUB.belongsTo, j))

    year_val = pd.to_numeric(r.year, errors="coerce")
    if pd.notna(year_val):
        g.add((vol, PUB.volumeYear, Literal(int(year_val), datatype=XSD.gYear)))

    g.add((p, PUB.publishedIn, vol))

In [None]:
part_df = pd.read_csv(DATA/"proceeding_part_of.csv")
proc2event = {}
for _,r in part_df.iterrows():
    name = r.conferenceName.strip()
    ev_class = PUB.Workshop if re.search(r"workshop", name, re.I) else PUB.Conference
    ev  = eid(name)
    proc2event[r.proceedingId] = ev
    g.add((ev,RDF.type,ev_class))
    g.add((ev,RDFS.label,Literal(name)))

In [None]:
for _,r in pd.read_csv(DATA/"proceedings_nodes.csv").iterrows():
    prc = proc(r.proceedingId)
    g.add((prc,RDF.type,PUB.Proceedings))
    yr,city = str(r.year).strip(), str(r.city).strip()
    if yr and not math.isnan(float(yr)):
        g.add((prc, PUB.yearLiteral, Literal(int(float(yr)), datatype=XSD.gYear)))
    if city and city != 'nan':
        g.add((prc, PUB.cityLiteral, Literal(city)))
    ev = proc2event.get(r.proceedingId)
    if not ev:
        name = r.conferenceName.strip()
        ev_class = PUB.Workshop if re.search(r"workshop", name, re.I) else PUB.Conference
        ev = eid(name)
        g.add((ev,RDF.type,ev_class))
        g.add((ev,RDFS.label,Literal(name)))
    g.add((prc,PUB.isPartOf,ev))

In [14]:
for _,r in pd.read_csv(DATA/"paper_presented_in.csv").iterrows():
    g.add((pid(r.paperId),PUB.presentedIn,proc(r.proceedingId)))


In [None]:
name2author = { str(lbl): subj
                for subj,_,lbl in g.triples((None, PUB.name, None)) }

for _, row in pd.read_csv(DATA/"h_index.csv").iterrows():
    a_uri = name2author.get(str(row.authorName))
    if a_uri:
        g.add((a_uri, PUB.hIndex,
               Literal(int(row.hIndex), datatype=XSD.integer)))

In [16]:
out = pathlib.Path("publication_ABOX.ttl")
g.serialize(out,format="turtle")
print("A-Box written to",out)

A-Box written to publication_ABOX.ttl
