In [1]:
import ijson
from tqdm import tqdm
import pandas as pd

import psycopg2
from psycopg2.errors import InFailedSqlTransaction

from pprint import pprint

In [2]:
# Postgresql config
config_postgres = {
    'host': 'localhost',
    'dbname': 'dblp',
    'user': 'postgres',
    'password': 'postgres',
    'port': 5432,
}

In [3]:
conn = psycopg2.connect(**config_postgres)
cur = conn.cursor()

In [28]:
# For reset
cur.execute('ROLLBACK')
conn.commit()

In [5]:
# For reset
if True:
    cur.execute("""
    DROP TABLE IF EXISTS keyword_rels CASCADE;
    DROP TABLE IF EXISTS authorships CASCADE;
    DROP TABLE IF EXISTS citations CASCADE;
    DROP TABLE IF EXISTS venues CASCADE;
    DROP TABLE IF EXISTS papers CASCADE;
    DROP TABLE IF EXISTS keywords CASCADE;
    DROP TABLE IF EXISTS authors CASCADE;
    """)

    conn.commit()

In [6]:
cur.execute("""
-- Venues
CREATE TABLE IF NOT EXISTS venues (
    vid VARCHAR(25) PRIMARY KEY,
    name TEXT,
    name_d TEXT NULL
);

-- Papers
CREATE TABLE IF NOT EXISTS papers (
    pid VARCHAR(25) PRIMARY KEY,
    title TEXT,
    year INT,
    vid VARCHAR(25) NULL,
    n_citation INT NULL,
    page_start TEXT NULL,
    page_end TEXT NULL,
    doc_type TEXT NULL,
    lang TEXT NULL,
    publisher TEXT NULL,
    volume TEXT NULL,
    issue TEXT NULL,
    issn TEXT NULL,
    isbn TEXT NULL,
    doi TEXT NULL,
    pdf TEXT NULL,
    url TEXT NULL,
    abstract TEXT NULL,
    FOREIGN KEY (vid) REFERENCES venues (vid)
);

-- References
CREATE TABLE IF NOT EXISTS citations (
    pid_from VARCHAR(25),
    pid_to VARCHAR(25),
    PRIMARY KEY (pid_from, pid_to)
    -- FOREIGN KEY (pid_from) REFERENCES papers (pid),
    -- FOREIGN KEY (pid_to) REFERENCES papers (pid)
);

-- Authors
CREATE TABLE IF NOT EXISTS authors (
	aid VARCHAR(25) PRIMARY KEY,
    name TEXT,
	bio TEXT NULL,
	email TEXT NULL,
    org TEXT NULL
);

-- Authorships (Paper-Author relationships)
CREATE TABLE IF NOT EXISTS authorships (
    pid VARCHAR(25),
    aid VARCHAR(25),
    order_forward INT,
    order_backward INT,
    num_authors INT,
    PRIMARY KEY (pid, aid),
    FOREIGN KEY (pid) REFERENCES papers (pid),
    FOREIGN KEY (aid) REFERENCES authors (aid)
);

-- Keywords
CREATE TABLE IF NOT EXISTS keywords (
    kid SERIAL PRIMARY KEY,
    keyword TEXT UNIQUE
);

-- Keyword relationships
CREATE TABLE IF NOT EXISTS keyword_rels (
    pid VARCHAR(25),
    kid SERIAL,
    PRIMARY KEY (pid, kid),
    FOREIGN KEY (pid) REFERENCES papers (pid),
    FOREIGN KEY (kid) REFERENCES keywords (kid)
);

CREATE INDEX IF NOT EXISTS venues_name_idx ON venues USING btree (name);
CREATE INDEX IF NOT EXISTS papers_title_idx ON papers USING btree (title);
CREATE INDEX IF NOT EXISTS authors_name_idx ON authors USING btree (name);
CREATE INDEX IF NOT EXISTS keywords_idx ON keywords USING btree (keyword);
""")

conn.commit()

In [7]:
rows_invalid = []
with tqdm(total = 5_354_309) as p:
    p.set_postfix({'# invalid': len(rows_invalid)})
    
    with open('data/dblp.v13_2.json', 'rb') as f:
        for ii, o in enumerate(ijson.items(f, 'item')):
            try:
                # If the title has no space within it, do not process the data.
                title = o['title'].strip() if 'title' in o else ''
                if ' ' not in title:
                    continue
                
                # Venue
                vid = None
                if 'venue' in o:
                    venue = o['venue']
                    vid = venue.get('_id')
                    if vid is not None:
                        cur.execute("""
                        INSERT INTO venues (vid, name, name_d)
                        VALUES (%s, %s, %s) ON CONFLICT DO NOTHING;
                        """, (vid, venue.get('raw'), venue.get('name_d', None)))

                pid = o.get('_id')
                paper = (
                    pid,
                    o.get('title'),
                    int(o.get('year')),
                    vid,
                    int(o.get('n_citation')) if 'n_citation' in o else None,
                    # o.get('page_start'),
                    # o.get('page_end'),
                    o.get('doctype'),
                    o.get('lang'),
                    o.get('publisher'),
                    o.get('volume'),
                    o.get('issue'),
                    o.get('issn'),
                    o.get('isbn'),
                    o.get('doi'),
                    o.get('pdf'),
                    o.get('url')[0] if 'url' in o and len(o['url']) > 0 else None,
                    o.get('abstract'),
                )
                cur.execute("""
                INSERT INTO papers
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT DO NOTHING;
                """, paper)
                conn.commit()

                # Authors & authorships
                if 'authors' in o and len(o['authors']) > 0:
                    authors = []
                    for author in o['authors']:
                        if author is None or author.get('_id') is None:
                            continue

                        authors.append((
                            author.get('_id'),
                            author.get('name'),
                            author.get('bio', None),
                            author.get('email', None),
                            author.get('org', None),
                        ))

                    cur.executemany("""
                    INSERT INTO authors (aid, name, bio, email, org)
                    VALUES (%s, %s, %s, %s, %s) ON CONFLICT DO NOTHING;
                    """, authors)

                    authorships = [
                        (pid, author[0], i + 1, len(authors) - i, len(authors))
                        for i, author in enumerate(authors)
                    ]

                    cur.executemany("""
                    INSERT INTO authorships (pid, aid, order_forward, order_backward, num_authors)
                    VALUES (%s, %s, %s, %s, %s) ON CONFLICT DO NOTHING;
                    """, authorships)
                    
                    conn.commit()

                # Keywords & Keyword relationships
                if 'keywords' in o:
                    # Use lowercase keywords of which length ranges from 1 to 100.
                    keywords = list(map(lambda x: x.lower().strip(), filter(lambda x: len(x) <= 100 and len(x) > 0, o['keywords'])))
                    
                    if keywords:
                        cur.executemany("""
                        INSERT INTO keywords (keyword)
                        VALUES (%s) ON CONFLICT DO NOTHING;
                        """, [(v,) for v in keywords])
                        conn.commit()

                        cur.executemany("""
                        INSERT INTO keyword_rels
                        SELECT %s AS pid, k.kid
                        FROM keywords k
                        WHERE k.keyword LIKE %s
                        ON CONFLICT DO NOTHING;
                        """, [(pid, v) for v in keywords])
                        conn.commit()
                
                if 'references' in o:
                    citations = [(pid, c) for c in o['references'] if c]
                    
                    cur.executemany("""
                    INSERT INTO citations (pid_from, pid_to) VALUES (%s, %s)
                    ON CONFLICT DO NOTHING;
                    """, citations)
                    conn.commit()
            
            except Exception as e:
                rows_invalid.append((o, e))
                cur.execute('ROLLBACK')
                conn.commit()
                p.set_postfix({'# invalid': len(rows_invalid)})
            
            p.update()

100%|█████████████████████████████████████████████████████████████▉| 5346819/5354309 [5:43:03<00:28, 259.77it/s, # invalid=407]


In [None]:
conn.close()