# Download the latest PapersWithCode Dataset and perform some preprocessing
This is the first notebook in the AI Dictionary project. It downloads the latest version of the PapersWithCode dataset and performs some preprocessing. It outputs several JSON files, which the helper script then enriches with generated keywords, embeddings and data from OpenAlex.

In [5]:
import os, sys, re
import requests
import gzip
import shutil
import hashlib
from tqdm.notebook import trange, tqdm
import pandas as pd

In [6]:
# Notebook-wide control variables

# When True, the PapersWithCode data is downloaded again; files that already exist are deleted first.
DOWNLOAD_PWC_DATA = True

PWC_DATA_PATH = "data/pwc/"  # raw JSON dumps downloaded from PapersWithCode
PWC_API_PATH = "data/pwc_api/"  # areas/tasks fetched from the PapersWithCode API
PWC_PROCESSED_JSON_PATH = "data/pwc_processed_json/"  # preprocessed JSON written by this notebook
NEO4J_PATH = "data/neo4j/"
NEO4J_DB_NAME = "pwa1"

# Create every working directory that does not exist yet.
for directory in (PWC_API_PATH, PWC_DATA_PATH, NEO4J_PATH, PWC_PROCESSED_JSON_PATH):
    if not os.path.exists(directory):
        os.makedirs(directory)

In [7]:
## ONLY EXECUTE IF YOU WANT TO DOWNLOAD THE JSON FILES FROM PAPERSWITHCODE AGAIN!
# Download the gzipped JSON dumps published by PapersWithCode, unpack them into
# PWC_DATA_PATH and remove the archives afterwards.
# Check https://paperswithcode.com/about if the links are still valid.

PWC_DUMP_BASE_URL = 'https://production-media.paperswithcode.com/about/'
# (local file stem, file name of the dump on the PapersWithCode server)
PWC_DUMPS = (
    ('papers', 'papers-with-abstracts.json.gz'),
    ('methods', 'methods.json.gz'),
    ('repos', 'links-between-papers-and-code.json.gz'),
    ('evaluation_tables', 'evaluation-tables.json.gz'),
    ('datasets', 'datasets.json.gz'),
)
# (connect timeout, read timeout) in seconds, so a stalled connection cannot hang the notebook forever.
PWC_DOWNLOAD_TIMEOUT = (10, 300)
# The archives are streamed to disk in chunks of this many bytes instead of being held in memory.
PWC_DOWNLOAD_CHUNK_SIZE = 1024 * 1024


def file_size_mb(path):
    """Return the size of the file at `path` in MB (10**6 bytes), rounded to two decimals."""
    return round(os.path.getsize(path) / 1000000, 2)


def download_pwc_dump(stem, remote_name):
    """Download one gzipped dump and unpack it to `PWC_DATA_PATH + stem + '.json'`.

    Args:
        stem: Local file stem, e.g. 'papers'.
        remote_name: File name of the archive below PWC_DUMP_BASE_URL.

    Returns:
        True if the server answered with HTTP 200 and the file was unpacked,
        False otherwise (the status code is printed in that case).

    Network and file-system errors are not caught and propagate to the caller.
    """
    gz_path = PWC_DATA_PATH + stem + '.json.gz'
    json_path = PWC_DATA_PATH + stem + '.json'

    # stream=True keeps the (potentially several hundred MB large) body out of memory.
    with requests.get(PWC_DUMP_BASE_URL + remote_name, stream=True, timeout=PWC_DOWNLOAD_TIMEOUT) as response:
        if response.status_code != 200:
            print(f"Could not download {stem}.json.gz")
            print(f"Status code: {response.status_code}")
            return False
        with open(gz_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=PWC_DOWNLOAD_CHUNK_SIZE):
                f.write(chunk)
    print(f"Downloaded {stem}.json.gz. Size is {file_size_mb(gz_path)} MB")

    with gzip.open(gz_path, 'rb') as f_in:
        with open(json_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(gz_path)
    print(f"Unzipped {stem}.json.gz. Size is {file_size_mb(json_path)} MB")
    print("")
    return True


if DOWNLOAD_PWC_DATA:
    print("Downloading JSON files from PapersWithCode")
    # If there are files from an earlier run, delete them
    for stem, _ in PWC_DUMPS:
        old_json_path = PWC_DATA_PATH + stem + '.json'
        if os.path.exists(old_json_path):
            print(f"Deleting old {stem}.json")
            os.remove(old_json_path)

    failed_dumps = [stem for stem, remote_name in PWC_DUMPS if not download_pwc_dump(stem, remote_name)]

    # Only claim success when every dump actually arrived.
    if failed_dumps:
        print(f"Could not download: {', '.join(failed_dumps)}. Previous copies were deleted above, so re-run this cell before continuing.")
    else:
        print(f"Downloaded JSON files from PapersWithCode and saved them to {PWC_DATA_PATH}")
else:
    print("Not downloading JSON files from PapersWithCode")
    # Check if the files are already there
    if not os.path.exists(PWC_DATA_PATH + 'papers.json'):
        print("JSON files seem to not exist - set the download variable to True or provide the files here...")
        sys.exit()

Downloading JSON files from PapersWithCode
Deleting old papers.json
Deleting old methods.json
Deleting old repos.json
Deleting old evaluation_tables.json
Deleting old datasets.json
Downloaded papers.json.gz. Size is 342.64 MB
Unzipped papers.json.gz. Size is 1335.52 MB

Downloaded methods.json.gz. Size is 0.75 MB
Unzipped methods.json.gz. Size is 3.16 MB

Downloaded repos.json.gz. Size is 18.33 MB
Unzipped repos.json.gz. Size is 114.14 MB

Downloaded evaluation_tables.json.gz. Size is 13.69 MB
Unzipped evaluation_tables.json.gz. Size is 165.69 MB

Downloaded datasets.json.gz. Size is 3.03 MB
Unzipped datasets.json.gz. Size is 13.87 MB

Downloaded JSON files from PapersWithCode and saved them to data/pwc


In [8]:
# Fetch all research areas and the tasks belonging to each area from the
# PapersWithCode API and store them as one JSON file in PWC_API_PATH.
if DOWNLOAD_PWC_DATA:
    ITEMS_PER_PAGE = 100
    PWC_API_URL = "https://paperswithcode.com/api/v1"
    PWC_API_TIMEOUT = 60  # seconds; avoids hanging forever on a stalled request

    # Get all areas
    print("Getting all areas...")
    areas_payload = requests.get(f"{PWC_API_URL}/areas", timeout=PWC_API_TIMEOUT).json()
    areas = pd.DataFrame(areas_payload["results"])
    print("Done.")

    # Get all tasks from all areas, collected per area id
    print("Getting all tasks...")
    tasks_by_area = {}
    for area_id in tqdm(areas.id):
        # Make a first request to get the count
        count = requests.get(f"{PWC_API_URL}/areas/{area_id}/tasks?items_per_page=1", timeout=PWC_API_TIMEOUT).json()["count"]
        print(f"Getting tasks from area {area_id}. Got {count} tasks...")

        # Ceiling division: exactly as many pages as are needed to hold `count` items.
        page_count = (count + ITEMS_PER_PAGE - 1) // ITEMS_PER_PAGE
        area_tasks = []
        for page in trange(1, page_count + 1):
            # Reset per page so the error handler never prints an unbound or stale payload.
            payload = None
            try:
                payload = requests.get(f"{PWC_API_URL}/areas/{area_id}/tasks?items_per_page={ITEMS_PER_PAGE}&page={page}", timeout=PWC_API_TIMEOUT).json()
                if payload["results"] is not None:
                    area_tasks.extend(payload["results"])
                else:
                    print(f"Error getting tasks from area {area_id}.")
                    break
            except Exception as e:
                # Best effort: report the failed page and continue with the next one.
                print(f"Error getting tasks from area {area_id}.")
                print(e)
                print(payload)
        tasks_by_area[area_id] = area_tasks
    print("Done.")

    # New column "tasks": the list of task records of each area
    areas["tasks"] = [tasks_by_area[area_id] for area_id in areas.id]

    # Write areas to json file
    areas.to_json(PWC_API_PATH + "areas.json", orient="records")

    print(f"Saved {len(areas)} areas with corresponding tasks to {PWC_API_PATH}areas.json.")
else:
    print("Not downloading areas and tasks from PapersWithCode")
    # Check if the files are already there
    if not os.path.exists(PWC_API_PATH + 'areas.json'):
        print("JSON files seem to not exist - set the download variable to True or provide the files here...")
        sys.exit()

Getting all areas...
Done.
Getting all tasks...


  0%|          | 0/16 [00:00<?, ?it/s]

Getting tasks from area adversarial. Got 21 tasks...


  0%|          | 0/1 [00:00<?, ?it/s]

Getting tasks from area audio. Got 67 tasks...


  0%|          | 0/1 [00:00<?, ?it/s]

Getting tasks from area computer-code. Got 59 tasks...


  0%|          | 0/1 [00:00<?, ?it/s]

Getting tasks from area computer-vision. Got 1304 tasks...


  0%|          | 0/14 [00:00<?, ?it/s]

Getting tasks from area graphs. Got 83 tasks...


  0%|          | 0/1 [00:00<?, ?it/s]

Getting tasks from area knowledge-base. Got 33 tasks...


  0%|          | 0/1 [00:00<?, ?it/s]

Getting tasks from area medical. Got 252 tasks...


  0%|          | 0/3 [00:00<?, ?it/s]

Getting tasks from area methodology. Got 178 tasks...


  0%|          | 0/2 [00:00<?, ?it/s]

Getting tasks from area miscellaneous. Got 246 tasks...


  0%|          | 0/3 [00:00<?, ?it/s]

Getting tasks from area music. Got 24 tasks...


  0%|          | 0/1 [00:00<?, ?it/s]

Getting tasks from area natural-language-processing. Got 632 tasks...


  0%|          | 0/7 [00:00<?, ?it/s]

Getting tasks from area playing-games. Got 42 tasks...


  0%|          | 0/1 [00:00<?, ?it/s]

Getting tasks from area reasoning. Got 63 tasks...


  0%|          | 0/1 [00:00<?, ?it/s]

Getting tasks from area robots. Got 42 tasks...


  0%|          | 0/1 [00:00<?, ?it/s]

Getting tasks from area speech. Got 77 tasks...


  0%|          | 0/1 [00:00<?, ?it/s]

Getting tasks from area time-series. Got 92 tasks...


  0%|          | 0/1 [00:00<?, ?it/s]

Done.
Saved 16 areas with corresponding tasks to data/pwc_api/areas.json.


In [9]:
# Read the downloaded JSON files into pandas dataframes
print("Loading JSON files into dataframes...")
papers_df, methods_df, repos_df, datasets_df = (
    pd.read_json(PWC_DATA_PATH + dump_name)
    for dump_name in ('papers.json', 'methods.json', 'repos.json', 'datasets.json')
)
# evaluation_tables_df = pd.read_json(PWC_DATA_PATH + 'evaluation_tables.json')

# The areas (including their tasks) come from the API download, not from the dumps
areas_df = pd.read_json(PWC_API_PATH + 'areas.json')
print("Loaded JSON files into dataframes")
print("")
print(f"Got {len(papers_df)} papers, {len(methods_df)} methods, {len(repos_df)} repos, {len(datasets_df)} datasets and {len(areas_df)} areas.")

Loading JSON files into dataframes...
Loaded JSON files into dataframes

Got 410364 papers, 2156 methods, 212222 repos, 8822 datasets and 16 areas.


In [10]:
# Do some preprocessing on the dataframes: give every node a stable id and
# derive the edge tables (dataset-task, paper-task, paper-method, paper-repo, task-area).

def md5_id(text):
    """Return the MD5 digest of `text` (UTF-8 encoded) as a decimal integer string."""
    return str(int(hashlib.md5(text.encode('utf-8')).hexdigest(), 16))


def slugify(name):
    """Lowercase `name` and replace spaces with dashes."""
    return name.lower().replace(" ", "-")


print("Preprocessing dataframes...")
print("")

# --- Node ids ---
papers_df['id'] = papers_df['paper_url'].apply(md5_id)
methods_df['name'] = methods_df['name'].apply(slugify)
methods_df['id'] = ("method/" + methods_df['name']).apply(md5_id)
repos_df['id'] = repos_df['repo_url'].apply(md5_id)
repos_df['paper_id'] = repos_df['paper_url'].apply(md5_id)
areas_df['id_md5'] = ("area/" + areas_df['id']).apply(md5_id)
print(f"[NODES] Got {len(papers_df)} rows in papers_df")
print(f"[NODES] Got {len(methods_df)} rows in methods_df")
print(f"[NODES] Got {len(repos_df)} rows in repos_df")
print(f"[NODES] Got {len(areas_df)} rows in areas_df")

# Make a new dataframe tasks_df with all tasks taken from the areas_df.
# The rows are collected first and the frame is built once; concatenating
# row by row copies the whole frame on every iteration.
task_rows = [
    (task["id"], task["name"], task["description"], area_id_md5)
    for area_id_md5, area_tasks in zip(areas_df["id_md5"], areas_df["tasks"])
    for task in area_tasks
]
tasks_df = pd.DataFrame(task_rows, columns=["id", "name", "description", "area_id_md5"])
tasks_df['id_md5'] = ("task/" + tasks_df['id']).apply(md5_id)
print(f"[NODES] Got {len(tasks_df)} rows in tasks_df")

# Give each dataset a unique id
datasets_df['id'] = datasets_df['url'].apply(md5_id)
print(f"[NODES] Got {len(datasets_df)} rows in datasets_df")

# --- Edges ---
# Dataset-task pairs: the entries of the tasks column are dicts, only their "task" field is used.
datasets_tasks_df = datasets_df[['id', 'tasks']].explode('tasks').dropna().reset_index(drop=True)
datasets_tasks_df.columns = ['dataset_id', 'task_id']
# Reduce each dict to its task name, lowercased and with spaces replaced by dashes
datasets_tasks_df['task_id'] = datasets_tasks_df['task_id'].apply(lambda entry: slugify(entry['task']))
datasets_tasks_df['task_id_md5'] = ("task/" + datasets_tasks_df['task_id']).apply(md5_id)
print(f"[EDGES] Got {len(datasets_tasks_df)} rows in datasets_tasks_df")

# Paper-task pairs: here the tasks column holds the task names directly.
papers_tasks_df = papers_df[['id', 'tasks']].explode('tasks').dropna().reset_index(drop=True)
papers_tasks_df.columns = ['paper_id', 'task_id']
# Lowercase the task names and replace spaces with dashes
papers_tasks_df['task_id'] = papers_tasks_df['task_id'].apply(slugify)
papers_tasks_df['task_id_md5'] = ("task/" + papers_tasks_df['task_id']).apply(md5_id)
print(f"[EDGES] Got {len(papers_tasks_df)} rows in papers_tasks_df")

# Paper-method pairs: the entries of the methods column are dicts, only their "name" field is used.
papers_methods_df = papers_df[['id', 'methods']].explode('methods').dropna().reset_index(drop=True)
papers_methods_df.columns = ['paper_id', 'method_id']
# Replace the dict in method_id with its name value only
papers_methods_df['method_id'] = papers_methods_df['method_id'].apply(lambda entry: slugify(entry['name']))
papers_methods_df['method_id_md5'] = ("method/" + papers_methods_df['method_id']).apply(md5_id)
print(f"[EDGES] Got {len(papers_methods_df)} rows in papers_methods_df")

# Paper-repo pairs: the columns id and paper_id of repos_df, renamed to repo_id and paper_id.
papers_repos_df = repos_df[['id', 'paper_id']].reset_index(drop=True)
papers_repos_df.columns = ['repo_id', 'paper_id']
print(f"[EDGES] Got {len(papers_repos_df)} rows in papers_repos_df")

# Task-area pairs: the hashed ids of each task and of the area it was listed under.
tasks_areas_df = tasks_df[['id_md5', 'area_id_md5']].reset_index(drop=True)
tasks_areas_df.columns = ['id', 'area']
print(f"[EDGES] Got {len(tasks_areas_df)} rows in tasks_areas_df")

print("")
print("Done preprocessing dataframes!")

Preprocessing dataframes...

[NODES] Got 410364 rows in papers_df
[NODES] Got 2156 rows in methods_df
[NODES] Got 212222 rows in repos_df
[NODES] Got 16 rows in areas_df
[NODES] Got 3215 rows in tasks_df
[NODES] Got 8822 rows in datasets_df
[EDGES] Got 15129 rows in datasets_tasks_df
[EDGES] Got 669552 rows in papers_tasks_df
[EDGES] Got 435782 rows in papers_methods_df
[EDGES] Got 212222 rows in papers_repos_df
[EDGES] Got 3215 rows in tasks_areas_df

Done preprocessing dataframes!


In [11]:
# A little preprocessing

# Remove all HTML tags, asterisks, newlines, repeated whitespace and leading/trailing
# whitespace from the titles, abstracts and descriptions. Missing values are left untouched.

# Raw strings: "\s" inside a normal string literal is an invalid escape sequence.
MARKUP_PATTERN = re.compile(r'<[^<]+?>|\*')
WHITESPACE_PATTERN = re.compile(r'\s+')


def clean_text(text):
    """Return `text` without HTML tags and asterisks and with normalized whitespace.

    Tags and asterisks are replaced by a space; afterwards every run of
    whitespace (including newlines) is collapsed into a single space, so the
    replacement itself cannot leave double spaces behind. Values that are not
    strings (None, NaN) are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    without_markup = MARKUP_PATTERN.sub(' ', text)
    return WHITESPACE_PATTERN.sub(' ', without_markup).strip()


print("Preprocessing title, abstract and description columns...")
for frame, column in (
    (papers_df, 'title'),
    (papers_df, 'abstract'),
    (methods_df, 'description'),
    (datasets_df, 'description'),
    (tasks_df, 'description'),
):
    frame[column] = frame[column].apply(clean_text)
print("Done preprocessing title, abstract and description columns!")

Preprocessing title, abstract and description columns...
Done preprocessing title, abstract and description columns!


In [12]:
# Write every node and edge dataframe to its own JSON file (one record per row)
print("Saving dataframes to json files...")
node_frames = {
    "papers": papers_df,
    "methods": methods_df,
    "repos": repos_df,
    "datasets": datasets_df,
    "areas": areas_df,
    "tasks": tasks_df,
}
edge_frames = {
    "datasets_tasks": datasets_tasks_df,
    "papers_tasks": papers_tasks_df,
    "papers_methods": papers_methods_df,
    "papers_repos": papers_repos_df,
    "tasks_areas": tasks_areas_df,
}

# Nodes first, then edges; the file name is the dictionary key plus ".json"
for file_stem, frame in {**node_frames, **edge_frames}.items():
    frame.to_json(PWC_PROCESSED_JSON_PATH + file_stem + ".json", orient="records")
print("Saved dataframes to json files!")

Saving dataframes to json files...
Saved dataframes to json files!


# Compute Time
Now the time has come to enrich your dataset with other information like embeddings, keywords and OpenAlex Info. This step requires a bit more computing power so it should be done on a GPU. We recommend using the *01a_Data_Gathering_Helper.py* script and running it with tmux. It does the following:

- Extract Keywords from Abstracts and Descriptions (using the YAKE framework)
- Calculate Embeddings for Titles, Abstracts and Descriptions (using the Sentence Transformers framework and malteos/SciNCL model)
- Get OpenAlex Info for each paper (using the OpenAlex Postgres Database)
- Get OpenAlex Authors (and edges to papers) for each paper (using the OpenAlex Postgres Database)
- Get OpenAlex Institutions (and edges to authors) for each paper (using the OpenAlex Postgres Database)
- Get OpenAlex Citation Info (and edges to papers) for each paper (using the OpenAlex Postgres Database)

#### Info:
JSONs and CSVs for Authors, Institutions and their respective edges are saved automatically in the filesystem, they do not have to be reimported here...