In [78]:
import json
import pickle
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True) 
import itertools
import re
import requests
import sys
import pprint

In [62]:
# Path to the raw metadata file; it is read line by line, one JSON record per line.
# Alternative paths kept from the original comment:
#   './Parec/elasticsearch/data/arxiv_reduced.json'
#   './data/example_snapshot.json'
data_file = 'arxiv-metadata-oai-snapshot.json'

In [35]:
# Category filter: maps an arXiv category code to its human-readable name.
# Only Computer Science (cs.*) categories are listed, so any paper whose
# primary category is not a key of this dict is dropped by the filter.
# Reference: https://arxiv.org/help/api/user-manual
category_map = {
    'cs.AI': 'Artificial Intelligence',
    'cs.AR': 'Hardware Architecture',
    'cs.CC': 'Computational Complexity',
    'cs.CE': 'Computational Engineering, Finance, and Science',
    'cs.CG': 'Computational Geometry',
    'cs.CL': 'Computation and Language',
    'cs.CR': 'Cryptography and Security',
    'cs.CV': 'Computer Vision and Pattern Recognition',
    'cs.CY': 'Computers and Society',
    'cs.DB': 'Databases',
    'cs.DC': 'Distributed, Parallel, and Cluster Computing',
    'cs.DL': 'Digital Libraries',
    'cs.DM': 'Discrete Mathematics',
    'cs.DS': 'Data Structures and Algorithms',
    'cs.ET': 'Emerging Technologies',
    'cs.FL': 'Formal Languages and Automata Theory',
    'cs.GL': 'General Literature',
    'cs.GR': 'Graphics',
    'cs.GT': 'Computer Science and Game Theory',
    'cs.HC': 'Human-Computer Interaction',
    'cs.IR': 'Information Retrieval',
    'cs.IT': 'Information Theory',
    'cs.LG': 'Machine Learning',
    'cs.LO': 'Logic in Computer Science',
    'cs.MA': 'Multiagent Systems',
    'cs.MM': 'Multimedia',
    'cs.MS': 'Mathematical Software',
    'cs.NA': 'Numerical Analysis',
    'cs.NE': 'Neural and Evolutionary Computing',
    'cs.NI': 'Networking and Internet Architecture',
    'cs.OH': 'Other Computer Science',
    'cs.OS': 'Operating Systems',
    'cs.PF': 'Performance',
    'cs.PL': 'Programming Languages',
    'cs.RO': 'Robotics',
    'cs.SC': 'Symbolic Computation',
    'cs.SD': 'Sound',
    'cs.SE': 'Software Engineering',
    'cs.SI': 'Social and Information Networks',
    'cs.SY': 'Systems and Control',
}

In [None]:
# Optional export step: uncomment the two lines below to save the category map as JSON.
# with open("cs_categories.json", "w") as outfile:
#     json.dump(category_map, outfile)

In [66]:
def get_metadata(path_to_dataset):
    """Lazily yield the raw text lines of the dataset file.

    The file is opened in text mode with the 'latin_1' codec and streamed
    line by line, so the whole file is never held in memory at once. Lines
    are yielded unmodified, including their trailing newline.

    Args:
        path_to_dataset: Path of the file to read.

    Yields:
        str: The next line of the file.
    """
    with open(path_to_dataset, mode='r', encoding='latin_1') as dataset:
        # Delegating to the file object iterates it line by line.
        yield from dataset

def clean_strings(strings):
    """Normalise the whitespace of a single string.

    Leading and trailing whitespace is removed, and every remaining run of
    whitespace characters (spaces, tabs, line breaks, ...) is replaced by a
    single space.

    Args:
        strings: The string to clean (one string, despite the plural name).

    Returns:
        str: The cleaned string.
    """
    # Raw string literal: '\s' in a normal literal is an invalid escape
    # sequence, which emits a DeprecationWarning (SyntaxWarning on 3.12+).
    return re.sub(r'\s+', ' ', strings.strip())

In [69]:
def filter_dataset(path_to_dataset, category_map, min_year=2017, max_year=2022):
    """Reduce the raw metadata file to recent papers in the mapped categories.

    Every line of the file is parsed as one JSON record. A record is kept
    only if all of the following hold:

    * the last four characters of its 'journal-ref' field parse as an
      integer year with ``min_year <= year <= max_year``;
    * its 'categories' field is a string whose first space-separated entry
      is a key of ``category_map``;
    * its 'authors', 'title' and 'abstract' fields are strings.

    Every other record (malformed JSON, missing or wrongly typed fields,
    unmapped category, year out of range) is skipped.

    Args:
        path_to_dataset: Path of the line-delimited JSON file.
        category_map: Mapping from category code to display name; also acts
            as the category filter.
        min_year: First year to keep (inclusive). Defaults to 2017.
        max_year: Last year to keep (inclusive). Defaults to 2022.

    Returns:
        dict: ``{"root": [record, ...]}`` where each record has the keys
        'abstract', 'title', 'author', 'year' and 'category'. The text
        fields are whitespace-normalised with ``clean_strings``.
    """
    reduced = []

    for line in get_metadata(path_to_dataset):
        # json.loads() lost its `encoding` keyword in Python 3.9. Passing it
        # raised TypeError, which the former bare `except: pass` swallowed,
        # silently dropping every record.
        try:
            paper = json.loads(line)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if not isinstance(paper, dict):
            continue

        # The year is taken from the last four characters of the journal reference.
        ref = paper.get('journal-ref')
        if not isinstance(ref, str):
            continue
        try:
            year = int(ref[-4:])
        except ValueError:
            continue
        if not min_year <= year <= max_year:
            continue

        # Only the first listed category decides whether the paper is kept.
        listed_categories = paper.get('categories')
        if not isinstance(listed_categories, str):
            continue
        primary_category = listed_categories.split(" ")[0]
        if primary_category not in category_map:
            continue

        author = paper.get('authors')
        title = paper.get('title')
        abstract = paper.get('abstract')
        # Skip incomplete records instead of crashing later in clean_strings().
        if not all(isinstance(text, str) for text in (author, title, abstract)):
            continue

        reduced.append({
            "abstract": clean_strings(abstract),
            "title": clean_strings(title),
            "author": clean_strings(author),
            "year": year,
            "category": category_map[primary_category],
        })

    return {"root": reduced}        #add root

In [70]:
# Build the reduced data set from the raw metadata file.
data = filter_dataset(path_to_dataset=data_file, category_map=category_map)


'encoding' is ignored and deprecated. It will be removed in Python 3.9



In [76]:
# Preview the first reduced record.
print(data["root"][:1])

[{'abstract': 'In Lombardi drawings of graphs, edges are represented as circular arcs, and the edges incident on vertices have perfect angular resolution. However, not every graph has a Lombardi drawing, and not every planar graph has a planar Lombardi drawing. We introduce k-Lombardi drawings, in which each edge may be drawn with k circular arcs, noting that every graph has a smooth 2-Lombardi drawing. We show that every planar graph has a smooth planar 3-Lombardi drawing and further investigate topics connecting planarity and Lombardi drawings.', 'title': 'Planar and Poly-Arc Lombardi Drawings', 'author': 'Christian A. Duncan and David Eppstein and Michael T. Goodrich and Stephen G. Kobourov and Maarten L\\"offler', 'year': 2018, 'category': 'Computational Geometry'}]


In [72]:
# Save the reduced data set as a single JSON document.
with open('arxiv_reduced.json', mode='w', encoding='latin_1') as fp:
    json.dump(data, fp)

In [73]:
# Reload the saved file to verify the round trip.
# The `with` block closes the handle, which was previously left open.
# json.load() no longer accepts an `encoding` keyword (removed in Python 3.9);
# the text decoding is already done by open().
with open('arxiv_reduced.json', encoding='latin_1') as f:
    data = json.load(f)

In [74]:
# The first reloaded record should match the one printed before saving.
print(data["root"][0:1])

[{'abstract': 'In Lombardi drawings of graphs, edges are represented as circular arcs, and the edges incident on vertices have perfect angular resolution. However, not every graph has a Lombardi drawing, and not every planar graph has a planar Lombardi drawing. We introduce k-Lombardi drawings, in which each edge may be drawn with k circular arcs, noting that every graph has a smooth 2-Lombardi drawing. We show that every planar graph has a smooth planar 3-Lombardi drawing and further investigate topics connecting planarity and Lombardi drawings.', 'title': 'Planar and Poly-Arc Lombardi Drawings', 'author': 'Christian A. Duncan and David Eppstein and Michael T. Goodrich and Stephen G. Kobourov and Maarten L\\"offler', 'year': 2018, 'category': 'Computational Geometry'}]
