# Pre Processing

In [1]:
import json
import pprint
import csv
import collections
import os

Loads the JSON exported from MongoDB into a list of dictionaries, one per publication (`json.load` returns plain `dict` objects, not `OrderedDict`).

In [2]:
# Read the MongoDB export into memory. The encoding is given explicitly
# because JSON text is UTF-8, whereas open() would otherwise fall back to the
# platform's locale encoding and could mis-decode non-ASCII text.
with open("mongodb-documents.json", "r", encoding="utf-8") as f:
    dic = json.load(f)

#pprint.pprint(dic, indent=2) # all different kind of publications are in an array marked by the key

Extracts the relations from the JSON, in particular:

- publications to authors
- publications to keywords
- citations

It also creates the authors table.

In [3]:
# extract all useful relations: authors, keywords and citations
def flatten(l):
    """Return the leaves of an arbitrarily nested list as one flat list.

    Only ``list`` instances are descended into; any other value (including
    tuples, dicts and strings) is treated as a single leaf. A non-list
    argument therefore comes back wrapped in a one-element list.
    """
    if not isinstance(l, list):
        return [l]
    leaves = []
    for child in l:
        # Recurse so that lists nested at any depth are unpacked in order.
        leaves.extend(flatten(child))
    return leaves

def normalize_item(elem, args):
    """Project ``elem`` onto the keys listed in ``args``.

    If ``elem`` is a dict (``OrderedDict`` included) that contains every key
    in ``args``, the result holds ``elem[arg]`` for each ``arg``, in order.
    Otherwise ``elem`` itself is used as the first value and the remaining
    positions are padded with ``None`` so the result has ``len(args)`` slots.

    A one-slot result is returned as the bare value rather than a 1-tuple;
    every other result is returned as a tuple.

    The type check accepts any ``dict`` rather than only
    ``collections.OrderedDict``, because ``json.load`` yields plain dicts
    unless an ``object_pairs_hook`` is supplied; the narrower check would
    treat such items as scalars.
    """
    if isinstance(elem, dict) and set(args).issubset(elem.keys()):
        ret_val = tuple(elem[arg] for arg in args)
    else:
        # Multiplying by a negative count gives an empty tuple, which covers
        # the case where args is empty.
        ret_val = (elem,) + (None,) * (len(args) - 1)
    return ret_val[0] if len(ret_val) == 1 else ret_val

def extract_one_to_many_relation(collection_in_use, first_arg, second_arg, normalize=False, normalizing_params=()):
    """Build a flat list of ``(key, item)`` pairs from a collection of records.

    For every record that contains ``second_arg``, the value stored under it
    is flattened and each resulting item is paired with the record's
    ``first_arg`` value. Records without ``second_arg`` are skipped; a record
    that has ``second_arg`` but lacks ``first_arg`` raises ``KeyError``.

    Parameters:
        collection_in_use: iterable of dict-like records.
        first_arg: key whose value becomes the first element of each pair.
        second_arg: key whose (possibly nested) list supplies the items.
        normalize: only the literal ``True`` enables normalisation of each
            item through ``normalize_item``.
        normalizing_params: keys handed to ``normalize_item``. The default is
            an immutable empty tuple rather than a shared mutable list.

    Returns:
        A list of 2-tuples, in record order and then item order.
    """
    relation = []
    for pub in collection_in_use:
        if second_arg not in pub.keys():
            continue
        key = pub[first_arg]
        for item in flatten(pub[second_arg]):
            if normalize is True:
                item = normalize_item(item, normalizing_params)
            relation.append((key, item))
    return relation

# Build the three one-to-many relations, each keyed by publication title.
pub_to_authors = extract_one_to_many_relation(dic, "title", "authors")
pub_to_keywords = extract_one_to_many_relation(dic, "title", "keywords")
citations = extract_one_to_many_relation(dic, "title", "citations")

# Authors table: keep the first record seen for every distinct author name.
temp_authors = [author for _, author in pub_to_authors]
authors_names = set()
authors = []
for author in temp_authors:
    if author["name"] in authors_names:
        continue
    authors.append(author)
    authors_names.add(author["name"])

# The relation itself only keeps the author's name as its second element.
pub_to_authors = [(title, author["name"]) for title, author in pub_to_authors]


#pprint.pprint(pub_to_authors)

In [4]:
# Normalise every author record in place:
#   - "birth_year" is cast from string to int (1900 when the key is absent)
#   - "bio" has its "\n" characters replaced by spaces (None when the key is absent)

for auth in authors:
    if "birth_year" in auth.keys():
        auth["birth_year"] = int(auth["birth_year"])
    else:
        auth["birth_year"] = 1900

    if "bio" in auth.keys():
        auth["bio"] = auth["bio"].replace("\n", " ")
    else:
        auth["bio"] = None
#pprint.pprint(authors, indent=1)

In [5]:
#create publications and authors tables
def make_publication_csv_compliant(publication, args):
    """Return the values of ``publication`` for the keys in ``args`` as a tuple.

    The tuple follows the order of ``args``; a key that is missing from
    ``publication`` contributes ``None``.
    """
    row = []
    for arg in args:
        if arg in publication.keys():
            row.append(publication[arg])
        else:
            row.append(None)
    return tuple(row)
def map_to_csv(collection_in_use, args):
    """Turn each record of ``collection_in_use`` into a row tuple ordered by ``args``.

    Returns a list with one tuple per record, in the collection's order.
    """
    return [make_publication_csv_compliant(record, args) for record in collection_in_use]
# Column order of the publications table; a key absent from a document becomes None.
pub_params = [
    "title",
    "type",
    "publisher",
    "journal",
    "month",
    "year",
    "language",
    "booktitle",
    "volume",
    "pages",
    "ee",
]
pub_csv = map_to_csv(dic, pub_params)

# Column order of the authors table.
auth_params = [
    "name",
    "birth_year",
    "email",
    "affiliations",
    "bio",
]
auth_csv = map_to_csv(authors, auth_params)
#pprint.pprint(auth_csv)



In [6]:
# Write every table to "<working directory>/data/<name>.csv".
# Each entry is (file stem, header row, data rows).
CSVs_to_make = [
    ("publications", pub_params, pub_csv),
    ("authors", auth_params, auth_csv),
    ("publications2authors", ["title", "auth_name"], pub_to_authors),
    ("citations", ["citing", "cited"], citations),
    ("keywords", ["title", "keyword"], pub_to_keywords),
]
dir_path = os.path.join(os.path.abspath(""), "data")
if not os.path.exists(dir_path):
    os.makedirs(dir_path)
    print("directory added")
for table in CSVs_to_make:
    path_of_file = os.path.join(dir_path, table[0] + ".csv")
    # newline="" lets the csv writer control line endings itself; without it
    # an extra "\r" is added on platforms that write "\r\n", producing blank
    # rows. The explicit encoding keeps non-ASCII text from depending on the
    # platform's locale.
    with open(path_of_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
        writer.writerow(table[1])  # header
        writer.writerows(table[2])  # data rows
    print(f"done with {table[0]}.csv! (num of elements was: {len(table[2])})")
print("done")

done with publications.csv! (num of elements was: 2000)
done with authors.csv! (num of elements was: 1134)
done with publications2authors.csv! (num of elements was: 2406)
done with citations.csv! (num of elements was: 20227)
done with keywords.csv! (num of elements was: 4060)
done
