# Preprocessing

In [1]:
import json
import os
import csv
from tqdm.notebook import tqdm
from collections import defaultdict

In [2]:
dataset_dir_path = './datasets/dblp-aminer_v12/'
dataset_path = './datasets/dblp-aminer_v12/dblp.v12.json'
N_RECORDS = 4894081

### Step 0 : exploring data

In [12]:
with open(dataset_path) as f:
    f.readline() # skip first line
    for _ in range(2):
        #print(json.dumps(json.loads(f.readline()), indent=2))
        print(f.readline())

{"id":1091,"authors":[{"name":"Makoto Satoh","org":"Shinshu University","id":2312688602},{"name":"Ryo Muramatsu","org":"Shinshu University","id":2482909946},{"name":"Mizue Kayama","org":"Shinshu University","id":2128134587},{"name":"Kazunori Itoh","org":"Shinshu University","id":2101782692},{"name":"Masami Hashimoto","org":"Shinshu University","id":2114054191},{"name":"Makoto Otani","org":"Shinshu University","id":1989208940},{"name":"Michio Shimizu","org":"Nagano Prefectural College","id":2134989941},{"name":"Masahiko Sugimoto","org":"Takushoku University, Hokkaido Junior College","id":2307479915}],"title":"Preliminary Design of a Network Protocol Learning Tool Based on the Comprehension of High School Students: Design by an Empirical Study Using a Simple Mind Map","year":2013,"n_citation":1,"page_start":"89","page_end":"93","doc_type":"Conference","publisher":"Springer, Berlin, Heidelberg","volume":"","issue":"","doi":"10.1007/978-3-642-39476-8_19","references":[2005687710,2018037215

In [16]:
n_errors = 0
n_authors = 0
n_fos = 0
paper_field_count = defaultdict(int)
author_field_count = defaultdict(int)
venue_field_count = defaultdict(int)
fos_field_count = defaultdict(int)

with open(dataset_path) as file:
    file.readline() # skip first line
    for _ in tqdm(range(N_RECORDS)):
        line = file.readline()
        try:
            record = json.loads(line[1:])
        except:
            try:
                record = json.loads(line)
            except:
                print(line)
                record = {}
                n_errors += 1
                
        if record:
            for field, value in record.items():
                if value:
                    paper_field_count[field] += 1
                    
            if 'authors' in record:
                for author in record['authors']:
                    n_authors += 1
                    for field, value in author.items():
                        if value:
                            author_field_count[field] += 1
                        
            if 'venue' in record:
                for field, value in record['venue'].items():
                    if value:
                            venue_field_count[field] += 1
                        
            if 'fos' in record:
                for fos in record['fos']:
                    n_fos += 1
                    for field, value in fos.items():
                        if value:
                            fos_field_count[field] += 1
            
print(n_errors)
print()
print(json.dumps(paper_field_count, indent=2))
print()
print('n_authors', n_authors)
print(json.dumps(author_field_count, indent=2))
print()
print(json.dumps(venue_field_count, indent=2))
print()
print('n_fos', n_fos)
print(json.dumps(fos_field_count, indent=2))

HBox(children=(FloatProgress(value=0.0, max=4894081.0), HTML(value='')))


0

{
  "id": 4894081,
  "authors": 4894063,
  "title": 4894081,
  "year": 4894072,
  "n_citation": 3538030,
  "page_start": 4356226,
  "page_end": 4112329,
  "doc_type": 4394640,
  "publisher": 4134190,
  "doi": 3920939,
  "references": 3777107,
  "indexed_abstract": 4232520,
  "fos": 4877401,
  "venue": 4846973,
  "volume": 2183176,
  "issue": 1618242,
  "alias_ids": 21408
}

n_authors 14934850
{
  "name": 14934850,
  "org": 11361508,
  "id": 14934850
}

{
  "raw": 4820972,
  "id": 4371968,
  "type": 4371970
}

n_fos 45029892
{
  "name": 45029892,
  "w": 41824168
}


In [21]:
paper_ids = set()
author_ids = set()
venue_ids = set()
fos = set()

with open(dataset_path) as file:
    file.readline() # skip first line
    for _ in tqdm(range(N_RECORDS)):
        line = file.readline()
        try:
            record = json.loads(line[1:])
        except:
            record = json.loads(line)
            
        paper_ids.add(record['id'])
        
        if 'authors' in record:
            author_ids.update([author['id'] for author in record['authors']])

        if 'venue' in record:
            if 'id' in record['venue']:
                venue_ids.add(record['venue']['id'])

        if 'fos' in record:
            fos.update([fos['name'] for fos in record['fos']])
        
print('unique paper ids', str(len(paper_ids) - N_RECORDS))
print('nb unique authors', str(len(author_ids)))
print('nb unique venues', str(len(venue_ids)))
print('nb unique fos', str(len(fos)))

HBox(children=(FloatProgress(value=0.0, max=4894081.0), HTML(value='')))


unique paper ids 0
nb unique authors 4398138
nb unique venues 10480
nb unique fos 132337


In [22]:
print('venue_ids min-max', str(min(venue_ids)), '-->', str(max(venue_ids)))

venue_ids min-max 182001 --> 2996807011


In [20]:
venue = set()
no_id_no_raw_count = 0

with open(dataset_path) as file:
    file.readline() # skip first line
    for _ in tqdm(range(N_RECORDS)):
        line = file.readline()
        try:
            record = json.loads(line[1:])
        except:
            record = json.loads(line)
            
        paper_ids.add(record['id'])

        if 'venue' in record:
            if 'id' not in record['venue']:
                if 'raw' in record['venue']:
                    venue.add(record['venue']['raw'])
                else:
                    no_id_no_raw_count += 1
        
print('nb unique venues', str(len(venue)))
print('nb records with empty venue data', no_id_no_raw_count)

HBox(children=(FloatProgress(value=0.0, max=4894081.0), HTML(value='')))


nb unique venues 38742
nb records with empty venue data 0


### Step 1 : create sub structures

##### Create fos and fos-paper files

In [25]:
fos_paper_dict = defaultdict(set)

with open(dataset_path) as file:
    file.readline() # skip first line
    for _ in tqdm(range(N_RECORDS)):
        line = file.readline()
        try:
            record = json.loads(line[1:])
        except:
            record = json.loads(line)

        if 'fos' in record:
            for fos in record['fos']:
                fos_paper_dict[fos['name']].add(record['id'])
            
id_fos_dict = dict(enumerate(fos_paper_dict.keys()))

HBox(children=(FloatProgress(value=0.0, max=4894081.0), HTML(value='')))




In [28]:
with open(dataset_dir_path+'fos.json', 'w') as file:
    json.dump(id_fos_dict, file)

In [29]:
fos_paper_pairs = []

for fos_id, fos_name in tqdm(id_fos_dict.items()):
    for paper_id in fos_paper_dict[fos_name]:
        fos_paper_pairs.append((paper_id, fos_id))

HBox(children=(FloatProgress(value=0.0, max=132337.0), HTML(value='')))




In [31]:
with open(dataset_dir_path+'paper_fos.json', 'w') as file:
    json.dump(fos_paper_pairs, file)

In [32]:
len(fos_paper_pairs)

44987427

In [33]:
len(id_fos_dict)

132337

##### Create author file

In [7]:
author_dict = {}

with open(dataset_path) as file:
    file.readline() # skip first line
    for _ in tqdm(range(N_RECORDS)):
        line = file.readline()
        try:
            record = json.loads(line[1:])
        except:
            record = json.loads(line)
                    
        if 'authors' in record:
            for author in record['authors']:
                if author['id'] not in author_dict:
                    author_dict[author['id']] = author
                elif 'org' in author and 'org' not in author_dict[author['id']]:
                    author_dict[author['id']]['org'] = author['org']
            
len(author_dict)

HBox(children=(FloatProgress(value=0.0, max=4894081.0), HTML(value='')))




4398138

In [8]:
with open(dataset_dir_path+'author.json', 'w') as file:
    json.dump(author_dict, file)

##### Create author-paper file

In [3]:
author_paper = set()

with open(dataset_path) as file:
    file.readline() # skip first line
    for _ in tqdm(range(N_RECORDS)):
        line = file.readline()
        try:
            record = json.loads(line[1:])
        except:
            record = json.loads(line)
                    
        if 'authors' in record:
            author_paper.update([(record['id'], author['id']) for author in record['authors']])
            
len(author_paper)

HBox(children=(FloatProgress(value=0.0, max=4894081.0), HTML(value='')))




14934850

In [4]:
with open(dataset_dir_path+'paper_author.json', 'w') as file:
    json.dump(list(author_paper), file)

##### Create venue and paper files

In [3]:
# create venue dicts
venue_with_no_id_dict = {}
venue_dict = {}

with open(dataset_path) as file:
    file.readline() # skip first line
    for _ in tqdm(range(N_RECORDS)):
        line = file.readline()
        try:
            record = json.loads(line[1:])
        except:
            record = json.loads(line)

        if 'venue' in record:
            if 'id' in record['venue']:
                if record['venue']['id'] not in venue_dict:
                    venue_dict[record['venue']['id']] = record['venue']
            elif (record['venue']['raw']).lower() not in venue_with_no_id_dict:
                venue_with_no_id_dict[(record['venue']['raw']).lower()] = record['venue']
                
print(str(len(venue_dict)), str(len(venue_with_no_id_dict)))

HBox(children=(FloatProgress(value=0.0, max=4894081.0), HTML(value='')))


10480 38273


In [4]:
# merge venue dicts
reverse_venue_dict = dict([(venue['raw'].lower(), venue['id']) for venue in venue_dict.values()])

for i, venue in enumerate(venue_with_no_id_dict.values()):
    if venue['raw'].lower() not in reverse_venue_dict:
        venue['id'] = i
        venue_dict[i] = venue
        
reverse_venue_dict = dict([(venue['raw'].lower(), venue['id']) for venue in venue_dict.values()])

In [5]:
len(venue_dict)

48740

In [6]:
with open(dataset_dir_path+'venue.json', 'w') as file:
    json.dump(venue_dict, file)

In [7]:
# create paper file
def reconstruct_abstract(inverted_index):    
    index = {}
    for word, list_of_pos in inverted_index.items():
        for pos in list_of_pos:
            index[pos] = word
    
    abstract_list = []
    for _, word in sorted(index.items(), key=lambda t: t[0]):
        abstract_list.append(word)
    
    return " ".join(abstract_list)

In [9]:
papers = []
n_chunks = 0

with open(dataset_path) as file:
    file.readline() # skip first line
    for _ in tqdm(range(N_RECORDS)):
        line = file.readline()
        try:
            record = json.loads(line[1:])
        except:
            record = json.loads(line)
            
        record.pop('authors', 0)
        record.pop('references', 0)
        record.pop('alias_ids', 0)
        record.pop('fos', 0)
        
        if 'indexed_abstract' in record:
            indexed_abstract = record.pop('indexed_abstract')
            record['abstract'] = reconstruct_abstract(indexed_abstract['InvertedIndex'])

        if 'venue' in record:
            venue = record.pop('venue')
            if 'id' in venue:
                record['venue_id'] = venue['id']
            else:
                record['venue_id'] = reverse_venue_dict[venue['raw'].lower()]
            
        papers.append(record)

        if len(papers) > 1000000:
            with open(dataset_dir_path+'/paper_chunk'+str(n_chunks)+'.json', 'w') as f:
                json.dump(papers, f)
            papers = []
            n_chunks += 1 
    
    with open(dataset_dir_path+'/paper_chunk'+str(n_chunks)+'.json', 'w') as f:
        json.dump(papers, f)

HBox(children=(FloatProgress(value=0.0, max=4894081.0), HTML(value='')))




In [20]:
with open(dataset_dir_path+'paper.json', 'w') as file:
    json.dump(papers, file)

##### Create paper-paper files

In [3]:
paper_paper_pairs = set()

with open(dataset_path) as file:
    file.readline() # skip first line
    for _ in tqdm(range(N_RECORDS)):
        line = file.readline()
        try:
            record = json.loads(line[1:])
        except:
            record = json.loads(line)
            
        if 'references' in record:
            paper_paper_pairs.update([(record['id'], ref_id) for ref_id in record['references']])

HBox(children=(FloatProgress(value=0.0, max=4894081.0), HTML(value='')))




In [6]:
with open(dataset_dir_path+'paper_ref.json', 'w') as file:
    json.dump(list(paper_paper_pairs), file)

##### Create paper_ref_list and paper_cit_list files

In [3]:
citation_dict = {}
reference_dict = {}

with open(dataset_path) as file:
    file.readline() # skip first line
    for _ in tqdm(range(N_RECORDS)):
        line = file.readline()
        try:
            record = json.loads(line[1:])
        except:
            record = json.loads(line)
            
        if 'references' in record:
            reference_dict[record['id']] = record['references']
            
            for ref_id in record['references']:
                if ref_id in citation_dict:
                    citation_dict[ref_id].append(record['id'])
                else:
                    citation_dict[ref_id] = [record['id']]

HBox(children=(FloatProgress(value=0.0, max=4894081.0), HTML(value='')))




In [4]:
paper_paths = [dataset_dir_path + f for f in os.listdir(dataset_dir_path) if 'chunk' in f]

for path in tqdm(paper_paths): 
    with open(path) as file:
        papers = json.load(file)
        
        for paper in tqdm(papers):
            if paper['id'] in citation_dict:
                paper['citation_ids'] = citation_dict[paper['id']]
            if paper['id'] in reference_dict:
                paper['reference_ids'] = reference_dict[paper['id']]
    
    with open(path[:-5]+'a_reformatted.json', 'w') as f:            
        json.dump(papers[:500000], f)
        
    with open(path[:-5]+'b_reformatted.json', 'w') as f:            
        json.dump(papers[500000:], f)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1000001.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000001.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000001.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000001.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=894077.0), HTML(value='')))





In [3]:
with open(dataset_dir_path + 'paper_fos.json') as input_file:
    data = json.load(input_file)
    size = len(data) // 2
    
    with open(dataset_dir_path + 'paper_fos_c1.json', 'w') as f:
        json.dump(data[:size], f)
        
    with open(dataset_dir_path + 'paper_fos_c2.json', 'w') as f:
        json.dump(data[size:], f)

In [3]:
with open(dataset_dir_path + 'paper_paper.json') as input_file:
    data = json.load(input_file)
    size = len(data) // 2
    
    with open(dataset_dir_path + 'paper_paper_c1.json', 'w') as f:
        json.dump(data[:size], f)
        
    with open(dataset_dir_path + 'paper_paper_c2.json', 'w') as f:
        json.dump(data[size:], f)