# Preprocessing

In [1]:
import json
import os
from tqdm.notebook import tqdm
import networkx as nx

In [2]:
# Location of the raw DBLP/AMiner v12 dump read by this notebook.
dataset_dir_path = './datasets/dblp-aminer_v12/'
dataset_path = dataset_dir_path + 'dblp.v12.json'
# Number of lines read after the first one; drives the read loop and its progress bar.
N_RECORDS = 4_894_081

In [3]:
def reconstruct_abstract(inverted_index):
    """Rebuild a plain-text abstract from its inverted index.

    Parameters
    ----------
    inverted_index : dict
        Maps each word to the list of positions at which it occurs.

    Returns
    -------
    str
        The words ordered by position and joined with single spaces.
        If the same position is listed for several words, the word seen
        last while iterating over ``inverted_index`` wins.
    """
    word_at = {}
    for token, positions in inverted_index.items():
        word_at.update(dict.fromkeys(positions, token))

    return " ".join(word_at[position] for position in sorted(word_at))

# Step 1 : keyword-based selection (Deep learning)

In [4]:
selected_records = []

# JSON text is UTF-8 by specification, so do not rely on the platform's
# default encoding when reading the dump.
with open(dataset_path, encoding='utf-8') as file:
    file.readline() # skip first line
    for _ in tqdm(range(N_RECORDS)):
        line = file.readline()
        # Most lines only parse once their first character is dropped
        # (presumably the ',' separating array elements - confirm against the
        # dump); lines without it are parsed as they are. Only a JSON decoding
        # failure triggers the fallback, so interrupts and unrelated errors
        # are no longer swallowed.
        try:
            record = json.loads(line[1:])
        except json.JSONDecodeError:
            record = json.loads(line)

        # Keep the record as soon as one of its fields of study is named
        # "deep learning" (case-insensitive); records without 'fos' are skipped.
        if any(fos['name'].lower() == 'deep learning' for fos in record.get('fos', [])):
            selected_records.append(record)

len(selected_records)

HBox(children=(FloatProgress(value=0.0, max=4894081.0), HTML(value='')))




35878

# Step 2 : remove records with incomplete attributes

In [16]:
# Every record kept from here on must carry all of these top-level keys.
attributes = ['title', 'authors', 'venue', 'year',
              'references', 'indexed_abstract']
ids_to_remove = set()

for record in tqdm(selected_records):
    has_missing_attribute = any(attribute not in record for attribute in attributes)
    if has_missing_attribute:
        ids_to_remove.add(record['id'])

# Filter by id, so every record sharing a flagged id is dropped.
selected_records = list(filter(lambda record: record['id'] not in ids_to_remove, selected_records))

len(selected_records)

HBox(children=(FloatProgress(value=0.0, max=29327.0), HTML(value='')))




29327

In [17]:
# Thresholds, counted in whitespace-separated words, used by the filters below.
MAX_TITLE_WORDS = 100
MAX_ABSTRACT_WORDS = 500
MIN_ABSTRACT_WORDS = 20

ids_to_remove = set()

for record in tqdm(selected_records):
    remove = False

    # Over-long titles are rejected.
    if len(record['title'].split()) > MAX_TITLE_WORDS:
        remove = True

    # The venue must provide both a non-empty 'id' and a non-empty 'raw' value.
    if 'id' not in record['venue'] or not record['venue']['id']:
        remove = True
    if 'raw' not in record['venue'] or not record['venue']['raw']:
        remove = True

    # The abstract must be reconstructible, not too long, and - when short -
    # at least end with a full stop.
    if 'InvertedIndex' not in record['indexed_abstract']:
        remove = True
    else:
        abstract = reconstruct_abstract(record['indexed_abstract']['InvertedIndex'])
        nb_words = len(abstract.split())
        if nb_words > MAX_ABSTRACT_WORDS:
            remove = True
        # endswith() rather than abstract[-1]: an empty abstract is rejected
        # here instead of raising IndexError.
        if nb_words < MIN_ABSTRACT_WORDS and not abstract.endswith('.'):
            remove = True

    # At least one author must have both a non-empty name and a non-empty id;
    # a missing/empty author list therefore also rejects the record.
    has_valid_author = any(
        'name' in author and author['name'] and 'id' in author and author['id']
        for author in (record['authors'] or [])
    )
    if not has_valid_author:
        remove = True

    if remove:
        ids_to_remove.add(record['id'])

selected_records = [record for record in selected_records if record['id'] not in ids_to_remove]

len(selected_records)

HBox(children=(FloatProgress(value=0.0, max=29327.0), HTML(value='')))




27307

# Step 3 : remove records with no reference and no citation (keep only the largest connected component of the citation graph)

In [40]:
# Directed citation graph restricted to the selected records: an edge goes
# from the citing record to the record it references.
citations = nx.DiGraph()
record_ids_set = {record['id'] for record in selected_records}

for record in selected_records:
    for ref_id in record.get('references', []):
        # References pointing outside the selected subset are ignored.
        if ref_id in record_ids_set:
            citations.add_edge(record['id'], ref_id)

# Nodes only enter the graph through add_edge, so this counts the records
# that have at least one reference or citation inside the subset.
# (The previous `len(restrict_citations)` referred to a name that is never
# defined in this notebook and fails on a fresh kernel.)
len(citations)

23089

In [41]:
# Connectivity is evaluated on the undirected view of the citation graph;
# `largest_cc` is the component containing the most nodes.
undirected_citations = citations.to_undirected()
components = nx.connected_components(undirected_citations)
largest_cc = max(components, key=len)

len(largest_cc)

22726

In [42]:
# Keep only the records whose id belongs to the largest connected component.
selected_records = list(filter(lambda record: record['id'] in largest_cc, selected_records))

len(selected_records)

22726

In [43]:
# Prune the graph so that it contains exactly the nodes of the largest component.
nodes_to_remove = list(filter(lambda node: node not in largest_cc, citations))
citations.remove_nodes_from(nodes_to_remove)

len(citations)

22726

# Step 4 : format metadata and save subset

In [38]:
formatted_records = []

for record in selected_records:
    paper_id = record['id']
    formatted_record = {
        'id': paper_id,
        'title': record['title'],
        'authors': record['authors'],
        'year': record['year'],
        'venue': record['venue'],
        # Only the field-of-study names are kept here.
        'fos': [field['name'] for field in record['fos']],
        'abstract': reconstruct_abstract(record['indexed_abstract']['InvertedIndex']),
        # Edges run from citing to cited paper, so successors are the papers
        # this record references and predecessors are the papers citing it.
        'references': list(citations.successors(paper_id)),
        'citations': list(citations.predecessors(paper_id)),
    }
    formatted_records.append(formatted_record)

len(formatted_records)

22726

In [39]:
# Persist the formatted subset as a single JSON document.
subset_path = './datasets/preprocessed_dblp_v12_subset.json'
with open(subset_path, mode='w') as output_file:
    json.dump(formatted_records, fp=output_file)

# Step 5 : adding fos weights

In [10]:
import json
from tqdm.notebook import tqdm
import collections

In [11]:
# Raw dump (read again below to fetch the full 'fos' entries) and its line count.
complete_dblp_dataset_path = './datasets/dblp-aminer_v12/dblp.v12.json'
N_RECORDS = 4_894_081

# Subset saved at the end of step 4.
dblp_dataset_path = './datasets/preprocessed_dblp_v12_subset.json'
with open(dblp_dataset_path) as f:
    dblp_dataset = json.load(f)

# Index the records by id. The values are the very same dict objects as in
# `dblp_dataset`, so updating one through the index also updates the list.
dblp_dataset_dict = {}
for record in dblp_dataset:
    dblp_dataset_dict[record['id']] = record

len(dblp_dataset)

22726

In [12]:
# Ids of the subset records for which a 'fos' entry was found in the raw dump.
ids_set = set()

# JSON text is UTF-8 by specification, so do not rely on the platform's
# default encoding when reading the dump.
with open(complete_dblp_dataset_path, encoding='utf-8') as file:
    file.readline() # skip first line
    for _ in tqdm(range(N_RECORDS)):
        line = file.readline()
        # Most lines only parse once their first character is dropped
        # (presumably the ',' separating array elements - confirm against the
        # dump); lines without it are parsed as they are. Only a JSON decoding
        # failure triggers the fallback, so interrupts and unrelated errors
        # are no longer swallowed.
        try:
            record = json.loads(line[1:])
        except json.JSONDecodeError:
            record = json.loads(line)

        # Attach the raw 'fos' entries of the dump to the matching subset
        # record under the 'fos_w' key.
        if record['id'] in dblp_dataset_dict and 'fos' in record:
            ids_set.add(record['id'])
            dblp_dataset_dict[record['id']]['fos_w'] = record['fos']

len(ids_set)

HBox(children=(FloatProgress(value=0.0, max=4894081.0), HTML(value='')))




22726

In [13]:
# Sanity check: count the records whose 'fos' names and 'fos_w' entry names
# differ as multisets (order is ignored, multiplicity is not).
nb_prob = 0
for paper in dblp_dataset_dict.values():
    expected_names = collections.Counter(paper['fos'])
    weighted_names = collections.Counter(entry['name'] for entry in paper['fos_w'])
    if expected_names != weighted_names:
        nb_prob += 1

nb_prob

0

In [14]:
# `dblp_dataset` holds the same record dicts that received 'fos_w' through
# `dblp_dataset_dict`, so the added key is part of what gets written.
subset_with_fos_w_path = './datasets/preprocessed_dblp_v12_subset_with_fos_w.json'
with open(subset_with_fos_w_path, mode='w') as output_file:
    json.dump(dblp_dataset, fp=output_file)

# Step 6 : convert int ids to string

In [3]:
import json

In [4]:
with open('./datasets/preprocessed_dblp_v12_subset_with_fos_w.json') as file:
    papers = json.load(file)

# Turn the paper id and every id listed under 'citations' / 'references'
# into a string, updating each paper dict in place.
for paper in papers:
    paper['id'] = str(paper['id'])
    paper['citations'] = list(map(str, paper['citations']))
    paper['references'] = list(map(str, paper['references']))

In [5]:
# Overwrite the file with the current content of `papers`.
output_path = './datasets/preprocessed_dblp_v12_subset_with_fos_w.json'
with open(output_path, mode='w') as output_file:
    json.dump(papers, fp=output_file)

In [6]:
with open('./datasets/preprocessed_dblp_v12_subset.json') as file:
    papers = json.load(file)

# Turn the paper id and every id listed under 'citations' / 'references'
# into a string, updating each paper dict in place.
for paper in papers:
    paper['id'] = str(paper['id'])
    paper['citations'] = list(map(str, paper['citations']))
    paper['references'] = list(map(str, paper['references']))

In [7]:
# Overwrite the file with the current content of `papers`.
output_path = './datasets/preprocessed_dblp_v12_subset.json'
with open(output_path, mode='w') as output_file:
    json.dump(papers, fp=output_file)