# Preprocessing

Source : Zhang, Y., Ma, J., Wang, Z., Chen, B., & Yu, Y. (2018). Collective topical PageRank: a model to evaluate the topic-dependent academic impact of scientific papers. Scientometrics, 114(3), 1345–1372. https://doi.org/10.1007/s11192-017-2626-1

In [1]:
import json
import os
import csv
from tqdm.notebook import tqdm

In [2]:
# Directory holding the raw dump and every file this notebook writes.
dataset_dir_path = './datasets/dblp-aminer'
# Raw dump, parsed below as one JSON record per line.
dataset_path = dataset_dir_path + '/dblp_papers_v11.txt'
# Passed to tqdm as `total` when iterating over the dump
# (presumably the dump's line count -- confirm if the file changes).
TOTAL_LEN = 4107340

In [3]:
# Pretty-print the first 10 records to get a feel for the schema.
# The dump is decoded as UTF-8 explicitly instead of relying on the platform
# default encoding. zip() stops cleanly when the file holds fewer than 10
# lines, whereas json.loads() on the '' returned by an exhausted readline()
# would raise.
with open(dataset_path, encoding='utf-8') as f:
    for _, line in zip(range(10), f):
        print(json.dumps(json.loads(line), indent=2))

{
  "id": "100001334",
  "title": "Ontologies in HYDRA - Middleware for Ambient Intelligent Devices.",
  "authors": [
    {
      "name": "Peter Kostelnik",
      "id": "2702511795"
    },
    {
      "name": "Martin Sarnovsky",
      "id": "2041014688"
    },
    {
      "name": "Jan Hreno",
      "id": "2398560122"
    }
  ],
  "venue": {
    "raw": "AMIF"
  },
  "year": 2009,
  "n_citation": 2,
  "page_start": "43",
  "page_end": "46",
  "doc_type": "",
  "publisher": "",
  "volume": "",
  "issue": "",
  "fos": [
    {
      "name": "Lernaean Hydra",
      "w": 0.4178039
    },
    {
      "name": "Database",
      "w": 0.4269269
    },
    {
      "name": "World Wide Web",
      "w": 0.415332377
    },
    {
      "name": "Ontology (information science)",
      "w": 0.459045082
    },
    {
      "name": "Computer science",
      "w": 0.399807781
    },
    {
      "name": "Middleware",
      "w": 0.5905041
    },
    {
      "name": "Ambient intelligence",
      "w": 0.5440575
   

# Step 1 : remove records with incomplete attributes

In [4]:
# Attributes a record must carry to be kept. Defined once, outside the loop,
# instead of being rebuilt for every line of the dump.
attributes = ['id', 'title', 'authors', 'venue', 'year',
              'references', 'indexed_abstract']

paper_ids = set()        # ids of the records carrying every required attribute
paper_references = {}    # paper id -> its 'references' value, for those records

# Decode the dump as UTF-8 explicitly rather than with the platform default.
with open(dataset_path, encoding='utf-8') as file:
    for line in tqdm(file, total=TOTAL_LEN):
        json_record = json.loads(line)
        if all(attribute in json_record for attribute in attributes):
            paper_ids.add(json_record['id'])
            paper_references[json_record['id']] = json_record['references']

HBox(children=(FloatProgress(value=0.0, max=4107340.0), HTML(value='')))




In [5]:
# Number of records that carry every required attribute.
len(paper_ids)

3260613

In [6]:
def reconstruct_abstract(inverted_index):
    """Rebuild an abstract's text from its inverted index.

    Parameters
    ----------
    inverted_index : dict
        Maps each word to the list of positions at which it occurs.

    Returns
    -------
    str
        The words ordered by ascending position and joined by single spaces.
        When one position is listed under several words, the word iterated
        last wins. An empty index yields an empty string.
    """
    word_at = {
        position: word
        for word, positions in inverted_index.items()
        for position in positions
    }
    return " ".join(word_at[position] for position in sorted(word_at))

In [8]:
# Quality filter over the records currently in `paper_ids`. A paper is flagged
# for removal when ANY of the following holds:
#   * its title is longer than 100 words;
#   * its venue lacks a non-empty 'id' or a non-empty 'raw' name;
#   * its abstract is missing ('InvertedIndex' absent), longer than 500 words,
#     or shorter than 20 words while not ending with a period;
#   * none of its authors has both a non-empty 'name' and a non-empty 'id'.
paper_ids_to_remove = set()

# Decode the dump as UTF-8 explicitly rather than with the platform default.
with open(dataset_path, encoding='utf-8') as file:
    for line in tqdm(file, total=TOTAL_LEN):
        json_record = json.loads(line)
        if json_record['id'] not in paper_ids:
            continue

        venue = json_record['venue']
        remove = (
            len(json_record['title'].split()) > 100
            or not venue.get('id')
            or not venue.get('raw')
        )

        if 'InvertedIndex' not in json_record['indexed_abstract']:
            remove = True
        else:
            abstract = reconstruct_abstract(json_record['indexed_abstract']['InvertedIndex'])
            nb_words = len(abstract.split())
            # endswith() is safe on an empty abstract, where abstract[-1] would
            # raise IndexError; an empty abstract is therefore flagged.
            if nb_words > 500 or (nb_words < 20 and not abstract.endswith('.')):
                remove = True

        # A missing/empty author list counts as "no valid author".
        has_valid_author = any(
            author.get('name') and author.get('id')
            for author in (json_record['authors'] or ())
        )
        if not has_valid_author:
            remove = True

        if remove:
            paper_ids_to_remove.add(json_record['id'])

HBox(children=(FloatProgress(value=0.0, max=4107340.0), HTML(value='')))




In [9]:
# Number of papers flagged for removal by the checks in the previous cell.
len(paper_ids_to_remove)

183754

In [10]:
# Drop the flagged papers; both operands are sets, so this is a set difference.
paper_ids = paper_ids - paper_ids_to_remove

In [11]:
# Papers left after removing the ids in paper_ids_to_remove.
len(paper_ids)

3076859

# Step 2 : remove records with no reference and no citation

This is done in two stages: first, references that point to papers outside the dataset are ignored; then, records left with neither references nor citations are removed.

In [12]:
# Invert the reference map: for every referenced paper id, collect the ids of
# the papers whose reference list contains it.
paper_citations = {}

for citing_id, cited_ids in paper_references.items():
    for cited_id in cited_ids:
        paper_citations.setdefault(cited_id, []).append(citing_id)

In [13]:
# Repeatedly flag papers that have no reference AND no citation among the
# papers still alive (in `paper_ids` and not yet flagged). Passes are repeated
# until one of them flags nothing new, since flagging a paper can only reduce
# the live links of the others.
paper_to_remove_ids = set()

while True:
    len_before = len(paper_to_remove_ids)
    for paper_id in tqdm(paper_ids):
        # The flagged set only grows, so a flagged paper never needs re-checking.
        if paper_id in paper_to_remove_ids:
            continue

        # any() stops at the first live link instead of building a full list,
        # and distinct loop names avoid shadowing the outer `paper_id`.
        has_reference = any(
            ref_id in paper_ids and ref_id not in paper_to_remove_ids
            for ref_id in paper_references.get(paper_id, ())
        )
        has_citation = has_reference or any(
            citing_id in paper_ids and citing_id not in paper_to_remove_ids
            for citing_id in paper_citations.get(paper_id, ())
        )

        if not has_reference and not has_citation:
            paper_to_remove_ids.add(paper_id)

    len_after = len(paper_to_remove_ids)
    print("New removed papers:",str(len_after - len_before))
    if len_after == len_before:
        break

HBox(children=(FloatProgress(value=0.0, max=3076859.0), HTML(value='')))


New removed papers: 131829


HBox(children=(FloatProgress(value=0.0, max=3076859.0), HTML(value='')))


New removed papers: 0


In [14]:
# Number of papers flagged as having neither a reference nor a citation left.
len(paper_to_remove_ids)

131829

In [15]:
# Drop the papers flagged above; both operands are sets, so this is a set difference.
paper_ids = paper_ids - paper_to_remove_ids

In [16]:
# Papers left after removing the ids in paper_to_remove_ids.
len(paper_ids)

2945030

# Step 3 : adjust names of venues

In [17]:
# Collect the distinct (venue id, raw venue name) pairs of the remaining papers.
venue_set = set()

# Decode the dump as UTF-8 explicitly, matching the UTF-8 encoding used when
# the venue names are written out.
with open(dataset_path, encoding='utf-8') as file:
    for line in tqdm(file, total=TOTAL_LEN):
        json_record = json.loads(line)
        if json_record['id'] in paper_ids:
            venue = json_record['venue']
            venue_set.add((venue['id'], venue['raw']))

HBox(children=(FloatProgress(value=0.0, max=4107340.0), HTML(value='')))




In [18]:
# Number of distinct (venue id, raw venue name) pairs.
len(venue_set)

7738

In [19]:
# Write one "<venue id> | <raw venue name>" line per distinct venue.
with open(dataset_dir_path + '/venue_names.txt', 'w', encoding='utf-8') as f:
    for venue_id, venue_raw in venue_set:
        f.write('{} | {}\n'.format(venue_id, venue_raw))

In [None]:
# Venue-name correction was planned at this point but turned out not to be needed.

# Step 4 : split the dataset into chunk files

In [20]:
# Write the cleaned records to numbered JSON chunk files
# (<dataset_dir_path>/prepro_dblp_chunk<N>.json), each holding a JSON array.
# Every record keeps id/title/authors/year/venue, gains the reconstructed
# 'abstract', and carries 'references' / 'citations' restricted to the
# surviving papers (the key is omitted when the restricted list is empty).
records = []
chunk_nb = 0


def _write_chunk(chunk_records, chunk_index):
    """Dump `chunk_records` as a JSON array into the chunk file numbered `chunk_index`."""
    chunk_path = dataset_dir_path + '/prepro_dblp_chunk' + str(chunk_index) + '.json'
    with open(chunk_path, 'w', encoding='utf-8') as f:
        json.dump(chunk_records, f)


# Decode the dump as UTF-8 explicitly rather than with the platform default.
with open(dataset_path, encoding='utf-8') as file:
    for line in tqdm(file, total=TOTAL_LEN):
        old_record = json.loads(line)
        record_id = old_record['id']
        if record_id not in paper_ids:
            continue

        # Key order is kept as id, title, authors, year, venue, abstract, ...
        new_record = {key: old_record[key]
                      for key in ('id', 'title', 'authors', 'year', 'venue')}
        new_record['abstract'] = reconstruct_abstract(old_record['indexed_abstract']['InvertedIndex'])

        kept_references = [ref_id for ref_id in paper_references.get(record_id, ())
                           if ref_id in paper_ids]
        if kept_references:
            new_record['references'] = kept_references

        kept_citations = [citing_id for citing_id in paper_citations.get(record_id, ())
                          if citing_id in paper_ids]
        if kept_citations:
            new_record['citations'] = kept_citations

        records.append(new_record)

        # NOTE(review): the flush triggers once the buffer EXCEEDS 100000, so a
        # full chunk holds 100001 records. Kept as-is so that chunk files stay
        # identical to those already produced.
        if len(records) > 100000:
            _write_chunk(records, chunk_nb)
            records = []
            chunk_nb += 1

# Flush the last, partially filled chunk.
if records:
    _write_chunk(records, chunk_nb)

HBox(children=(FloatProgress(value=0.0, max=4107340.0), HTML(value='')))




In [21]:
# Sanity check: total number of records across the chunk files.
# Only files matching the names written by the chunking step are counted
# ('prepro_dblp_chunk<N>.json'), so an unrelated file that merely contains
# "chunk" in its name cannot skew the total or break json.load().
size = 0
chunkfiles = [f for f in os.listdir(dataset_dir_path)
              if f.startswith('prepro_dblp_chunk') and f.endswith('.json')]

for filename in tqdm(chunkfiles):
    with open(dataset_dir_path + '/' + filename, encoding='utf-8') as file:
        size += len(json.load(file))

size

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




2945030

In [22]:
# Should equal the record count summed over the chunk files in the previous cell.
len(paper_ids)

2945030