In [1]:
import os
import re
import regex
import json
import random
from ftfy import fix_text

import nltk
nltk.download('words')
from nltk.corpus import words

from tqdm.notebook import tqdm

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\jeman\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
paper_metadata_file_path = './datasets/aan/release/2014/acl-metadata.txt'
paper_citation_file_path = './datasets/aan/release/2014/acl.txt'
paper_text_dir_path = './datasets/aan/papers_text'

# Extract citations

In [3]:
paper_citation_file = open(paper_citation_file_path)
references = {}
citations = {}

for line in paper_citation_file:
    source, target = line.split(' ==> ')
    target = target[:-1] # remove '\n'
    
    if source in references:
        references[source].append(target)
    else:
        references[source] = [target]
        
    if target in citations:
        citations[target].append(source)
    else:
        citations[target] = [source]

paper_citation_file.close()

# Extract metadata

In [4]:
# source : https://github.com/chbrown/acl-anthology-network
paper_metadata_file = open(paper_metadata_file_path, 'rb')

lines = []

for bline in paper_metadata_file:
    try:
        line = bline.decode('UTF-8')
        line = fix_text(line)
        lines.append(line)
    except:
        try:
            line = bline.decode('ISO-8859-2')
            line = fix_text(line)
            lines.append(line)
        except UnicodeDecodeError as e:
            print(e)

paper_metadata_file.close()

In [5]:
paper_metadata = {}
current_paper_id = ''

# tags = {id, author, title, venue, year}

for line in lines:
    result = re.match('(\w+)\s+=\s+\{\s*(.*?)\s*\}', line)
    if result:
        tag = result[1]
        value = result[2]
        
        if tag == 'id':
            current_paper_id = value
            paper_metadata[current_paper_id] = {}
        
        paper_metadata[current_paper_id][tag] = value

# Extract abstract

In [6]:
# file extensions = {body, cite, ref, tmp, txt}, only need txt
textfiles = [f for f in os.listdir(paper_text_dir_path) if '.txt' in f]

paper_texts = {}

for filename in tqdm(textfiles):
    paper_id = filename.split('.')[0]
    path = paper_text_dir_path + '/' + filename
    
    with open(path) as file:
        text = ''
        abstract_len = 0
        begin_abstract = False
        end_abstract = False
        for line in file:
            if begin_abstract:
                if regex.search('(introduction){e<=3}\s*$', line.lower()):
                    begin_abstract = False
                    end_abstract = True
                    break
                elif regex.search('^\s*[0|1]', line):
                    begin_abstract = False
                    end_abstract = True
                    break
                elif regex.search('(biographies){e<=3}\s*$', line.lower()):
                    begin_abstract = False
                    end_abstract = True
                    break
                elif regex.search('(1\.?(introduction){e<=3})', line.lower()):
                    str_len = len(regex.search('^(.+)(1\.?(introduction){e<=3})', line.lower())[1])
                    text += line[:str_len]
                    abstract_len += 1
                    begin_abstract = False
                    end_abstract = True
                    break
                else:
                    text += line[:-1]
                    abstract_len += 1
            elif regex.search('(abstract){e<=1}\s*$', line.lower()):
                begin_abstract = True
                
    if text and end_abstract:
        paper_texts[paper_id] = text

len(paper_texts)

HBox(children=(FloatProgress(value=0.0, max=23595.0), HTML(value='')))




19197

In [8]:
# reject abstracts which contain mail or phone number
rejected_abstracts = []

for key, value in paper_texts.items():
    if re.search('[\w|\-]+@[\w|\-]+\.[a-zA-Z]+|[0-9]+[\-\s]+[0-9]+[\-\s]+[0-9]+', value):
        rejected_abstracts.append(key)
        
for key in rejected_abstracts:
    paper_texts.pop(key)
    
len(paper_texts)

19118

In [9]:
for key, value in tqdm(paper_texts.items()):
    corrected_value = value.strip()
    
    p = re.compile('[\s|\t]+')
    corrected_value = p.sub(' ', corrected_value)
    
    p = re.compile('\s*\-\s*')
    corrected_value = p.sub('-', corrected_value)
    
    for match in re.findall('(([a-zA-Z]+)\-([a-zA-Z]+))', corrected_value):
        if match[1] + match[2] in words.words():
            corrected_value = corrected_value.replace(match[0],
                                                      match[1] + match[2])
    
    # similarly removing spaces between words and checking if english word
    # to correct parsing errors is tempting but it generates some errors
    # and is very long to process
    # examples : 'to a' --> 'toa', 'and a' --> 'anda', 'a key' --> 'akey'
    
    paper_texts[key] = corrected_value

HBox(children=(FloatProgress(value=0.0, max=19118.0), HTML(value='')))




In [10]:
paper_texts['A00-1012']

'This paper explores the problem of identifying sen- tence boundaries in the transcriptions produced by automatic speech recognition systems. An experi- ment which determines the level of human perform- ance for this task is described as well as a memory- based computational pproach to the problem. '

In [10]:
paper_texts['A00-1012']

'This paper explores the problem of identifying sentence boundaries in the transcriptions produced by automatic speech recognition systems. An experiment which determines the level of human performance for this task is described as well as a memory-based computational pproach to the problem.'

In [11]:
paper_texts['A00-1019']

'This work is in the context of TRANSTYPE, a sys- tem that observes its user as he or she types a trans- lation and repeatedly suggests completions for the text already entered. The user may either accept, modify, or ignore these suggestions. We describe the design, implementation, and performance of a pro- totype which suggests completions of units of texts that are longer than one word. '

In [11]:
paper_texts['A00-1019']

'This work is in the context of TRANSTYPE, a system that observes its user as he or she types a translation and repeatedly suggests completions for the text already entered. The user may either accept, modify, or ignore these suggestions. We describe the design, implementation, and performance of a prototype which suggests completions of units of texts that are longer than one word.'

In [12]:
paper_texts['A92-1019']

'Hans  Pau lussen  Facult6s Universitaires Notre-Dame de la Paix, rue de Bruxelles 61, B-5000 Namur, Belgium phone +32-81-72.41.37, fax +32-81-23.03.91, e-mail hpaulus@cc.fundp.ac.be Wi l ly  Mar t in  Vrije Universiteit, De Boelelaan 1105, NL- 1007 MC Amsterdam, The Netherlands phone +31-20-548.37.63, fax +31-20-661.30.54, e-mail lexico@let.vu.nl '

In [12]:
paper_texts['A92-1019']

KeyError: 'A92-1019'

# Agregate

In [13]:
for paper_id in paper_metadata.keys():
    if paper_id in citations:
        paper_metadata[paper_id]['citations'] = citations[paper_id]
    else:
        paper_metadata[paper_id]['citations'] = []
    
    if paper_id in references:
        paper_metadata[paper_id]['references'] = references[paper_id]
    else:
        paper_metadata[paper_id]['references'] = []
        
    if paper_id in paper_texts:
        paper_metadata[paper_id]['abstract'] = paper_texts[paper_id]
    else:
        paper_metadata[paper_id]['abstract'] = ''

# Filter paper without complete metadata

In [17]:
# tags = {id, author, title, venue, year}
filtered_papers = []
filtered_ids = set()

for paper in tqdm(paper_metadata.values()):
    ok = True
    
    if 'id' not in paper or not paper['id']:
        ok = False
        
    if 'author' not in paper or not paper['author']:
        ok = False
    else:
        paper['author'] = [aut.strip() for aut in paper['author']]
        
    if 'title' not in paper or not paper['title']:
        ok = False
    else:
        paper['title'] = paper['title'].strip()
    
    if 'venue' not in paper or not paper['venue']:
        paper['venue'] = ''
    else:
        paper['venue'] = paper['venue'].strip()
        
    if 'year' not in paper or not paper['year'] or not paper['year'].isdigit():
        ok = False
    else:
        paper['year'] = int(paper['year'])
        
    if not paper['abstract']:
        ok = False
    else:
        paper['abstract'] = paper['abstract'].strip()
        
    if not paper['citations'] and not paper['references']:
        ok = False
        
    if ok:
        filtered_papers.append(paper)
        filtered_ids.add(paper['id'])
        
len(filtered_papers)

HBox(children=(FloatProgress(value=0.0, max=23775.0), HTML(value='')))




16391

In [18]:
for paper in  filtered_papers:
    paper['references'] = [paper_id for paper_id in paper['references'] if paper_id in filtered_ids]
    paper['citations'] = [paper_id for paper_id in paper['citations'] if paper_id in filtered_ids]

# Export to json

In [19]:
with open('./datasets/preprocessed_aan.json', 'w') as f:
    json.dump(filtered_papers, f)

# Prepro part 2

New problems identified during data analyzing (dist plot of features):
- too long / too short abstracts
- no link (in/out citation)

In [1]:
import json

In [2]:
aan_dataset_path = "./datasets/preprocessed_aan.json"
with open(aan_dataset_path) as f:
    aan_dataset = json.load(f)
len(aan_dataset)

16391

## Too short abstract

In [3]:
cleaned_dataset = []

for ref in aan_dataset:
    if not (len(ref['abstract'].split()) < 20 and ref['abstract'][-1] != '.'):
        cleaned_dataset.append(ref)
        
aan_dataset = cleaned_dataset

In [4]:
len(cleaned_dataset)

16318

## Too long abstract

In [5]:
cleaned_dataset = []

for ref in aan_dataset:
    if len(ref['abstract'].split()) < 500:
        cleaned_dataset.append(ref)
        
aan_dataset = cleaned_dataset

In [6]:
len(cleaned_dataset)

16134

## Refs without in/out citation

In [7]:
def recompute_links(references):
    ids = set([ref['id'] for ref in references])
    
    recomputed_refs = []
    
    for ref in references:
        ref['references'] = [i for i in ref['references'] if i in ids]
        ref['citations'] = [i for i in ref['citations'] if i in ids]
        recomputed_refs.append(ref)
        
    return recomputed_refs

In [8]:
def remove_no_link(references):
    cleaned_dataset = []
    
    for ref in references:
        if ref['references'] or ref['citations']:
            cleaned_dataset.append(ref)
    
    print('remove_no_link -->',str(len(cleaned_dataset)-len(references)),'removed')
    
    return cleaned_dataset

In [9]:
old_len = len(aan_dataset)
cleaned_dataset = remove_no_link(recompute_links(aan_dataset))
cleaned_len = len(cleaned_dataset)

while old_len > cleaned_len:
    old_len = cleaned_len
    cleaned_dataset = remove_no_link(recompute_links(cleaned_dataset))
    cleaned_len = len(cleaned_dataset)
    
aan_dataset = cleaned_dataset

remove_no_link --> -532 removed
remove_no_link --> 0 removed


In [10]:
len(cleaned_dataset)

15602

## Correct author list

In [11]:
for ref in aan_dataset:
    ref['author'] = "".join(ref['author']).split(';')

## Export to json 2

In [12]:
with open('./datasets/preprocessed2_aan.json', 'w') as f:
    json.dump(aan_dataset, f)

# Prepro part 3

Only keeping the big connected component

In [2]:
import json
import networkx as nx

In [3]:
aan_dataset_path = "./datasets/preprocessed2_aan.json"
with open(aan_dataset_path) as f:
    aan_dataset = json.load(f)
len(aan_dataset)

15602

In [5]:
G = nx.Graph()
for paper in aan_dataset:
    for ref_id in paper['references']:
        G.add_edge(paper['id'], ref_id)
    for cit_id in paper['citations']:
        G.add_edge(cit_id, paper['id'])
        
len(G)

15602

In [14]:
biggest_cc = max(nx.connected_components(G), key=len)
prepro_aan_dataset = [ref for ref in aan_dataset if ref['id'] in biggest_cc]

15366

In [18]:
with open('./datasets/preprocessed3_aan.json', 'w') as f:
    json.dump(prepro_aan_dataset, f)

In [21]:
G = nx.Graph()
for paper in aan_dataset:
    for ref_id in paper['references']:
        G.add_edge(paper['id'], ref_id)
    for cit_id in paper['citations']:
        G.add_edge(cit_id, paper['id'])
        
len(G)

15366

# Prepro part 4

Add fos from DBLP v12

In [29]:
import json
from tqdm.notebook import tqdm
import networkx as nx

In [30]:
dblp_dataset_path = './datasets/dblp-aminer_v12/dblp.v12.json'
N_RECORDS = 4894081

aan_dataset_path = "./datasets/preprocessed3_aan.json"
with open(aan_dataset_path) as f:
    aan_dataset = json.load(f)
    
aan_dataset_dict = {record['id']: record for record in aan_dataset}

len(aan_dataset)

15366

In [31]:
ids_set = set()
titles = {record['title'].lower() for record in aan_dataset}
title_id_dict = {record['title'].lower():record['id'] for record in aan_dataset}

with open(dblp_dataset_path) as file:
    file.readline() # skip first line
    for _ in tqdm(range(N_RECORDS)):
        line = file.readline()
        try:
            record = json.loads(line[1:])
        except:
            record = json.loads(line)
            
        if record['title'].lower() in titles and 'fos' in record:
            paper_id = title_id_dict[record['title'].lower()]
            ids_set.add(paper_id)
            aan_dataset_dict[paper_id]['fos'] = [fos['name'] for fos in record['fos']]
            
len(ids_set)

HBox(children=(FloatProgress(value=0.0, max=4894081.0), HTML(value='')))




12636

In [32]:
citations = nx.DiGraph()

for record in aan_dataset:
    if record['id'] in ids_set and 'references' in record:
        for ref_id in record['references']:
            if ref_id in ids_set:
                citations.add_edge(record['id'],ref_id)
            
len(citations)

12339

In [33]:
undirected_citations = citations.to_undirected()
largest_cc = max(nx.connected_components(undirected_citations), key=len)
len(largest_cc)

12274

In [34]:
nodes_to_remove = [node for node in citations if node not in largest_cc]
citations.remove_nodes_from(nodes_to_remove)

len(citations)

12274

In [36]:
selected_records = [record for record in aan_dataset_dict.values() if record['id'] in largest_cc]
    
len(selected_records)

12274

In [37]:
for record in selected_records:
    record['references'] = list(citations.successors(record['id']))
    record['citations'] = list(citations.predecessors(record['id']))

In [39]:
with open('./datasets/preprocessed4_aan.json', 'w') as file:
    json.dump(selected_records, file)

# Prepro part 5

Add fos weights from DBLP v12

In [13]:
import json
from tqdm.notebook import tqdm
import collections

In [3]:
dblp_dataset_path = './datasets/dblp-aminer_v12/dblp.v12.json'
N_RECORDS = 4894081

aan_dataset_path = "./datasets/preprocessed4_aan.json"
with open(aan_dataset_path) as f:
    aan_dataset = json.load(f)
    
aan_dataset_dict = {record['title'].lower():record for record in aan_dataset}

len(aan_dataset)

12274

In [7]:
titles_set = set()

with open(dblp_dataset_path) as file:
    file.readline() # skip first line
    for _ in tqdm(range(N_RECORDS)):
        line = file.readline()
        try:
            record = json.loads(line[1:])
        except:
            record = json.loads(line)
            
        if record['title'].lower() in aan_dataset_dict and 'fos' in record:
            titles_set.add(record['title'].lower())
            aan_dataset_dict[record['title'].lower()]['fos_w'] = record['fos']
            
len(titles_set)

HBox(children=(FloatProgress(value=0.0, max=4894081.0), HTML(value='')))




12274

In [14]:
nb_prob = 0
for record in aan_dataset_dict.values():
    fos_list = record['fos']
    fos_w_list = [fos['name'] for fos in record['fos_w']]
    
    if not collections.Counter(fos_list) == collections.Counter(fos_w_list):
        nb_prob += 1  
        
nb_prob

0

In [17]:
with open('./datasets/preprocessed4_aan_with_fos_w.json', 'w') as file:
    json.dump(aan_dataset, file)