In [1]:
import re
import json
import numpy as np
from collections import defaultdict
from flashtext import KeywordProcessor

In [2]:
def find_numbers_with_offsets(text):
    result = []    
    # number
    #numbers = re.finditer(r'\b\d+(?:\.\d+)?\b', text)
    numbers = re.finditer(r'\b(?:\d+(?:\.\d+)?(?:[eE][-+^]?\d+)?|\d+(?:[eE][-+]?\d+))\b', text)
    for match in numbers:
        number = match.group()
        start = match.start()
        end = match.end()
        # next to [ ] or ( ) 
        if '[' in text[start-1:end+1] or '(' in text[start-1:end+1] or ']' in text[start-1:end+1] or ')' in text[start-1:end+1]:
            continue
        # inside [ ] or ( ) 
        if '[' in text[start-10:start] and ']' in text[end:end+10] or '(' in text[start-10:start] and ')' in text[end:end+10]:
            continue
        result.append((('v number', number), start, end))
    
    # range
    number_ranges = re.finditer(r'\[([\d.]+),\s*([\d.]+)\]', text)
    for match in number_ranges:
        start = match.start()
        end = match.end()
        result.append((('v range', match.group()), start, end))

    # set
    number_sets = re.finditer(r'\(([\d\s,]+)\)', text)
    for match in number_sets:
        start = match.start()
        end = match.end()
        numbers = re.findall(r'\d+', match.group(1))
        result.append((('v set', numbers), start, end))
    
    return result

# text axample
text = 'This is a sample [hhh] text deep learning (dl) with numbers 1e-5 like 123, 3.14, and number ranges [0, 1]. Also, it has number sets like (10, 20, 30).'
numbers_with_offsets = find_numbers_with_offsets(text)

for item in numbers_with_offsets:
    print(item)

(('v number', '1e-5'), 60, 64)
(('v number', '123'), 70, 73)
(('v number', '3.14'), 75, 79)
(('v range', '[0, 1]'), 99, 105)
(('v set', ['10', '20', '30']), 137, 149)


In [3]:
with open('ap_rel.json', 'r') as f:
    # pair: (artifact parameter)
    ap_rel = json.load(f)

In [4]:
kp = KeywordProcessor()
for (a,p) in ap_rel:
    kp.add_keyword(a, ('artifact', a))
    kp.add_keyword(p, ('parameter', p))

In [11]:
rawdata_fpath = './transformed_pprs_filtered.jsonl'

def read_jsonl_file(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            json_obj = json.loads(line.strip())
            data.append(json_obj)
    return data

jsonl_data = read_jsonl_file(rawdata_fpath)
# sample
jsonl_data = jsonl_data[:10]
# jsonl_data

In [12]:
# 匹配并转换 LaTeX 文本中的科学计数法表示
def convert_scientific_notation(match):
    power = int(match.group(1))
    number = 10 ** (power)
    return str(number)


def latex_to_text(latex_string):
    
    # LaTeX command
    text = re.sub(r'\\[a-zA-Z]+\*?', '', latex_string)
    
    text = re.sub(r'10\^\{(-\d+)\}', convert_scientific_notation, text)
    
    # 去除格式标记
    text = re.sub(r'\{[^}]+\}', '', text)
    
    # 去除注释
    #text = re.sub(r'%.*', '', text)
    
    # 去除引用标记
    text = re.sub(r'\[\d+\]', '', text)
    
    # 去除\)
    #text = re.sub(r'\\[()]', '', text)
    text = re.sub(r'[\\/\\()]', '', text)
    
    # space and \n
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()

In [18]:
extracted_info = defaultdict(list)
for item in jsonl_data:
    text = item['paragraphs'][0]
    text = latex_to_text(text)
    #print(text)
    sentences = re.split(r'\.\s', text)
    for sentence in sentences:
        entities = kp.extract_keywords(sentence, span_info=True)
        values = find_numbers_with_offsets(sentence)
#         print(entities, values)
        if values and len(entities) > 0:
            for v in values:
                entities.append(v)
        if len(entities) > 1:
#             print(entities,'\n', sentence)
            extracted_info[item['id']].append([entities, sentence]) 
            
        # if (a,p,v), then add
        
            
        # creat entity-relation pair:
        # select the nearest value
        
#         print(v)
        
#         print(x,'\n', sentence)        
#     print(len(sentence))
    
#     x = kp.extract_keywords(text, span_info=True)
#     v = numbers_with_offsets = find_numbers_with_offsets(text)
#     x.append(v)
#     print(x,'\n', text)
    
#     print('\n')
extracted_info

defaultdict(list,
            {'1507.01422': [[[(('parameter', 'dropout'), 17, 24),
                (('parameter', 'layer'), 25, 30),
                (('parameter', 'layer'), 63, 68),
                (('parameter', 'dropout'), 77, 84),
                (('v number', '0.5'), 94, 97),
                (('v number', '50'), 98, 100)],
               'We also tested a dropout layer after the first fully connected layer, with a dropout ratio of 0.5 50% of probability to set a neuron’s output value to zero']],
             '1506.05929': [[[(('parameter', 'learning rate'), 25, 38),
                (('parameter', 'weight decay'), 54, 66),
                (('v number', '0.01'), 89, 93),
                (('v number', '0.9'), 96, 99),
                (('v number', '5'), 106, 107),
                (('v number', '0.0001'), 108, 114)],
               ', with the values of the learning rate, momentum, and weight decay hyperparameters being 0.01 , 0.9 , and 5 0.0001 respectively'],
              [[(('par

In [8]:
len(extracted_info)

1128

In [9]:
# save to JSON file
with open("extracted_info.json", "w") as json_file:
    json.dump(extracted_info, json_file)

In [10]:
for info in extracted_info:
    # how to deal with too much p and v???
    print(extracted_info[info],'\n')

[[[(('parameter', 'dropout'), 17, 24), (('parameter', 'layer'), 25, 30), (('parameter', 'layer'), 63, 68), (('parameter', 'dropout'), 77, 84), (('v number', '0.5'), 94, 97), (('v number', '50'), 98, 100)], 'We also tested a dropout layer after the first fully connected layer, with a dropout ratio of 0.5 50% of probability to set a neuron’s output value to zero']] 

[[[(('parameter', 'learning rate'), 25, 38), (('parameter', 'weight decay'), 54, 66), (('v number', '0.01'), 89, 93), (('v number', '0.9'), 96, 99), (('v number', '5'), 106, 107), (('v number', '0.0001'), 108, 114)], ', with the values of the learning rate, momentum, and weight decay hyperparameters being 0.01 , 0.9 , and 5 0.0001 respectively'], [[(('parameter', 'learning rate'), 64, 77), (('v number', '10'), 104, 106)], 'Whenever the error on the validation set stopped decreasing the learning rate was decreased by a factor 10']] 

[[[(('artifact', 'model'), 4, 9), (('artifact', 'model'), 97, 102), (('v number', '60'), 19, 