In [1]:
import json

def read_jsonl(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(json.loads(line.strip()))
    return data

def save_as_jsonl(data_list, file_path):
    """
    Save a list of dictionaries as a jsonl file.
    
    :param data_list: List of dictionaries.
    :param file_path: Path to the jsonl file.
    """
    with open(file_path, 'w', encoding='utf-8') as file:
        for entry in data_list:
            json_str = json.dumps(entry, ensure_ascii=False)
            file.write(json_str + '\n')


In [2]:
kp20k_train = read_jsonl('kp20k/train.json')
kptimes_train = read_jsonl('kptimes/train.json')
stackexchange_train = read_jsonl('stackexchange/train.json')

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def meng17_tokenize(text):
    '''
    The tokenizer used in Meng et al. ACL 2017
    parse the feed-in text, filtering and tokenization
    keep [_<>,\(\)\.\'%], replace digits with <digit>, split by [^a-zA-Z0-9_<>,\(\)\.\'%]
    :param text:
    :return: a list of tokens
    '''
    # remove line breakers
    text = re.sub(r'[\r\n\t]', ' ', text)
    # pad spaces to the left and right of special punctuations
    text = re.sub(r'[_<>,\(\)\.\'%]', ' \g<0> ', text)
    # tokenize by non-letters (new-added + # & *, but don't pad spaces, to make them as one whole word)
    tokens = list(filter(lambda w: len(w) > 0, re.split(r'[^a-zA-Z0-9_<>,#&\+\*\(\)\.\']', text)))

    return tokens


def extract_candidates(text):

    GRAMMAR_EN = """  NP:
{<NN.*|JJ>*<NN.*>}"""   # Adjective(s)(optional) + Noun(s)
    keyphrase_candidate = set()
    

    np_parser = nltk.RegexpParser(GRAMMAR_EN)  # Noun phrase parser
    
    tag = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))

    
    trees = np_parser.parse_sents(tag)  # Generator with one tree per sentence
    #print(text)

    for tree in trees:
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):  # For each nounphrase
            # Concatenate the token with a space
            keyphrase_candidate.add(' '.join(word for word, tag in subtree.leaves()))
    
    #print(keyphrase_candidate)
    keyphrase_candidate = {kp for kp in keyphrase_candidate if len(kp.split()) <= 4}
    #print(keyphrase_candidate)
  
    return list(keyphrase_candidate)


def title_candidates_extraction(title, text):
    
    cans = extract_candidates(title)
    candidates = []
    for can in cans:
        candidates.append(can.lower())

    candidates = list(set(candidates))
    
    present_phrases = []
    absent_phrases = []
    text_low = text.lower()
    tokenized_text = meng17_tokenize(text_low)
    stem_text = [ stemmer.stem(word) for word in tokenized_text ]
    stem_text = ' '.join(stem_text)

    # stem_text = ' '.join(meng17_tokenize(text_low))
    # print(stem_text)


    for p in candidates:
        tokenized_p = meng17_tokenize(p.lower())
        stem_p = [ stemmer.stem(word) for word in tokenized_p ]
        stem_p = ' '.join(stem_p)
        # print(stem_p)
    
        if stem_p not in stem_text:
            absent_phrases.append(p)
        else:
            present_phrases.append(p)

    return present_phrases, absent_phrases


def preprocess(dataset, name='kp20k'):

    processed_dataset = []

    for data in tqdm(dataset):

        temp = {}
        if name == 'kp20k':
            temp['title'] = data['title']
            title_present_phrase, title_absent_phrase = title_candidates_extraction(data['title'], data['abstract'])
            temp['abstract'] = data['abstract']
            temp['title_present_phrase'] = title_present_phrase
            temp['title_absent_phrase'] = title_absent_phrase
            temp['keyphrases'] = data['keywords']
        elif name == 'kptimes':
            temp['title'] = data['title']
            title_present_phrase, title_absent_phrase = title_candidates_extraction(data['title'], data['abstract'])
            temp['abstract'] =data['abstract']
            temp['title_present_phrase'] = title_present_phrase
            temp['title_absent_phrase'] = title_absent_phrase
            temp['keyphrases'] = data['keyword'].split(';')
        elif name =='stackexchange':
            temp['title'] = data['title']
            title_present_phrase, title_absent_phrase = title_candidates_extraction(data['title'], data['question'])
            temp['abstract'] = data['question']
            temp['title_present_phrase'] = title_present_phrase
            temp['title_absent_phrase'] = title_absent_phrase
            temp['keyphrases'] = data['tags'].split(';')
        processed_dataset.append(temp)

    return processed_dataset

[nltk_data] Downloading package punkt to /home/user01/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/user01/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
kp20k = preprocess(kp20k_train, 'kp20k')
kptimes = preprocess(kptimes_train, 'kptimes')
stackexchange = preprocess(stackexchange_train, 'stackexchange')

  0%|          | 0/514154 [00:00<?, ?it/s]

100%|██████████| 514154/514154 [29:51<00:00, 286.99it/s] 
100%|██████████| 259923/259923 [57:26<00:00, 75.41it/s]  
100%|██████████| 298965/298965 [19:25<00:00, 256.43it/s] 


In [5]:
def aggregate_data(data_list):
    """
    Calculate and aggregate the statistics from the given data list.
    
    :param data_list: List of dictionaries with the given structure.
    :return: A dictionary with the aggregated statistics.
    """
    
    total_title_absent = 0
    total_title_present = 0
    count_title_absent_1 = 0
    count_title_absent_2 = 0
    count_title_absent_3 = 0
    count_title_absent_4 = 0
    count_title_absent_5 = 0
    total_absent_docs = 0
    

    for entry in data_list:
        total_title_absent += len(entry['title_absent_phrase'])
        total_title_present += len(entry['title_present_phrase'])

        if len(entry['title_absent_phrase']) > 0:
            total_absent_docs += 1

        if len(entry['title_absent_phrase']) == 1:
            count_title_absent_1 += 1
        elif len(entry['title_absent_phrase']) == 2:
            count_title_absent_2 += 1
        elif len(entry['title_absent_phrase']) == 3:
            count_title_absent_3 += 1
        elif len(entry['title_absent_phrase']) == 4:
            count_title_absent_4 += 1
        elif len(entry['title_absent_phrase']) >= 5:
            count_title_absent_5 += 1
            

    avg_title_absent = total_title_absent / len(data_list) if data_list else 0
    avg_title_present = total_title_present / len(data_list) if data_list else 0

    total_avg = avg_title_absent + avg_title_present
    percent_title_present = (avg_title_present / total_avg) * 100 if total_avg else 0
    percent_title_absent = 100 - percent_title_present

    return {
        'avg_title_present': avg_title_present,
        'avg_title_absent': avg_title_absent,
        'percent_title_present': percent_title_present,
        'percent_title_absent': percent_title_absent,
        'count_title_absent_1': count_title_absent_1,
        'count_title_absent_2': count_title_absent_2,
        'count_title_absent_3': count_title_absent_3,
        'count_title_absent_4': count_title_absent_4,
        'count_title_absent_5': count_title_absent_5,
        'total_docs': len(data_list),
        'total_absent_docs': total_absent_docs,
        'absent_ratio': total_absent_docs / len(data_list) * 100,
        'absent2_ratio': (count_title_absent_2 + count_title_absent_3 + count_title_absent_4 + count_title_absent_5)/ len(data_list) * 100,
        'absent3_ratio': (count_title_absent_3 + count_title_absent_4 + count_title_absent_5) / len(data_list) * 100,
    }

In [6]:
kp20k_stat = aggregate_data(kp20k)
kp20k_stat

{'avg_title_present': 1.7642885205599879,
 'avg_title_absent': 1.2462297288361075,
 'percent_title_present': 58.60414634303914,
 'percent_title_absent': 41.39585365696086,
 'count_title_absent_1': 195884,
 'count_title_absent_2': 127783,
 'count_title_absent_3': 43086,
 'count_title_absent_4': 10671,
 'count_title_absent_5': 3252,
 'total_docs': 514154,
 'total_absent_docs': 380676,
 'absent_ratio': 74.03929561960035,
 'absent2_ratio': 35.94098266278197,
 'absent3_ratio': 11.087923073631636}

In [7]:
kptimes_stat = aggregate_data(kptimes)
kptimes_stat

{'avg_title_present': 1.330470947165122,
 'avg_title_absent': 1.2827144962161872,
 'percent_title_present': 50.91375931757719,
 'percent_title_absent': 49.08624068242281,
 'count_title_absent_1': 105937,
 'count_title_absent_2': 80820,
 'count_title_absent_3': 19193,
 'count_title_absent_4': 1926,
 'count_title_absent_5': 108,
 'total_docs': 259923,
 'total_absent_docs': 207984,
 'absent_ratio': 80.01754365716,
 'absent2_ratio': 39.260473294014,
 'absent3_ratio': 8.1666493538471}

In [8]:
stackexchange_stat = aggregate_data(stackexchange)
stackexchange_stat

{'avg_title_present': 1.5148361848376901,
 'avg_title_absent': 0.955640292341913,
 'percent_title_present': 61.31757168427237,
 'percent_title_absent': 38.68242831572763,
 'count_title_absent_1': 130333,
 'count_title_absent_2': 55508,
 'count_title_absent_3': 11637,
 'count_title_absent_4': 1911,
 'count_title_absent_5': 347,
 'total_docs': 298965,
 'total_absent_docs': 199736,
 'absent_ratio': 66.80915826267288,
 'absent2_ratio': 23.214423093004196,
 'absent3_ratio': 4.647701235930627}