In [2]:
!pip install tqdm

Collecting tqdm
  Using cached tqdm-4.66.2-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.66.2


### Make event and IOU-filtered corpus

In [4]:
import pickle as pkl
import os, sys
from tqdm.notebook import tqdm_notebook

def convert_segment_dict_to_events_dataset(segment_dir, out_path):
    """Export the query/candidate event dictionaries found in ``segment_dir`` as text files.

    Every regular file in ``segment_dir`` whose *file name* contains ``"query"``
    is unpickled and its ``'dict_query'`` entry is used as the query dictionary;
    otherwise, a file whose name contains ``"candidate"`` supplies
    ``'dict_candidate'``. For each key of a dictionary the associated strings
    are joined with single spaces and written to
    ``<out_path>/query/<key:010d>.txt`` or ``<out_path>/candidate/<key:010d>.txt``.

    Args:
        segment_dir: Directory holding the pickled segment dictionaries.
        out_path: Root of the output corpus; created if it does not exist.
            A trailing ``/`` is optional.

    Raises:
        AssertionError: If ``segment_dir`` is not an existing directory.
        FileNotFoundError: If no query file or no candidate file is found.

    Notes:
        Keys must be integers (they are formatted with ``:010d``) and values
        must be iterables of strings (they are passed to ``str.join``).
    """
    assert os.path.isdir(segment_dir), f'segment_dir is not an existing directory: {segment_dir!r}'

    query_events = None
    candidate_events = None
    # sorted(): if several files match, the last one in name order wins,
    # instead of whichever one the OS happens to list last.
    for name in sorted(os.listdir(segment_dir)):
        path = os.path.join(segment_dir, name)
        if not os.path.isfile(path):
            continue
        # Match on the file name only, so a directory called e.g. "query_runs"
        # does not make every file look like a query file.
        # NOTE(review): a file whose name contains both "IOU" and "query" would
        # also be picked up here - confirm the naming scheme of segment_dir.
        # SECURITY: pickle.load can execute arbitrary code; only use on trusted files.
        if "query" in name:
            with open(path, 'rb') as f:
                query_events = pkl.load(f)['dict_query']
        elif "candidate" in name:
            with open(path, 'rb') as f:
                candidate_events = pkl.load(f)['dict_candidate']

    missing = [
        label
        for label, loaded in (('query', query_events), ('candidate', candidate_events))
        if loaded is None
    ]
    if missing:
        raise FileNotFoundError(
            f'no {" / ".join(missing)} pickle found in {segment_dir!r}'
        )

    query_dir = os.path.join(out_path, 'query')
    candidate_dir = os.path.join(out_path, 'candidate')
    os.makedirs(query_dir, exist_ok=True)      # also creates out_path itself
    os.makedirs(candidate_dir, exist_ok=True)

    for target_dir, events_by_id in ((query_dir, query_events), (candidate_dir, candidate_events)):
        for doc_id, events in events_by_id.items():
            # Explicit encoding so the corpus does not depend on the platform locale.
            with open(os.path.join(target_dir, f'{doc_id:010d}.txt'), 'w', encoding='utf-8') as f:
                f.write(" ".join(events))

def convert_segment_dict_to_iouf_dataset(segment_dir, out_path):
    """Export the IOU dictionaries found in ``segment_dir`` as text files.

    A regular file in ``segment_dir`` whose *file name* contains ``"IOU"`` is
    unpickled; that single pickle provides both ``'dict_query'`` and
    ``'dict_candidate'``. For each key the associated strings are joined with
    single spaces and written to ``<out_path>/query/<key:010d>.txt`` or
    ``<out_path>/candidate/<key:010d>.txt``.

    Args:
        segment_dir: Directory holding the pickled IOU dictionary.
        out_path: Root of the output corpus; created if it does not exist.
            A trailing ``/`` is optional.

    Raises:
        AssertionError: If ``segment_dir`` is not an existing directory.
        FileNotFoundError: If no file with ``"IOU"`` in its name is found.
        KeyError: If the pickle lacks ``'dict_query'`` or ``'dict_candidate'``.

    Notes:
        Keys must be integers (they are formatted with ``:010d``) and values
        must be iterables of strings (they are passed to ``str.join``).
    """
    assert os.path.isdir(segment_dir), f'segment_dir is not an existing directory: {segment_dir!r}'

    query_events = None
    candidate_events = None
    # sorted(): if several files match, the last one in name order wins,
    # instead of whichever one the OS happens to list last.
    for name in sorted(os.listdir(segment_dir)):
        path = os.path.join(segment_dir, name)
        # Match on the file name only, so "IOU" appearing somewhere in the
        # directory path does not make every file look like the IOU pickle.
        if "IOU" not in name or not os.path.isfile(path):
            continue

        # One pickle holds both the query and the candidate corpus.
        # SECURITY: pickle.load can execute arbitrary code; only use on trusted files.
        with open(path, 'rb') as f:
            iou_dicts = pkl.load(f)
        query_events = iou_dicts['dict_query']
        candidate_events = iou_dicts['dict_candidate']

    if query_events is None or candidate_events is None:
        raise FileNotFoundError(f'no IOU pickle found in {segment_dir!r}')

    query_dir = os.path.join(out_path, 'query')
    candidate_dir = os.path.join(out_path, 'candidate')
    os.makedirs(query_dir, exist_ok=True)      # also creates out_path itself
    os.makedirs(candidate_dir, exist_ok=True)

    for target_dir, events_by_id in ((query_dir, query_events), (candidate_dir, candidate_events)):
        for doc_id, events in events_by_id.items():
            # Explicit encoding so the corpus does not depend on the platform locale.
            with open(os.path.join(target_dir, f'{doc_id:010d}.txt'), 'w', encoding='utf-8') as f:
                f.write(" ".join(events))

# Event corpora: each (segment directory, output directory) pair is converted in the order listed.
for _segment_dir, _corpus_dir in (
    # ILPCR, standard corpus (with citations)
    ('../segment_dictionaries/ilpcr_processed/ilpcr/test', './corpus/ik_test_events'),
    ('../segment_dictionaries/ilpcr_processed/ilpcr/train', './corpus/ik_train_events'),
    # ILPCR, without citations
    ('../segment_dictionaries/ilpcr_woc_processed/ilpcr_woc/test', './corpus/sentence_removed/ik_test_events'),
    ('../segment_dictionaries/ilpcr_woc_processed/ilpcr_woc/train', './corpus/sentence_removed/ik_train_events'),
    # COLIEE2021
    ('../segment_dictionaries/coliee21_processed/coliee21/test', './corpus/COLIEE2021_test_events'),
    ('../segment_dictionaries/coliee21_processed/coliee21/train', './corpus/COLIEE2021_train_events'),
):
    convert_segment_dict_to_events_dataset(_segment_dir, _corpus_dir)

# IOU-filtered corpora, same directory layout as above.
for _segment_dir, _corpus_dir in (
    # ILPCR (with and without citations)
    ('../segment_dictionaries/ilpcr_processed/ilpcr/test', './corpus/ik_test_iouf'),
    ('../segment_dictionaries/ilpcr_processed/ilpcr/train', './corpus/ik_train_iouf'),
    ('../segment_dictionaries/ilpcr_woc_processed/ilpcr_woc/test', './corpus/sentence_removed/ik_test_iouf'),
    ('../segment_dictionaries/ilpcr_woc_processed/ilpcr_woc/train', './corpus/sentence_removed/ik_train_iouf'),
    # COLIEE2021
    ('../segment_dictionaries/coliee21_processed/coliee21/test', './corpus/COLIEE2021_test_iou_filtered/'),
    ('../segment_dictionaries/coliee21_processed/coliee21/train', './corpus/COLIEE2021_train_iou_filtered/'),
):
    convert_segment_dict_to_iouf_dataset(_segment_dir, _corpus_dir)

AssertionError: 

### Make atomic events corpus

In [None]:
def convert_segment_dict_to_atomic_events_dataset(segment_dir, out_path):
    """Export query/candidate events as text files of integer event ids.

    The query and candidate dictionaries are loaded from the pickles in
    ``segment_dir`` (file name containing ``"query"`` -> ``'dict_query'``,
    otherwise file name containing ``"candidate"`` -> ``'dict_candidate'``).
    Every distinct event is given an integer id in order of first appearance,
    scanning all queries first and then all candidates. Each document is then
    written to ``<out_path>/query/<key:010d>.txt`` or
    ``<out_path>/candidate/<key:010d>.txt`` as its event ids joined by ``". "``.

    Args:
        segment_dir: Directory holding the pickled segment dictionaries.
        out_path: Root of the output corpus; created if it does not exist.
            A trailing ``/`` is optional.

    Raises:
        AssertionError: If ``segment_dir`` is not an existing directory.
        FileNotFoundError: If no query file or no candidate file is found.

    Notes:
        Keys must be integers (they are formatted with ``:010d``) and events
        must be hashable (they are used as dictionary keys).
    """
    assert os.path.isdir(segment_dir), f'segment_dir is not an existing directory: {segment_dir!r}'

    query_events = None
    candidate_events = None
    # sorted(): if several files match, the last one in name order wins,
    # instead of whichever one the OS happens to list last.
    for name in sorted(os.listdir(segment_dir)):
        path = os.path.join(segment_dir, name)
        if not os.path.isfile(path):
            continue
        # Match on the file name only, so a directory called e.g. "query_runs"
        # does not make every file look like a query file.
        # SECURITY: pickle.load can execute arbitrary code; only use on trusted files.
        if "query" in name:
            with open(path, 'rb') as f:
                query_events = pkl.load(f)['dict_query']
        elif "candidate" in name:
            with open(path, 'rb') as f:
                candidate_events = pkl.load(f)['dict_candidate']

    missing = [
        label
        for label, loaded in (('query', query_events), ('candidate', candidate_events))
        if loaded is None
    ]
    if missing:
        raise FileNotFoundError(
            f'no {" / ".join(missing)} pickle found in {segment_dir!r}'
        )

    # event -> id; the next free id is always the current size of the mapping.
    token_dict = {}
    for events_by_id in (query_events, candidate_events):
        for events in events_by_id.values():
            for event in events:
                token_dict.setdefault(event, len(token_dict))

    query_dir = os.path.join(out_path, 'query')
    candidate_dir = os.path.join(out_path, 'candidate')
    os.makedirs(query_dir, exist_ok=True)      # also creates out_path itself
    os.makedirs(candidate_dir, exist_ok=True)

    for target_dir, events_by_id in ((query_dir, query_events), (candidate_dir, candidate_events)):
        for doc_id, events in events_by_id.items():
            text = ". ".join(str(token_dict[event]) for event in events)
            # Explicit encoding so the corpus does not depend on the platform locale.
            with open(os.path.join(target_dir, f'{doc_id:010d}.txt'), 'w', encoding='utf-8') as f:
                f.write(text)

# Atomic event corpora: each (segment directory, output directory) pair is converted in the order listed.
for _segment_dir, _corpus_dir in (
    # ILPCR (with and without citations)
    ('../segment_dictionaries/ilpcr_processed/ilpcr/test', './corpus/ik_test_atomic'),
    ('../segment_dictionaries/ilpcr_processed/ilpcr/train', './corpus/ik_train_atomic'),
    ('../segment_dictionaries/ilpcr_woc_processed/ilpcr_woc/test', './corpus/sentence_removed/ik_test_atomic'),
    ('../segment_dictionaries/ilpcr_woc_processed/ilpcr_woc/train', './corpus/sentence_removed/ik_train_atomic'),
    # COLIEE2021
    ('../segment_dictionaries/coliee21_processed/coliee21/test', './corpus/COLIEE2021_test_atomic'),
    ('../segment_dictionaries/coliee21_processed/coliee21/train', './corpus/COLIEE2021_train_atomic'),
):
    convert_segment_dict_to_atomic_events_dataset(_segment_dir, _corpus_dir)