### 0. Load Libraries & Data

In [None]:
# Library
# NOTE: statement order matters in this cell. The working directory is changed
# before the remaining imports, and every relative path used later in this
# notebook ("../data/...") is resolved from the new working directory.
import os
import torch
# Move into the sibling 'maverick-coref-main' directory of the current working
# directory's parent (presumably so `from maverick import Maverick` below
# resolves from that checkout - TODO confirm).
os.chdir(os.path.join(os.getcwd(), '..', 'maverick-coref-main'))
# Expose only GPU index 0 to CUDA for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Use the GPU when torch reports CUDA as available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
import jsonlines
import pandas as pd
from tqdm import tqdm
import time
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')
import ast
from maverick import Maverick

# Configuration
# Output folders, relative to the working directory set in the import cell.
DATAFRAME_FOLDER_PATH = '../data/Dataframe/'
LITBANK_CASE_FOLDER_PATH = "../data/Litbank_Case/"

# Create the folders up front so later reads/writes do not fail on a fresh
# checkout. os.makedirs returns None, so its result is deliberately not bound
# to the path constants.
os.makedirs(DATAFRAME_FOLDER_PATH, exist_ok=True)
os.makedirs(LITBANK_CASE_FOLDER_PATH, exist_ok=True)

# Data
# Each training file is JSON Lines: one record per line. Materialise every
# record of each file into a plain Python list.
with jsonlines.open("../data/litbank/train.english.jsonlines") as read_file:
    LitBank_train = list(read_file.iter())

with jsonlines.open("../data/preco/train.jsonl") as read_file:
    PreCo_train = list(read_file.iter())

### Pre-process 

In [8]:
# LitBank train frame: one row per document with its key, tokenised sentences
# and gold coreference clusters.
#
# A CSV copy is preferred when it exists. CSV stores the list columns as their
# repr strings, so they are parsed back into Python lists after loading.
# When the CSV is absent the frame is built directly from the records already
# loaded in memory, instead of crashing with FileNotFoundError.
# To (re)create the CSV copy:
#   df_LBtrain.to_csv(DATAFRAME_FOLDER_PATH + 'df_LitBank_train.csv', index=False)
_lb_train_csv = DATAFRAME_FOLDER_PATH + 'df_LitBank_train.csv'
if os.path.exists(_lb_train_csv):
    df_LBtrain = pd.read_csv(_lb_train_csv)
    for _list_column in ('sentences', 'clusters'):
        df_LBtrain[_list_column] = df_LBtrain[_list_column].apply(ast.literal_eval)
else:
    df_LBtrain = pd.DataFrame(LitBank_train)[['doc_key', 'sentences', 'clusters']]
df_LBtrain.head(3)

Unnamed: 0,doc_key,sentences,clusters
0,217_sons_and_lovers_brat_0,"[[PART, ONE, CHAPTER, I, THE, EARLY, MARRIED, ...","[[[9, 10], [900, 900], [905, 905]], [[744, 744..."
1,805_this_side_of_paradise_brat_0,"[[BOOK, ONE, --, The, Romantic, Egotist, CHAPT...","[[[3, 5]], [[460, 460], [653, 653], [1906, 190..."
2,1695_the_man_who_was_thursday_a_nightmare_brat_0,"[[CHAPTER, I, .], [THE, TWO, POETS, OF, SAFFRO...","[[[3, 5]], [[182, 183], [406, 406], [103, 103]..."


In [32]:
def calculate_new_offsets(ontonotes_sentence, coreference_offsets):
    """
    Extract the sentences that contain the given mentions and re-base the offsets.

    Only sentences holding the start token of at least one mention are kept.
    They are returned in order of first appearance while walking
    ``coreference_offsets`` (not necessarily document order). Every offset is
    shifted so that it indexes into the concatenation of the returned
    sentences instead of the whole document.

    Sentences are tracked by their position in the document, so a mention
    that returns to an already-extracted sentence is re-based on that
    sentence's own start, and two distinct sentences with identical tokens
    are kept apart.

    Args:
        ontonotes_sentence (list of lists): all sentences of the document,
            each sentence being a list of tokens.
        coreference_offsets (list of lists): [start, end] document-level
            token indices (inclusive) of each mention.

    Returns:
        tuple: (list of extracted sentences, list of adjusted [start, end]
        offsets). A mention whose start lies outside the document is skipped,
        so the offset list can be shorter than the input.
    """
    from bisect import bisect_right
    from itertools import accumulate

    # sentence_starts[i] is the document-level index of the first token of
    # sentence i; the final entry is the total token count.
    sentence_starts = [0, *accumulate(len(sentence) for sentence in ontonotes_sentence)]
    total_tokens = sentence_starts[-1]

    unique_sentences = []
    adjusted_offsets = []
    # Original sentence index -> start of that sentence in the extracted text.
    extracted_start_of = {}
    extracted_length = 0

    for offset in coreference_offsets:
        mention_start, mention_end = offset[0], offset[1]
        if not 0 <= mention_start < total_tokens:
            continue  # start token is not inside any sentence

        # Last sentence whose start is <= mention_start; empty sentences are
        # skipped naturally because they share a start with their successor.
        sentence_idx = bisect_right(sentence_starts, mention_start) - 1

        if sentence_idx not in extracted_start_of:
            sentence = ontonotes_sentence[sentence_idx]
            extracted_start_of[sentence_idx] = extracted_length
            unique_sentences.append(sentence)
            extracted_length += len(sentence)

        # Same shift for both ends keeps the mention length unchanged.
        shift = extracted_start_of[sentence_idx] - sentence_starts[sentence_idx]
        adjusted_offsets.append([mention_start + shift, mention_end + shift])

    return unique_sentences, adjusted_offsets

def find_coreference_terms(ontonotes_format, clusters_token_offsets):
    """
    Look up the actual tokens of every mention in each coreference cluster.

    Args:
        ontonotes_format (list of list): all sentences of the document, each
            sentence being a list of tokens.
        clusters_token_offsets (list of lists): for each cluster, the
            [start, end] document-level token indices (inclusive) of its
            mentions.

    Returns:
        list: one entry per cluster that does not have exactly one mention,
        each entry being the list of token lists of its mentions. A mention
        is read from the sentence containing its start token, so tokens past
        that sentence's end are not included. A mention whose start lies
        outside the document yields an empty token list.
    """
    from bisect import bisect_right
    from itertools import accumulate

    # sentence_starts[i] is the document-level index of the first token of
    # sentence i; the final entry is the total token count.
    sentence_starts = [0, *accumulate(len(sentence) for sentence in ontonotes_format)]
    total_tokens = sentence_starts[-1]

    result = []
    for cluster in clusters_token_offsets:
        if len(cluster) == 1:
            continue  # a single mention is not a coreference relation

        cluster_terms = []
        for start, end in cluster:
            if not 0 <= start < total_tokens:
                # Keep the slot so mentions stay aligned with the cluster,
                # rather than reading unrelated tokens from sentence 0.
                cluster_terms.append([])
                continue
            sentence_idx = bisect_right(sentence_starts, start) - 1
            base = sentence_starts[sentence_idx]
            cluster_terms.append(ontonotes_format[sentence_idx][start - base:end - base + 1])
        result.append(cluster_terms)

    return result

def sort_by_first_value(data):
    """
    Return a new list with the items of ``data`` ordered by their first element.

    Example: [[5, 6], [1, 2]] -> [[1, 2], [5, 6]]
    Items sharing the same first element keep their relative order.
    """
    ordered = list(data)
    ordered.sort(key=lambda item: item[0])
    return ordered

In [33]:
# Build and save one "case" table per LitBank train document: one row per
# coreference cluster with its original offsets, the sentences it touches,
# the re-based offsets and the mention tokens.
for index, (sentences, clusters) in enumerate(zip(df_LBtrain['sentences'], df_LBtrain['clusters'])):
    # Clusters with exactly one mention carry no coreference link; drop them.
    coref_list = [cluster for cluster in clusters if len(cluster) != 1]

    # Extract the sentences each cluster touches and re-base its offsets.
    extracted_sentence_list, adjusted_offsets_list = [], []
    for cluster in coref_list:
        extracted_sentence, adjusted_offsets = calculate_new_offsets(sentences, cluster)
        extracted_sentence_list.append(extracted_sentence)
        # Store the re-based offsets ordered by start position.
        adjusted_offsets_list.append(sort_by_first_value(adjusted_offsets))

    litbank_case = pd.DataFrame({
        'coref': coref_list,
        'extracted_sentence': extracted_sentence_list,
        'adjusted_offsets': adjusted_offsets_list,
        # Mention tokens, looked up with the original document-level offsets.
        'text': find_coreference_terms(sentences, coref_list),
    })

    # One CSV per document, named after the document's position in the frame.
    litbank_case.to_csv(f'{LITBANK_CASE_FOLDER_PATH}litbank_case_{index}.csv', index=False)


In [35]:
# Spot-check the exported table of the first document. The file is read from
# the same folder constant the export loop writes to. List-valued columns come
# back as their string representation because they are not parsed here.
litbank_case_0 = pd.read_csv(f'{LITBANK_CASE_FOLDER_PATH}litbank_case_0.csv')
litbank_case_0.head(3)

Unnamed: 0,coref,extracted_sentence,adjusted_offsets,text
0,"[[9, 10], [900, 900], [905, 905]]","[['PART', 'ONE', 'CHAPTER', 'I', 'THE', 'EARLY...","[[9, 10], [22, 22], [27, 27]]","[['THE', 'MORELS'], ['They'], ['their']]"
1,"[[744, 744], [725, 733], [874, 875], [680, 681...","[['Mrs.', 'Morel', 'was', 'not', 'anxious', 't...","[[27, 27], [8, 16], [52, 53], [64, 65], [113, ...","[['it'], ['the', 'Bottoms', ',', 'which', 'was..."
2,"[[450, 454], [18, 19], [22, 23], [247, 250], [...","[['To', 'accommodate', 'the', 'regiments', 'of...","[[35, 39], [64, 65], [68, 69], [90, 93], [116,...","[['the', 'site', 'of', 'Hell', 'Row'], ['Hell'..."
