In [182]:
import json
import os
import warnings

import pandas as pd

# Root of the data directory, given relative to the current working directory.
DATA_PATH = os.path.join('..', 'data')
# Lower-case alias bound to the very same string as DATA_PATH.
data_path = DATA_PATH


def load_labels(split):
    """Load the label file for one split and return it keyed by integer.

    Reads ``new_dialogsum_labels_{split}.json`` from the current working
    directory and converts every top-level key of the decoded object to
    ``int``. The values are returned exactly as stored in the file.

    Args:
        split: Text interpolated into the file name (the notebook calls this
            with ``'test'``, ``'val'`` and ``'train'``).

    Returns:
        dict: ``{int(key): value}`` for every top-level entry of the file.

    Raises:
        FileNotFoundError: If the label file does not exist.
        ValueError: If the file is not valid JSON or a top-level key cannot
            be parsed as an integer.
    """
    # Pin the encoding: without it the file is decoded with the platform's
    # locale default, which corrupts non-ASCII text on non-UTF-8 systems.
    with open(f'new_dialogsum_labels_{split}.json', 'r', encoding='utf-8') as json_file:
        raw_labels = json.load(json_file)

    # JSON object keys are always strings; callers look labels up by integer.
    return {int(key): value for key, value in raw_labels.items()}

def prepare_dataframe():
    """Read the three DialogSum JSON-lines files into one combined frame.

    The files are looked up under ``DATA_PATH/dialogsum/DialogSum_Data``.
    Each frame gets a ``split`` column (``'test'``, ``'val'`` for the dev
    file, ``'train'``); the test frame additionally has its ``summary1``
    column renamed to ``summary``. The frames are concatenated in
    train, dev, test order and the index is reset, so the previous
    per-file row numbers end up in a column named ``index``.

    Returns:
        pandas.DataFrame: the combined frame with a fresh 0..n-1 index.
    """
    source_dir = os.path.join(DATA_PATH, 'dialogsum', 'DialogSum_Data')

    # (split tag, file name), listed in the order the files are read.
    sources = (
        ('test', 'dialogsum.test.jsonl'),
        ('val', 'dialogsum.dev.jsonl'),
        ('train', 'dialogsum.train.jsonl'),
    )

    frames = {}
    for split_tag, file_name in sources:
        frame = pd.read_json(os.path.join(source_dir, file_name), lines=True)
        if split_tag == 'test':
            # Only the test file is renamed, so its first summary lines up
            # with the `summary` column of the other two files.
            frame = frame.rename(columns={"summary1": "summary"})
        frame['split'] = split_tag
        frames[split_tag] = frame

    combined = pd.concat([frames['train'], frames['val'], frames['test']])
    return combined.reset_index()


def create_dataset(split, dialogsum_df):
    """Build the two-speaker dataset of one split with per-speaker summaries.

    Steps:
      1. Load the label dict for ``split`` via ``load_labels``.
      2. Keep the rows of ``dialogsum_df`` whose ``split`` column equals
         ``split`` and whose ``dialogue`` text does not contain ``#Person3#``.
      3. Add ``summary_p1`` / ``summary_p2``: the space-joined ``'Person1'`` /
         ``'Person2'`` entries of the label whose key equals the row's index
         label. Rows without a label hold a missing value.
      4. Renumber the rows from 0 and rename the ``index`` column to ``id``.
      5. For ``'test'``: rename ``summary`` to ``summary1`` and drop ``topic``.
         For any other split: drop ``topic1``, ``topic2``, ``topic3``,
         ``summary2`` and ``summary3``.

    Args:
        split: Split name; selects both the label file and the rows.
        dialogsum_df: Combined frame, presumably the output of
            ``prepare_dataframe`` (TODO confirm the label keys refer to its
            index labels). It is not modified.

    Returns:
        pandas.DataFrame: a new frame for the requested split.

    Raises:
        KeyError: If a label lacks ``'Person1'``/``'Person2'`` or a column
            that has to be dropped is absent.

    Warns:
        UserWarning: If some label keys match no selected row. Those labels
            are skipped; assigning them through ``.loc`` would silently
            append all-NaN rows to the dataset.
    """
    labels = load_labels(split)

    # na=True counts a missing dialogue as "has a third speaker", so such rows
    # are excluded exactly as the former `== False` comparison excluded them.
    has_third_speaker = dialogsum_df['dialogue'].str.contains('#Person3#', regex=False, na=True)
    in_split = dialogsum_df['split'] == split
    two_person_df = dialogsum_df[~has_third_speaker & in_split].copy()

    matched_keys = [key for key in labels if key in two_person_df.index]
    unmatched_keys = [key for key in labels if key not in two_person_df.index]
    if unmatched_keys:
        warnings.warn(
            f'{len(unmatched_keys)} label key(s) for split {split!r} match no '
            f'two-person row and were skipped: {unmatched_keys[:10]}',
            stacklevel=2,
        )

    # Assigning a Series aligns on the index, so every row receives its own
    # label and unlabeled rows become NaN. Both columns exist even when no
    # label matched, which keeps the output schema stable.
    for column, speaker in (('summary_p1', 'Person1'), ('summary_p2', 'Person2')):
        two_person_df[column] = pd.Series(
            {key: ' '.join(labels[key][speaker]) for key in matched_keys},
            dtype=object,
        )

    two_person_df = two_person_df.reset_index(drop=True).rename(columns={'index': 'id'})

    if split == 'test':
        # Test rows carry numbered summaries/topics; the unnumbered `topic`
        # column only comes from the other files.
        return two_person_df.rename(columns={"summary": "summary1"}).drop(columns='topic')

    # The numbered columns only come from the test file.
    return two_person_df.drop(columns=['topic1', 'topic2', 'summary2', 'topic3', 'summary3'])


def write_df_to_json(df):
    """Write a split frame to ``DATA_PATH/new_dialogsum/<split>.json``.

    The split name is read from the ``split`` column of the first row. The
    file holds a JSON array with one object per row, keyed by column name;
    missing values are written as ``null``.

    Args:
        df: Non-empty frame with a ``split`` column.

    Raises:
        KeyError: If ``df`` has no ``split`` column.
        IndexError: If ``df`` has no rows, so no split name can be read.
    """
    split = df['split'].iloc[0]

    dest_dir = os.path.join(DATA_PATH, 'new_dialogsum/')
    # makedirs(exist_ok=True) replaces the racy exists()+mkdir() pair and also
    # creates missing parent directories.
    os.makedirs(dest_dir, exist_ok=True)

    dest_path = os.path.join(dest_dir, f'{split}.json')
    # Serialise the whole frame once instead of once per row. The round trip
    # through json.loads/json.dump keeps the on-disk formatting that
    # json.dump produces.
    records = json.loads(df.to_json(orient='records'))
    with open(dest_path, 'w', encoding='utf-8') as json_file:
        json.dump(records, json_file)

In [183]:
# Load the three DialogSum files once into a single combined frame; the
# per-split datasets in the following cell are all sliced from it.
dialogsum_df = prepare_dataframe()

In [184]:
# Derive the two-speaker dataset of each split from the shared combined frame.
two_person_test_df = create_dataset(split='test', dialogsum_df=dialogsum_df)
two_person_val_df = create_dataset(split='val', dialogsum_df=dialogsum_df)
two_person_train_df = create_dataset(split='train', dialogsum_df=dialogsum_df)

In [190]:
# Persist each split; the output file name is taken from the frame's own
# `split` column inside write_df_to_json.
write_df_to_json(df=two_person_test_df)
write_df_to_json(df=two_person_val_df)
write_df_to_json(df=two_person_train_df)