In [None]:
import json
from tqdm import tqdm

Filter data to only include the test set.

In [None]:
# Read the test-set utterance ids: one id per line, empty lines are dropped.
with open('./data/fisher.txt', 'r') as ids_file:
    stripped_lines = (line.rstrip('\n') for line in ids_file)
    fids = [fid for fid in stripped_lines if fid]

# Parse the processed-data JSON into `res`.
with open('./data/processed_data.json', 'r') as processed_file:
    res = json.load(processed_file)


In [None]:
# Keep only the utterances whose id is listed in the test-id list `fids`.
# The ids are put into a set first so that each membership check is O(1) on
# average instead of a linear scan of the whole list for every utterance.
# NOTE(review): assumes every 'utterance_id' value is hashable (e.g. a string).
test_ids = set(fids)
res2 = {'utterances': [x for x in res['utterances'] if x['utterance_id'] in test_ids]}

# Persist the filtered subset; this file is the input of the data-prep script.
with open('./data/processed_data_test.json', 'w') as file:
    json.dump(res2, file)


Next, we split the data into prompts and completions using the `train_data_prep.py` script. The script takes the filtered test data as input and writes prompts and completions in the format required by the model. It also adds speaker information to the prompts and completions, taken from the `hyp_spk` and `hyp_spk_oracle` fields of the input data, respectively.

```shell
python3 train_data_prep.py \
--input="./data/processed_data_test.json" \
--output="./data/prompts_test.jsonl" \
--output_type=jsonl \
--emit_input_length=2500 \
--emit_target_length=2500 \
--prompt_suffix="" \
--completion_suffix="" \
--input_feature_key="prompt" \
--output_feature_key="completion" \
--text_field="hyp_text" \
--input_speaker_field="hyp_spk" \
--target_speaker_field="hyp_spk_oracle" \
--speaker_prefix="<spk:"
```



Next, we re-insert the prompts and completions into the input data, attaching each segment's prompt and completion to the utterance it came from, so that the model can be evaluated.

In [None]:
# Load the prompt/completion records (JSON Lines: one JSON object per line).
with open('./data/prompts_test.jsonl', 'r') as file:
    # Skip whitespace-only lines (e.g. a trailing empty line at the end of the
    # file): json.loads raises JSONDecodeError on empty input.
    data = [json.loads(line) for line in file if line.strip()]

# Reload the filtered test data; note that this rebinds `res`.
with open('./data/processed_data_test.json', 'r') as file:
    res = json.load(file)


In [None]:
# Attach the prompt/completion records in `data` to their parent utterances.
#
# `data` is consumed with a single forward cursor `j`: for each utterance we
# take the consecutive records whose id (the part before '_seg') equals the
# utterance id. This relies on `data` being in the same order as
# res['utterances'].
j = 0
for utt in tqdm(res['utterances']):
    utt['completions_ref'] = []
    utt['completions_llm'] = []  # left empty in this cell
    utt['prompts'] = []

    # Bounding the loop by len(data) prevents an IndexError on data[j] when
    # the records run out before the last utterance (or when `data` is empty).
    while j < len(data):
        parent_id = data[j]['utterance_id'].split('_seg')[0]
        if parent_id != utt['utterance_id']:
            # First record of a different utterance: leave it for a later one.
            break

        utt['completions_ref'].append(data[j]['completion'])
        utt['prompts'].append(data[j]['prompt'])
        j += 1

# If the cursor did not reach the end, data[j] matched no utterance at its
# position and every record from there on was never attached.
if j < len(data):
    print(
        f"WARNING: {len(data) - j} of {len(data)} prompt records were not attached; "
        f"first unmatched id: {data[j]['utterance_id']!r}"
    )


In [None]:
# Serialize `res` to JSON on disk.
with open('./data/full_test.json', 'w') as out_file:
    json.dump(res, out_file)
