In [1]:
import os
import json
import numpy as np
import pandas as pd
import tiktoken

# Root directory (relative to the working directory) that the input .npy
# files are read from and the generated JSON/CSV files are written to.
base_dir = './'


In [2]:
def read_data(path, allow_pickle=True):
    """Load a NumPy ``.npy`` file and return its contents.

    Args:
        path: Location of the file; passed straight to ``np.load``.
        allow_pickle: Forwarded unchanged to ``np.load``. Defaults to ``True``.

    Returns:
        Whatever ``np.load`` returns for ``path``.
    """
    # NOTE(review): with allow_pickle=True NumPy will unpickle object arrays,
    # which can run arbitrary code -- only point this at trusted files.
    return np.load(path, allow_pickle=allow_pickle)

In [4]:
# Paths of the training and validation question files (despite the "_dir"
# suffix these are file paths). os.path.join keeps them valid even when
# base_dir is given without a trailing separator.
train_data_dir = os.path.join(base_dir, "SP-train.npy")
val_data_dir = os.path.join(base_dir, "SP_val_question_random.npy")

train_data = read_data(train_data_dir)
val_data = read_data(val_data_dir)

# Number of records in each split. The n_*_select values are not used by the
# cells shown here; presumably the number of samples to pick -- TODO confirm.
n_train, n_train_select = train_data.shape[0], 3
n_val, n_val_select = val_data.shape[0], 3

## Select a static sample of the data

In [34]:


# Write a small fixed sample -- the first, middle and last record of the
# training and validation data -- to sdata.json. The file is generated only
# when it does not exist yet; delete it to force regeneration.
target_file = os.path.join(base_dir, "sdata.json")

if not os.path.exists(target_file):
    data = [train_data[0], train_data[n_train // 2], train_data[n_train - 1],
            val_data[0], val_data[n_val // 2], val_data[n_val - 1]]
    # Every value is written as a string (as the previous hand-built output
    # did), so any value type can be serialised.
    records = [{str(k): str(v) for k, v in d.items()} for d in data]
    # Serialise with the json module so quotes, backslashes and newlines are
    # escaped and no trailing comma is emitted. The payload is built *before*
    # the file is opened: a failure must not leave an empty file behind that
    # the exists() guard above would then never replace.
    payload = json.dumps(records, indent='\t', ensure_ascii=False)
    with open(target_file, 'w', encoding='utf-8') as f:
        f.write(payload)

## Convert the training data to a CSV file

In [16]:
csv_train_file = os.path.join(base_dir, "train_data.csv")
# Only these fields are kept. They are listed explicitly instead of taking
# train_data[0].keys(), so any other key present in a record is left out.
columns = ['id', 'question', 'answer', 'distractor1', 'distractor2', 'label']
# Build the frame in a single call: appending one row at a time with
# .loc[len(df)] re-allocates the frame for every record (quadratic cost).
# Keys missing from a record become NaN.
train_df = pd.DataFrame(list(train_data), columns=columns)

In [20]:
enc = tiktoken.encoding_for_model("gpt-3.5-turbo-1106")
# Maps each new token-count column to the text column it is computed from.
extra_columns = {'q_token_count': 'question', 'a_token_count': 'answer', 'd1_token_count': 'distractor1', 'd2_token_count': 'distractor2'}


def _token_count(text):
    """Return the number of tokens ``enc`` produces for ``text``.

    Non-string cells (e.g. missing values) yield NaN instead of a count.
    """
    return len(enc.encode(text)) if isinstance(text, str) else np.nan


for k, v in extra_columns.items():
    # One pass per column instead of a .loc lookup/assignment per cell. The
    # column is always created (all-NaN if the text column holds no strings)
    # and kept as float so NaN can mark rows without a count.
    train_df[k] = train_df[v].map(_token_count).astype(float)

In [36]:
# Sanity check before persisting: re-encode every string cell and confirm the
# stored token count matches. Non-string cells are skipped.
for row_idx in range(len(train_df)):
    for count_col, text_col in extra_columns.items():
        text = train_df.loc[row_idx, text_col]
        if not isinstance(text, str):
            continue
        expected = len(enc.encode(text))
        assert train_df.loc[row_idx, count_col] == expected

# Save the frame (index included) and preview the first rows.
train_df.to_csv(csv_train_file)
train_df.head()

Unnamed: 0,id,question,answer,distractor1,distractor2,label,q_token_count,a_token_count,d1_token_count,d2_token_count
0,SP-0,Mr. and Mrs. Mustard have six daughters and ea...,Each daughter shares the same brother.,Some daughters get married and have their own ...,Some brothers were not loved by family and mov...,1,33.0,7.0,10.0,11.0
1,SP-0_SR,The six daughters of Mr. and Mrs. Mustard each...,Each daughter shares the same brother.,Some daughters get married and have their own ...,Some brothers were not loved by family and mov...,2,31.0,7.0,10.0,11.0
2,SP-0_CR,"A chess team has five players, and each player...",Each player shares the same coach.,Some coaches get a raise.,Some players are backups and not allowed to play.,0,29.0,7.0,6.0,10.0
3,SP-1,A woman shoots her husband. Then she holds him...,The woman was a photographer. She shot a pictu...,The woman gets arrested for murder after dinner.,The woman gets a new partner.,2,45.0,24.0,9.0,7.0
4,SP-1_SR,An individual shoots their spouse. She continu...,The woman was a photographer. She shot a pictu...,The woman gets arrested for murder after dinner.,The woman gets a new partner.,1,43.0,24.0,9.0,7.0


In [43]:
# Print the token total of each text column, then the grand total.
total_token_count = 0
for count_col, text_col in extra_columns.items():
    column_total = train_df[count_col].sum()
    total_token_count += column_total
    print(f'sum {text_col} token count: {str(column_total)}')
print('----------------------------')
print('the number of the total tokens: ' + str(total_token_count))

sum question token count: 21268.0
sum answer token count: 5702.0
sum distractor1 token count: 5403.0
sum distractor2 token count: 5446.0
----------------------------
the number of the total tokens: 37819.0
