# Data Preparation

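Basic data preparation routines for text generation: per-source preprocessing (MathQA, StackExchange, MATH, Interactions, Reward) and assembly of the combined train/test/val datasets.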

In [2]:
import json
import os
import re
import pprint
import random
import uuid
import jsonlines

import numpy as np
import pandas as pd

random.seed(42)

In [3]:
def get_pair(entry):
    """Extract the first question/answer exchange from a chat transcript.

    Only the text that follows the first "Human:" marker (up to the next
    "Human:" marker, if there is one) is considered; later turns are ignored.

    Args:
        entry: Transcript string containing "Human:" and "Assistant:" markers.

    Returns:
        A ``(question, answer)`` tuple of raw, unstripped strings: the text
        before the first "Assistant:" marker of that turn, and the text
        between that marker and the next "Assistant:" marker (or the end of
        the turn).

    Raises:
        IndexError: If either marker is missing from the transcript.
    """
    # A bounded split is enough: only the piece at index 1 is ever used.
    first_turn = entry.split("Human:", 2)[1]
    pieces = first_turn.split("Assistant:", 2)
    return pieces[0], pieces[1]


def split_train_test_val(data, train_ratio: float, val_ratio: float):
    """Randomly partition ``data`` into train, validation and test lists.

    A shallow copy of ``data`` is shuffled with the module-level ``random``
    generator, so the caller's sequence keeps its original order. For a given
    seed the returned splits are the same as if ``data`` itself had been
    shuffled.

    Args:
        data: Sequence of samples to split.
        train_ratio: Fraction of the samples assigned to the train split.
        val_ratio: Fraction of the samples assigned to the validation split;
            whatever remains after train and validation becomes the test split.

    Returns:
        A ``(train, val, test)`` tuple of lists.
    """
    # Shuffle a copy: random.shuffle works in place and would otherwise
    # silently reorder the list owned by the caller.
    shuffled = list(data)
    random.shuffle(shuffled)

    total = len(shuffled)
    train_cutoff = int(total * train_ratio)
    val_cutoff = int(total * (train_ratio + val_ratio))
    train, val, test = (
        shuffled[:train_cutoff],
        shuffled[train_cutoff:val_cutoff],
        shuffled[val_cutoff:],
    )

    assert len(train) + len(val) + len(test) == len(
        data
    ), "Something went wrong with the split"

    return train, val, test


def split_train_test(data, train_ratio: float):
    """Randomly partition ``data`` into train and test lists.

    A shallow copy of ``data`` is shuffled with the module-level ``random``
    generator, so the caller's sequence keeps its original order. For a given
    seed the returned splits are the same as if ``data`` itself had been
    shuffled.

    Args:
        data: Sequence of samples to split.
        train_ratio: Fraction of the samples assigned to the train split; the
            remainder becomes the test split.

    Returns:
        A ``(train, test)`` tuple of lists.
    """
    # Shuffle a copy: random.shuffle works in place and would otherwise
    # silently reorder the list owned by the caller.
    shuffled = list(data)
    random.shuffle(shuffled)

    train_cutoff = int(len(shuffled) * train_ratio)
    train, test = shuffled[:train_cutoff], shuffled[train_cutoff:]

    assert len(train) + len(test) == len(data), "Something went wrong with the split"
    return train, test


# Smoke check: only the first Human/Assistant exchange should come back.
demo_transcript = "Human: TEST TEST \n\n Assistant: F F F F \n\n Human: TEST TEST \n\n Assistant: A A A A"
get_pair(demo_transcript)

(' TEST TEST \n\n ', ' F F F F \n\n ')

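## Data Preparation: Combine Sources

Merges the per-source train/test/val files into the final datasets. The MATH, StackExchange and Interactions files read here are written by the cells in the *Data Preprocessing* section below, so run those cells first.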

In [4]:
test = []
train = []
val = []
small_test = []

RLHF_RATIO = 0.5
# Number of examples drawn from each source's test split for the small test set
SMALL_TEST_PER_SOURCE = 20
SECTION_BREAK = "\n------------------------------\n"

# Insertion order (train, test, val) drives the order of loading and saving.
split_buckets = {"train": train, "test": test, "val": val}


def _read_json(path):
    """Load one JSON file, closing the handle as soon as it has been parsed."""
    with open(path, "r") as handle:
        return json.load(handle)


def _write_json(payload, path):
    """Dump ``payload`` to ``path`` as indented JSON and close the file."""
    with open(path, "w") as handle:
        json.dump(payload, handle, indent=4)


def _add_source(label, path_template, sample_for_small_test=True):
    """Append every split of one source to the global train/test/val buckets.

    ``path_template`` must contain a ``{split}`` placeholder. When
    ``sample_for_small_test`` is true, up to SMALL_TEST_PER_SOURCE examples of
    the source's test split are also added to ``small_test``.
    """
    for split_name, bucket in split_buckets.items():
        contents = _read_json(path_template.format(split=split_name))
        bucket.extend(contents)
        print(f"{label} Size of {split_name}: {len(contents)}")
        if sample_for_small_test and split_name == "test":
            # Cap the sample size so a small test split cannot raise ValueError.
            sample_size = min(SMALL_TEST_PER_SOURCE, len(contents))
            small_test.extend(random.sample(contents, sample_size))


# Add MATH Data
_add_source("MATH", "../../data/MATH/MATH.{split}.json")

print(SECTION_BREAK)

# Add MathQA Data
_add_source("MathQA", "../../data/MathQA/mathqa.{split}.json")

print(SECTION_BREAK)

# StackExchange is currently left out of the mix. To include it again:
# _add_source(
#     "StackExchange",
#     "../../data/StackExchange/stack.{split}.json",
#     sample_for_small_test=False,
# )

print(SECTION_BREAK)

# Add data for Interactions
# The directory name is lowercase to match the path the Interactions
# preprocessing cell writes to; the two differ on case-sensitive filesystems.
_add_source("Interactions", "../../data/interactions/interactions.{split}.json")


print(SECTION_BREAK)


# Save All
print(f"Total Size of data(train, test, val): {len(train), len(test), len(val)}")
for split_name, bucket in split_buckets.items():
    # Each split is divided between the RLHF and the supervised datasets.
    RLHF_data, supervised_data = split_train_test(bucket.copy(), RLHF_RATIO)
    for subset_name, subset in zip(["RLHF", "supervised"], [RLHF_data, supervised_data]):
        subset_path = f"../../dataset/{subset_name}/{split_name}.json"
        _write_json(subset, subset_path)
        print(
            f"{split_name}-{subset_name} data size: {len(subset)}, saved at {subset_path}"
        )
    print()


print("------------------------------\n")


# Save all for submission
all_data = train + test + val
submission_path = "../../gen_dataset_chatMGL.json"
_write_json(all_data, submission_path)
print(f"Total Size of data: {len(all_data)} saved at {submission_path}")

small_test_path = "../../dataset/Test/small_test.json"
_write_json(small_test, small_test_path)
print(f"Total Size of small test data: {len(small_test)} saved at {small_test_path}")

MATH Size of train: 6753
MATH Size of test: 5000
MATH Size of val: 747

------------------------------

MathQA Size of train: 320
MathQA Size of test: 40
MathQA Size of val: 40

------------------------------


------------------------------

Interactions Size of train: 1409
Interactions Size of test: 177
Interactions Size of val: 176

------------------------------

Total Size of data(train, test, val): (8482, 5217, 963)
train-RLHF data size: 4241, saved at ../../dataset/RLHF/train.json
train-supervised data size: 4241, saved at ../../dataset/supervised/train.json

test-RLHF data size: 2608, saved at ../../dataset/RLHF/test.json
test-supervised data size: 2609, saved at ../../dataset/supervised/test.json

val-RLHF data size: 481, saved at ../../dataset/RLHF/val.json
val-supervised data size: 482, saved at ../../dataset/supervised/val.json

------------------------------

Total Size of data: 14662 saved at ../../gen_dataset_chatMGL.json
Total Size of small test data: 60 saved at ../../

## Data Preprocessing

### MathQA

In [None]:
MATH_QA_DIR = "../data/mathqa/"

# Convert each annotated MathQA subset into a list of guid/question/answer records.
# NOTE(review): the dataset-assembly cell reads "../../data/MathQA/mathqa.<split>.json",
# which matches neither this directory nor the "size<N>" / "validation" file
# names written here — confirm how these outputs are renamed or moved.
for split_name in ["train", "test", "validation"]:  # avoid shadowing builtin `type`
    raw_path = MATH_QA_DIR + f"subset.{split_name}.annotated.paraphrases.json"
    with open(raw_path, "r") as raw_file:
        raw_data = json.load(raw_file)

    samples = []
    for entry in raw_data:
        question = entry["prompt"]
        answer = entry["correct_reasoning_paraphrased"]
        guid = str(uuid.uuid4())

        samples.append(
            {
                "guid": guid,
                "question": question,
                "answer": answer,
            }
        )

    output_path = MATH_QA_DIR + f"mathqa.{split_name}.size{len(samples)}.json"
    # The context manager flushes and closes the file; previously the handle
    # was left open, so the last file could stay unflushed.
    with open(output_path, "w") as file2save:
        json.dump(samples, file2save, indent=4)

    print("MathQA file saved at ", output_path)

### Stack Exchange

In [10]:
STACK_DIR = "../../data/StackExchange/"

SAMPLES_TO_EXTRACT = 4000
TRAIN_RATIO = 0.8
VAL_RATIO = 0.1

# Read question/answer pairs from the StackExchange dump, stopping once
# SAMPLES_TO_EXTRACT records have been collected.
samples = []
with jsonlines.open(STACK_DIR + "stack_exchange_dataset.jsonl", "r") as reader:
    for position, record in enumerate(reader.iter(), start=1):
        stack_question, stack_answer = get_pair(record["correct_chat"])
        samples.append(
            dict(
                guid=str(uuid.uuid4()),
                question=stack_question.strip(),
                answer=stack_answer.strip(),
                source="StackExchange",
            )
        )

        if position == SAMPLES_TO_EXTRACT:
            break


# Split a copy so that `samples` keeps the original reading order.
stack_train, stack_val, stack_test = split_train_test_val(
    samples.copy(), TRAIN_RATIO, VAL_RATIO
)

# Persist every split next to the raw dump.
stack_splits = {"train": stack_train, "test": stack_test, "val": stack_val}
for split_name, split_records in stack_splits.items():
    split_path = STACK_DIR + f"stack.{split_name}.json"
    with open(split_path, "w") as split_file:
        json.dump(split_records, split_file, indent=4)

    print(f"{split_name} data Size: {len(split_records)}, saved at {split_path}")

train data Size: 3200, saved at ../../data/StackExchange/stack.train.json
test data Size: 400, saved at ../../data/StackExchange/stack.test.json
val data Size: 400, saved at ../../data/StackExchange/stack.val.json


### MATH

In [50]:
# Merge all samples per category
VALIDATION_PERC = 0.1
MATH_RAW_DIR = "../../data/MATH/raw/"

# Merge the one-problem-per-file raw MATH data into one JSON file per category.
# For the train split, the first VALIDATION_PERC of each category is held out
# as validation data.
for split_name in ["train", "test"]:  # avoid shadowing builtin `type`
    categories = os.listdir(MATH_RAW_DIR + split_name)
    print(f"=== === {split_name} === ===")
    print(categories)
    for category in categories:
        category_dir = MATH_RAW_DIR + split_name + "/" + category
        # Skip .DS_Store and any other hidden entry or stray file.
        if category.startswith(".") or not os.path.isdir(category_dir):
            continue

        # os.listdir returns entries in arbitrary order; sorting makes the
        # train/validation cut identical on every machine and on every re-run.
        problem_files = sorted(
            name for name in os.listdir(category_dir) if name.endswith(".json")
        )

        category_samples = []
        for cfile in problem_files:
            with open(category_dir + "/" + cfile, "r") as problem_file:
                category_samples.append(json.load(problem_file))

        if split_name == "train":
            val_cutoff = int(len(category_samples) * VALIDATION_PERC)
            validation_data = category_samples[:val_cutoff]
            train_data = category_samples[val_cutoff:]

            print("\n===========")
            if category_samples:
                # Show one sample as a sanity check of the raw schema.
                print(category_samples[-1])
            print("Category:", category)
            print(f"Train data size: {len(train_data)}")
            print(f"Validation data size: {len(validation_data)}")

            train_path = f"../../data/MATH/{split_name}/{category}.json"
            with open(train_path, "w") as trainfile2save:
                json.dump(train_data, trainfile2save, indent=4)
            print(f"Train data saved at {train_path}")

            validation_path = f"../../data/MATH/val/{category}.json"
            with open(validation_path, "w") as validationfile2save:
                json.dump(validation_data, validationfile2save, indent=4)
            print(f"Validation data saved at {validation_path}")
        else:
            print("\n===========")
            print("Category:", category)
            print(f"Test data size: {len(category_samples)}")

            merged_path = f"../../data/MATH/{split_name}/{category}.json"
            with open(merged_path, "w") as file2save:
                json.dump(category_samples, file2save, indent=4)
            print(f"Merged data saved at {merged_path}")

=== === train === ===
['counting_and_probability', 'intermediate_algebra', '.DS_Store', 'number_theory', 'precalculus', 'prealgebra', 'geometry', 'algebra']

{'problem': 'A bin has 8 black balls and 7 white balls.  3 of the balls are drawn at random.  What is the probability of drawing 2 of one color and 1 of the other color?', 'level': 'Level 5', 'type': 'Counting & Probability', 'solution': 'The number of ways to draw out 3 balls from 15 is $\\binom{15}{3}=455$.  We can choose 2 black balls and 1 white ball in $\\binom{8}{2}\\binom{7}{1}=196$ ways.  We can pick 1 black ball and 2 white balls in $\\binom{8}{1}\\binom{7}{2}=168$ ways.  Therefore we have $196+168=364$ ways to satisfy the condition, so the answer is $\\dfrac{364}{455}=\\boxed{\\frac{4}{5}}$.'}
Category: counting_and_probability
Train data size: 694
Validation data size: 77
Train data saved at ../../data/MATH/train/counting_and_probability.json
Validation data saved at ../../data/MATH/val/counting_and_probability.json

{'

In [84]:
types = ["train", "val", "test"]

# Flatten the per-category MATH files of each split into a single
# MATH.<split>.json file of guid/question/answer records.
for split_name in types:  # avoid shadowing builtin `type`
    split_dir = "../../data/MATH/" + split_name
    # Keep only JSON files (skips e.g. .DS_Store) and sort them so the order
    # of the merged output does not depend on os.listdir's arbitrary order.
    category_files = sorted(
        name for name in os.listdir(split_dir) if name.endswith(".json")
    )

    all_contents = []
    for cfile in category_files:
        with open(split_dir + "/" + cfile, "r") as category_file:
            contents = json.load(category_file)
        for entry in contents:
            all_contents.append(
                {
                    "guid": str(uuid.uuid4()),
                    "question": entry["problem"],
                    "answer": entry["solution"],
                    "source": "MATH",
                    "type": entry["type"],
                }
            )

    with open("../../data/MATH/MATH." + split_name + ".json", "w") as merged_file:
        json.dump(all_contents, merged_file, indent=4)
    print(f">> Size of {split_name} data: {len(all_contents)}")

>> Size of train data: 6753
>> Size of val data: 747
>> Size of test data: 5000


### Interactions

In [12]:
TRAIN_RATIO = 0.8
VAL_RATIO = 0.1
INTERACTIONS_DIR = "../../data/interactions/"

# Build question/answer records from the "chosen" chat of every interaction.
with open(INTERACTIONS_DIR + "m3_reward_interactions.json", "r") as raw_file:
    samples_raw = json.load(raw_file)

samples = []
for entry in samples_raw:
    correct_chat = entry["chosen"]
    question, answer = get_pair(correct_chat)
    samples.append(
        {
            "guid": str(uuid.uuid4()),
            "question": question.strip(),
            "answer": answer.strip(),
            "source": "Interactions",
        }
    )

train, val, test = split_train_test_val(samples, TRAIN_RATIO, VAL_RATIO)
print(f"Size of interaction train, val, test: {len(train), len(val), len(test)}")

# Write each split through a context manager so every file is flushed and
# closed; the handles were previously never closed.
for split_name, split_records in zip(["train", "val", "test"], [train, val, test]):
    with open(INTERACTIONS_DIR + f"interactions.{split_name}.json", "w") as split_file:
        json.dump(split_records, split_file, indent=4)

Size of interaction train, val, test: (1409, 176, 177)


### Reward Data

In [82]:
# Convert the old reward files into question/answer records taken from the
# "chosen" chat of every entry.
for split_name in ["test", "train", "val"]:  # avoid shadowing builtin `type`
    filename = f"../../data/Reward/old/reward.{split_name}.old.json"
    with open(filename, "r") as old_file:
        contents = json.load(old_file)

    data = []
    for entry in contents:
        correct_entry = entry["chosen"]
        question, answer = get_pair(correct_entry)
        data.append(
            {
                # NOTE(review): every other cell names this key "guid"; "quid"
                # may be a typo — confirm with downstream consumers before renaming.
                "quid": str(uuid.uuid4()),
                "question": question.strip(),
                "answer": answer.strip(),
            }
        )

    # The context manager flushes and closes the output file.
    with open(f"../../data/Reward/reward.{split_name}.json", "w") as new_file:
        json.dump(data, new_file, indent=4)