In [1]:
import os
import hashlib

In [14]:
# inputs
# Base directory that holds the raw export and the instructions spreadsheet.
# Set the OASST_DATA_DIR environment variable to run this notebook on another
# machine; when it is unset the original location is used unchanged.
data_dir = os.environ.get("OASST_DATA_DIR", "C:/Users/andre/Downloads")
data_out_dir = f"{data_dir}/oasst2"
raw_input_data_path = f"{data_dir}/2023-11-05_oasst_all/2023-11-05_oasst_all.jsonl"
instructions_path = f"{data_dir}/instructions.xlsx"

# Names of the files the cells below place in data_out_dir. These are plain
# strings: nothing is interpolated, so no f-prefix is needed.
trees_filename = "2023-11-05_oasst_all.trees.jsonl"
trees_ready_filename = "2023-11-05_oasst_all.trees.ready_for_export.jsonl"
messages_filename = "2023-11-05_oasst_all.messages.jsonl"
messages_ready_filename = "2023-11-05_oasst_all.messages.ready_for_export.jsonl"
messages_train_filename = "2023-11-05_oasst_all.messages.train.jsonl"
messages_ready_train_filename = (
    "2023-11-05_oasst_all.messages.ready_for_export.train.jsonl"
)
messages_validation_filename = "2023-11-05_oasst_all.messages.validation.jsonl"
messages_ready_validation_filename = (
    "2023-11-05_oasst_all.messages.ready_for_export.validation.jsonl"
)

# make data_out_dir if it doesn't exist
# exist_ok=True avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(data_out_dir, exist_ok=True)

In [3]:
# print hashes of input files
# MD5 is used here only as a fingerprint recording which input files a run
# was produced from, not for any security purpose.


def file_md5(path, chunk_size=1 << 20):
    """Return the hexadecimal MD5 digest of the file at ``path``.

    The file is read in ``chunk_size``-byte pieces so that a large input is
    never held in memory all at once. Feeding the chunks to ``update()`` gives
    the same digest as hashing the whole content in a single call.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(path, "rb") as stream:
        # iter(callable, sentinel) stops once read() returns b"" at end of file.
        for chunk in iter(lambda: stream.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


print(f"Hash of input data: {raw_input_data_path}")
print(file_md5(raw_input_data_path))

print(f"Hash of Instructions: {instructions_path}")
print(file_md5(instructions_path))

Hash of input data: C:/Users/andre/Downloads/2023-11-05_oasst_all/2023-11-05_oasst_all.jsonl
8223a0083f70749ecf430b4057c50dc4
Hash of Instructions: C:/Users/andre/Downloads/instructions.xlsx
99e7a311f473b08781fad2b1855243dc


In [4]:
# use instructions file to clean the raw dataset
# Runs ../examples/clean_dataset.py through the shell (IPython `!`), passing
# raw_input_data_path as the first argument, "{data_out_dir}/{trees_filename}"
# as the second and instructions_path via --instructions. The second argument
# is presumably the output path: the following cells read that same file.
# NOTE(review): the relative script path assumes the kernel's working directory
# is this notebook's folder — confirm before running from elsewhere.
!python ../examples/clean_dataset.py \
    "{raw_input_data_path}" \
    "{data_out_dir}/{trees_filename}" \
    --instructions "{instructions_path}"

Reading: C:/Users/andre/Downloads/2023-11-05_oasst_all/2023-11-05_oasst_all.jsonl
Loaded 70673 trees with 208686 messages.
Cleaning...
Cleaning id=36be40c8-2451-4b92-99b9-97f425f6955b
Action=Edit
edit: 36be40c8-2451-4b92-99b9-97f425f6955b
Cleaning id=e6933f01-4183-45bf-892c-31bdf778eee0
Action=Delete
deleting: e6933f01-4183-45bf-892c-31bdf778eee0
Tree deleted: e6933f01-4183-45bf-892c-31bdf778eee0
Cleaning id=449a995f-29a4-4d04-aa56-2dab1747a417
Action=Delete
deleting: 449a995f-29a4-4d04-aa56-2dab1747a417
Tree deleted: 449a995f-29a4-4d04-aa56-2dab1747a417
Cleaning id=4b6c83d4-b6c1-452e-b57a-a09bec46a887
Action=Delete
deleting: 4b6c83d4-b6c1-452e-b57a-a09bec46a887
Tree deleted: 4b6c83d4-b6c1-452e-b57a-a09bec46a887
Cleaning id=af60f432-7fb2-4f63-b73a-6092a35bf21b
Action=Delete
deleting: af60f432-7fb2-4f63-b73a-6092a35bf21b
Tree deleted: af60f432-7fb2-4f63-b73a-6092a35bf21b
Cleaning id=5ab3a8bd-bf74-4331-9564-c7e9a663de8e
Action=Edit
edit: 5ab3a8bd-bf74-4331-9564-c7e9a663de8e
Cleaning id=1

In [5]:
# run keyword flagging
# Runs ./keyword_flagging.py on the cleaned trees file; data_out_dir is passed
# as the second argument (presumably the directory its results are written
# to — TODO confirm against the script).
!python ./keyword_flagging.py \
    "{data_out_dir}/{trees_filename}" \
    "{data_out_dir}"
    
# The flagging outputs have been manually reviewed and appended to the
# instructions file, and the notebook has then been rerun.

Processing file: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.jsonl


In [13]:
# filter trees to make a version with status ready for export
# Runs ../examples/filter_trees.py with the cleaned trees file as input,
# "{data_out_dir}/{trees_ready_filename}" as the destination and
# --states "ready_for_export" as the selection criterion.
!python ../examples/filter_trees.py \
    "{data_out_dir}/{trees_filename}" \
    "{data_out_dir}/{trees_ready_filename}" \
    --states "ready_for_export"

Reading: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.jsonl
Found 13854 matching trees.
Writing: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.ready_for_export.jsonl


In [6]:
# convert cleaned dataset from tree to messages
# Runs ../examples/tree_to_messages.py on the full cleaned trees file; the
# second argument, "{data_out_dir}/{messages_filename}", is the messages file
# that the train/validation split cell reads.
!python ../examples/tree_to_messages.py \
    "{data_out_dir}/{trees_filename}" \
    "{data_out_dir}/{messages_filename}"

reading: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.jsonl
70642 trees with 208584 total messages read.
writing: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.jsonl
208584 messages written.


In [15]:
# convert cleaned state=ready_for_export dataset from tree to messages
# Same script as the full-dataset conversion, applied to the filtered
# trees_ready_filename file; the second argument,
# "{data_out_dir}/{messages_ready_filename}", is the file the
# ready-messages split cell reads.
!python ../examples/tree_to_messages.py \
    "{data_out_dir}/{trees_ready_filename}" \
    "{data_out_dir}/{messages_ready_filename}"

reading: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.ready_for_export.jsonl
13854 trees with 135174 total messages read.
writing: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.ready_for_export.jsonl
135174 messages written.


In [7]:
# split messages into train and validation
# Runs ../examples/split_dataset.py on the full messages file; the two parts
# go to the paths given by --train_output and --val_output.
# NOTE(review): how messages are assigned to each part is decided inside
# split_dataset.py and is not visible from this notebook.
!python ../examples/split_dataset.py \
    "{data_out_dir}/{messages_filename}" \
    --train_output "{data_out_dir}/{messages_train_filename}" \
    --val_output "{data_out_dir}/{messages_validation_filename}"

Reading: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.jsonl
Found 208584 matching messages.
Writing train 198293 messages: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.train.jsonl
Writing valid 10291 messages: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.validation.jsonl


In [16]:
# split ready messages into train and validation
# Same script as the full-dataset split, applied to the ready_for_export
# messages file; the two parts go to the paths given by --train_output and
# --val_output.
!python ../examples/split_dataset.py \
    "{data_out_dir}/{messages_ready_filename}" \
    --train_output "{data_out_dir}/{messages_ready_train_filename}" \
    --val_output "{data_out_dir}/{messages_ready_validation_filename}"

Reading: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.ready_for_export.jsonl
Found 135174 matching messages.
Writing train 128412 messages: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.ready_for_export.train.jsonl
Writing valid 6762 messages: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.ready_for_export.validation.jsonl


In [8]:
# make .gz files, keeping the original files
!gzip -c "{data_out_dir}/{trees_filename}" > "{data_out_dir}/{trees_filename}.gz"
!gzip -c "{data_out_dir}/{trees_ready_filename}" > "{data_out_dir}/{trees_ready_filename}.gz"
!gzip -c "{data_out_dir}/{messages_filename}" > "{data_out_dir}/{messages_filename}.gz"
!gzip -c "{data_out_dir}/{messages_ready_filename}" > "{data_out_dir}/{messages_ready_filename}.gz"
!gzip -c "{data_out_dir}/{messages_train_filename}" > "{data_out_dir}/{messages_train_filename}.gz"
!gzip -c "{data_out_dir}/{messages_ready_train_filename}" > "{data_out_dir}/{messages_ready_train_filename}.gz"
!gzip -c "{data_out_dir}/{messages_validation_filename}" > "{data_out_dir}/{messages_validation_filename}.gz"
!gzip -c "{data_out_dir}/{messages_ready_validation_filename}" > "{data_out_dir}/{messages_ready_validation_filename}.gz"

In [9]:
# TODO: add Detoxify scores
# TODO: generate Hugging Face Parquet-format files
# TODO: create a README for the oasst2 dataset