# Getting started with OpenAssistant OASST1 data

- https://huggingface.co/datasets/OpenAssistant/oasst1

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/master/notebooks/openassistant-oasst1/getting-started.ipynb)

In [1]:
# Uncomment and run the line below to install the dependencies when running in Colab
#%pip install datasets pandas treelib

# Imports

In [2]:
import pandas as pd
from datasets import load_dataset
from treelib import Tree

# Raise pandas' display limits so that long and wide frames are easier to read
for option_name, option_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)


def add_tree_level(df):
    """Add a ``tree_level`` column holding each message's depth in its tree.

    Levels are assigned as follows:

    * ``0`` when ``parent_id`` is missing (``None``/NaN), i.e. a root message;
    * ``1`` when ``parent_id`` equals the row's ``message_tree_id``, i.e. a
      direct reply to the root message;
    * otherwise the parent's level plus one.

    Rows may appear in any order: a child is allowed to come before its
    parent in ``df``.

    Args:
        df: DataFrame with ``message_id``, ``parent_id`` and
            ``message_tree_id`` columns.

    Returns:
        ``df`` itself if it already has a ``tree_level`` column; otherwise the
        result of merging ``df`` with the computed levels on ``message_id``.

    Raises:
        KeyError: if a message's parent is neither the tree root id nor a
            ``message_id`` present in ``df``.
        ValueError: if the ``parent_id`` links form a cycle.
    """

    # if tree level already exists, return df
    if "tree_level" in df.columns:
        return df

    parent_of = dict(zip(df["message_id"], df["parent_id"]))
    tree_id_of = dict(zip(df["message_id"], df["message_tree_id"]))
    tree_level_map = {}

    for start_id in parent_of:
        # Walk up towards the root until we reach a message whose level is
        # known (or can be decided directly), remembering the path taken.
        pending = []
        current_id = start_id
        while current_id not in tree_level_map:
            if current_id not in parent_of:
                raise KeyError(f"parent message {current_id!r} is not present in df")

            parent_id = parent_of[current_id]
            if pd.isna(parent_id):
                # no parent -> root message
                tree_level_map[current_id] = 0
            elif parent_id == tree_id_of[current_id]:
                # parent is the tree root -> direct reply to the root message
                tree_level_map[current_id] = 1
            else:
                pending.append(current_id)
                # a path longer than the number of messages must revisit one
                if len(pending) > len(parent_of):
                    raise ValueError(f"cycle detected in parent_id links at {current_id!r}")
                current_id = parent_id

        # Unwind the path: each message is one level deeper than its parent.
        level = tree_level_map[current_id]
        for message_id in reversed(pending):
            level += 1
            tree_level_map[message_id] = level

    # create a df from the tree_level_map and merge it with the original df
    df_tree_level_map = (
        pd.DataFrame.from_dict(tree_level_map, orient="index", columns=["tree_level"])
        .reset_index()
        .rename(columns={"index": "message_id"})
    )

    return df.merge(df_tree_level_map, on="message_id")

  from .autonotebook import tqdm as notebook_tqdm


# Load Dataset

In [3]:
# Load the OASST1 dataset through Hugging Face `datasets` and print its summary
ds = load_dataset(path="OpenAssistant/oasst1")
print(ds)

Found cached dataset parquet (C:/Users/andre/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-ea605663b798f601/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 2/2 [00:00<00:00, 95.13it/s]

DatasetDict({
    train: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 84437
    })
    validation: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 4401
    })
})





# Create Pandas Dataframe

In [4]:
# Let's convert the "train" split of the dataset to a pandas DataFrame
df = ds["train"].to_pandas()

In [5]:
# Look at the df info: every column (verbose) with its non-null count and dtype,
# plus the frame's memory usage
df.info(verbose=True, memory_usage=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84437 entries, 0 to 84436
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   message_id       84437 non-null  object 
 1   parent_id        74591 non-null  object 
 2   user_id          84437 non-null  object 
 3   created_date     84437 non-null  object 
 4   text             84437 non-null  object 
 5   role             84437 non-null  object 
 6   lang             84437 non-null  object 
 7   review_count     84437 non-null  int32  
 8   review_result    83732 non-null  object 
 9   deleted          84437 non-null  bool   
 10  rank             48730 non-null  float64
 11  synthetic        84437 non-null  bool   
 12  model_name       0 non-null      object 
 13  detoxify         72297 non-null  object 
 14  message_tree_id  84437 non-null  object 
 15  tree_state       84437 non-null  object 
 16  emojis           71496 non-null  object 
 17  labels      

In [6]:
# Draw one random row and show it as a {row_index: {column: value}} dict that is easy to read
df.sample(n=1).T.to_dict()

{47284: {'message_id': '9303a17a-a84a-42a9-87b3-5a9daa7c6827',
  'parent_id': '1b70b9ff-bc39-4264-a96d-ae186a993917',
  'user_id': 'e2abe732-48a7-48d3-9de8-5af8cb96d029',
  'created_date': '2023-02-09T19:27:48.959399+00:00',
  'text': 'Una paraula amb les 5 vocals, que no comenci amb "a" i que no sigui un verb és, per exemple, minotaure.',
  'role': 'assistant',
  'lang': 'ca',
  'review_count': 3,
  'review_result': True,
  'deleted': False,
  'rank': 1.0,
  'synthetic': False,
  'model_name': None,
  'detoxify': None,
  'message_tree_id': '3ac3157d-5057-43be-b975-2f7665063ca8',
  'tree_state': 'ready_for_export',
  'emojis': {'name': array(['+1'], dtype=object), 'count': array([2])},
  'labels': {'name': array(['spam', 'fails_task', 'lang_mismatch', 'pii', 'not_appropriate',
          'hate_speech', 'sexual_content', 'quality', 'toxicity', 'humor',
          'helpfulness', 'creativity', 'violence'], dtype=object),
   'value': array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.75, 0. 

# Random Message Tree

In [7]:
# Pick the message tree that one randomly sampled message belongs to
message_tree_id = df["message_tree_id"].sample(n=1).iloc[0]
print(message_tree_id)

1ba945f1-1d58-4815-b9f8-dacaca15180a


In [8]:
# Keep only the messages of the selected tree, ordered from oldest to newest,
# and annotate each one with its tree level
in_selected_tree = df["message_tree_id"] == message_tree_id
df_message_tree = df.loc[in_selected_tree].sort_values("created_date").pipe(add_tree_level)

df_message_tree

Unnamed: 0,message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,rank,synthetic,model_name,detoxify,message_tree_id,tree_state,emojis,labels,tree_level
0,1ba945f1-1d58-4815-b9f8-dacaca15180a,,77161841-4be2-4f79-8f08-b457b4177ff4,2023-02-04T19:20:58.346053+00:00,Write a ballad on why internet privacy and ope...,prompter,en,3,True,False,,False,,"{'toxicity': 0.0006916996790096164, 'severe_to...",1ba945f1-1d58-4815-b9f8-dacaca15180a,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_ranking'...","{'name': ['spam', 'lang_mismatch', 'pii', 'not...",0
1,7f5b4e4d-ae41-43b7-a4d4-458cfcb51630,1ba945f1-1d58-4815-b9f8-dacaca15180a,e8453d07-9240-49da-8cb5-2f8d79d37e51,2023-02-05T16:42:21.553867+00:00,"Verse 1\nOpen source and privacy,\nWe need to ...",assistant,en,3,True,False,0.0,False,,"{'toxicity': 0.00038280943408608437, 'severe_t...",1ba945f1-1d58-4815-b9f8-dacaca15180a,ready_for_export,"{'name': ['+1', '_skip_labeling'], 'count': [2...","{'name': ['spam', 'fails_task', 'lang_mismatch...",1
2,781e7fe4-c6b3-4119-a585-2646b0684722,1ba945f1-1d58-4815-b9f8-dacaca15180a,2fb3c88d-3e2f-4bd2-a643-331dbae8af42,2023-02-05T16:55:57.495596+00:00,"There once waw a computer,\na friendly old com...",assistant,en,3,True,False,2.0,False,,"{'toxicity': 0.0007086674449965358, 'severe_to...",1ba945f1-1d58-4815-b9f8-dacaca15180a,ready_for_export,"{'name': ['+1', '-1'], 'count': [3, 6]}","{'name': ['spam', 'fails_task', 'lang_mismatch...",1
3,fa911681-4848-4efc-9bfa-2b52dc7372ea,1ba945f1-1d58-4815-b9f8-dacaca15180a,490cef32-f0cb-4f01-9270-525e24d561d9,2023-02-05T16:57:07.002312+00:00,Verse 1:\nIn a world of endless clicks and end...,assistant,en,3,True,False,1.0,False,,"{'toxicity': 0.00046211303560994565, 'severe_t...",1ba945f1-1d58-4815-b9f8-dacaca15180a,ready_for_export,"{'name': ['+1'], 'count': [10]}","{'name': ['spam', 'fails_task', 'lang_mismatch...",1
4,6852b915-5a2c-4558-ae9c-2c7f5566d8e1,fa911681-4848-4efc-9bfa-2b52dc7372ea,9109eaad-017a-46f9-91c5-231cd95d73d0,2023-02-05T17:28:38.315985+00:00,Change the chorus,prompter,en,3,True,False,,False,,"{'toxicity': 0.0005257376469671726, 'severe_to...",1ba945f1-1d58-4815-b9f8-dacaca15180a,ready_for_export,"{'name': ['_skip_reply', '_skip_ranking'], 'co...","{'name': ['spam', 'lang_mismatch', 'pii', 'not...",2
5,377d805a-e680-4054-919a-270f95e5ec52,fa911681-4848-4efc-9bfa-2b52dc7372ea,828e7359-02f0-4b9a-a60b-94568279436f,2023-02-05T17:31:19.683738+00:00,Can you modify it to carry an opposite message?,prompter,en,3,True,False,,False,,"{'toxicity': 0.0003970751422457397, 'severe_to...",1ba945f1-1d58-4815-b9f8-dacaca15180a,ready_for_export,"{'name': ['+1'], 'count': [2]}","{'name': ['spam', 'lang_mismatch', 'pii', 'not...",2
6,3e03cedc-42ac-4d60-a37d-287666a15fe8,7f5b4e4d-ae41-43b7-a4d4-458cfcb51630,c6546bf5-460a-4f99-b387-f3fe7bddd2f5,2023-02-05T18:19:50.000999+00:00,Looks good but can you explain who takes away ...,prompter,en,3,True,False,,False,,"{'toxicity': 0.00044449075357988477, 'severe_t...",1ba945f1-1d58-4815-b9f8-dacaca15180a,ready_for_export,,"{'name': ['spam', 'lang_mismatch', 'pii', 'not...",2
7,1f9ba095-3937-492b-bc65-db313efd6f67,781e7fe4-c6b3-4119-a585-2646b0684722,c6546bf5-460a-4f99-b387-f3fe7bddd2f5,2023-02-05T18:21:20.321209+00:00,You said waw instead of was. Not sure the gram...,prompter,en,3,True,False,,False,,"{'toxicity': 0.0006272009341046214, 'severe_to...",1ba945f1-1d58-4815-b9f8-dacaca15180a,ready_for_export,"{'name': ['+1', '_skip_ranking'], 'count': [6,...","{'name': ['spam', 'lang_mismatch', 'pii', 'not...",2
8,ebaed177-20ed-4e4b-8d50-b4337c10cc24,6852b915-5a2c-4558-ae9c-2c7f5566d8e1,0b8c1657-1a06-47e4-9b5a-e196729795f3,2023-02-05T19:10:55.869172+00:00,"Chorus:\nOh internet, let freedom be our guide...",assistant,en,3,True,False,2.0,False,,"{'toxicity': 0.0003795399097725749, 'severe_to...",1ba945f1-1d58-4815-b9f8-dacaca15180a,ready_for_export,,"{'name': ['spam', 'fails_task', 'lang_mismatch...",3
9,43c405d6-437b-4189-8ac5-72eb48357105,6852b915-5a2c-4558-ae9c-2c7f5566d8e1,0dcecbad-be37-4ead-a2e7-af5ac7aedfe2,2023-02-05T19:12:45.246748+00:00,"New Chorus:\nOh internet, where knowledge and ...",assistant,en,3,True,False,1.0,False,,"{'toxicity': 0.00036214423016645014, 'severe_t...",1ba945f1-1d58-4815-b9f8-dacaca15180a,ready_for_export,,"{'name': ['spam', 'fails_task', 'lang_mismatch...",3


## Create Message Tree

In [9]:
# Build two views of the same conversation tree:
#   id_tree   - nodes labelled with their message ids
#   text_tree - nodes labelled with the (truncated) message text
id_tree = Tree()
text_tree = Tree()
# maximum number of characters of each message shown in text_tree
max_char_len = 100

# df_message_tree is sorted by created_date, so a parent row is visited (and its
# node created) before any of its replies
for _, row in df_message_tree.iterrows():
    message_id = row["message_id"]
    parent_id = row["parent_id"]
    # slicing already copes with texts shorter than the limit; newlines are
    # flattened so that every node prints on a single line
    text_short = row["text"][:max_char_len].replace("\n", " ")

    # a parent_id of None makes this node the root; keep the whole row as node
    # data in case we want it later
    id_tree.create_node(message_id, message_id, parent=parent_id, data=row.to_dict())

    # The short text is only the displayed tag. Nodes are identified by
    # message_id, so two messages that share the same (truncated) text neither
    # clash as duplicate identifiers nor end up under the wrong parent.
    text_tree.create_node(text_short, message_id, parent=parent_id)


print("id_tree:")
id_tree.show()

print("text_tree:")
text_tree.show()

id_tree:
1ba945f1-1d58-4815-b9f8-dacaca15180a
├── 781e7fe4-c6b3-4119-a585-2646b0684722
│   └── 1f9ba095-3937-492b-bc65-db313efd6f67
│       ├── 11dd753c-d1de-4d6b-a223-7fe5bb74d084
│       ├── 6a39fde1-daf5-4426-8047-09c531a7689b
│       └── cbaf3202-d894-4b4e-994d-721e8cd8b9fb
├── 7f5b4e4d-ae41-43b7-a4d4-458cfcb51630
│   └── 3e03cedc-42ac-4d60-a37d-287666a15fe8
└── fa911681-4848-4efc-9bfa-2b52dc7372ea
    ├── 377d805a-e680-4054-919a-270f95e5ec52
    └── 6852b915-5a2c-4558-ae9c-2c7f5566d8e1
        ├── 049dfba4-2010-4fb9-8899-86d1985bb31a
        ├── 43c405d6-437b-4189-8ac5-72eb48357105
        └── ebaed177-20ed-4e4b-8d50-b4337c10cc24

text_tree:
Write a ballad on why internet privacy and open-source is cool/important.
├── There once waw a computer, a friendly old computer, that just wanted to do its own thing,  it was ru
│   └── You said waw instead of was. Not sure the grammar is right.
│       ├── Sorry, I did indeed misspell "was".  here is a corrected version:  There once was a co