# Getting started with OpenAssistant OASST1 data

- https://huggingface.co/datasets/OpenAssistant/oasst1

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/master/notebooks/openassistant-oasst1/getting-started.ipynb)

In [1]:
# uncomment and run below lines to set up if running in colab
#!pip install datasets pandas

In [2]:
import pandas as pd
from datasets import load_dataset

# set some pandas options to make the output more readable
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# load dataset from huggingface datasets
ds = load_dataset("OpenAssistant/oasst1")

Found cached dataset parquet (C:/Users/andre/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2e4c5fea546c88d6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 2/2 [00:00<00:00, 15.88it/s]


In [4]:
# lets convert the train dataset to a pandas df
df = ds["train"].to_pandas()

In [5]:
# look at the df info
df.info(verbose=True, memory_usage=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84437 entries, 0 to 84436
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   message_id       84437 non-null  object 
 1   parent_id        74591 non-null  object 
 2   user_id          84437 non-null  object 
 3   created_date     84437 non-null  object 
 4   text             84437 non-null  object 
 5   role             84437 non-null  object 
 6   lang             84437 non-null  object 
 7   review_count     84437 non-null  int32  
 8   review_result    83732 non-null  object 
 9   deleted          84437 non-null  bool   
 10  rank             48730 non-null  float64
 11  synthetic        84437 non-null  bool   
 12  model_name       0 non-null      object 
 13  detoxify         72297 non-null  object 
 14  message_tree_id  84437 non-null  object 
 15  tree_state       84437 non-null  object 
 16  emojis           71496 non-null  object 
 17  labels      

In [9]:
# look at a sample row in a json format we can easily read
df.sample(1).transpose().to_dict()

{35238: {'message_id': 'cb9c1893-2eda-42ee-b0a7-adeb0e142867',
  'parent_id': 'd24480be-94bd-4dc8-b7df-5eb973ed786f',
  'user_id': '50f76166-c558-47be-80b1-5e2b2df6ff09',
  'created_date': '2023-02-07T11:53:26.527725+00:00',
  'text': 'Good job, are there other riddles like this that you know?',
  'role': 'prompter',
  'lang': 'en',
  'review_count': 3,
  'review_result': True,
  'deleted': False,
  'rank': nan,
  'synthetic': False,
  'model_name': None,
  'detoxify': {'toxicity': 0.021441340446472168,
   'severe_toxicity': 3.3909338526427746e-05,
   'obscene': 0.0014250462409108877,
   'identity_attack': 0.0003495119162835181,
   'insult': 0.012425446882843971,
   'threat': 5.629667793982662e-05,
   'sexual_explicit': 7.260362326633185e-05},
  'message_tree_id': '63f26a43-8b7d-4bb2-ad36-4309034c84cd',
  'tree_state': 'ready_for_export',
  'emojis': {'name': array(['+1', '_skip_reply'], dtype=object),
   'count': array([2, 1])},
  'labels': {'name': array(['spam', 'lang_mismatch', 'pi

In [7]:
# lets grab a random message tree
message_tree_id = df["message_tree_id"].sample(1).values[0]
print(message_tree_id)

e63d94b6-6f92-410f-b9d8-5cbd9367f2e3


In [8]:
# look at all data for this message tree
df.query(f"message_tree_id == '{message_tree_id}'").sort_values("created_date")

Unnamed: 0,message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,rank,synthetic,model_name,detoxify,message_tree_id,tree_state,emojis,labels
71993,e63d94b6-6f92-410f-b9d8-5cbd9367f2e3,,a63181b7-107c-4fcc-913c-6871e68a9ffe,2023-02-05T22:58:35.549190+00:00,can you tell me a few good strategies to win a...,prompter,en,3,True,False,,False,,"{'toxicity': 0.0033605224452912807, 'severe_to...",e63d94b6-6f92-410f-b9d8-5cbd9367f2e3,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_ranking'...","{'name': ['spam', 'lang_mismatch', 'pii', 'not..."
71996,8a58e588-f3c4-4190-a1e4-538c86abb17b,e63d94b6-6f92-410f-b9d8-5cbd9367f2e3,df6e7d47-7c0e-4958-8bc4-4558af0ece38,2023-02-06T03:37:06.246378+00:00,"Sure, here are a few strategies that you can u...",assistant,en,3,True,False,1.0,False,,"{'toxicity': 0.000288925803033635, 'severe_tox...",e63d94b6-6f92-410f-b9d8-5cbd9367f2e3,ready_for_export,"{'name': ['+1'], 'count': [3]}","{'name': ['spam', 'fails_task', 'lang_mismatch..."
71994,4c27698c-01c7-42db-ac25-a2d6af5da4f4,e63d94b6-6f92-410f-b9d8-5cbd9367f2e3,7de84fb1-4cf5-4142-989d-f490fc526bfd,2023-02-06T03:56:59.785890+00:00,Certainly! Here are a few effective tactics fo...,assistant,en,3,True,False,0.0,False,,"{'toxicity': 0.00032153952633962035, 'severe_t...",e63d94b6-6f92-410f-b9d8-5cbd9367f2e3,ready_for_export,"{'name': ['_skip_reply'], 'count': [1]}","{'name': ['spam', 'fails_task', 'lang_mismatch..."
72000,d4154bbe-d137-4f3c-9382-8a3b306402f8,e63d94b6-6f92-410f-b9d8-5cbd9367f2e3,f14aa565-a0ed-4d18-b949-0daccb549f03,2023-02-06T04:21:18.793582+00:00,"Settlers of Catan is a game of strategy, teamw...",assistant,en,3,True,False,2.0,False,,"{'toxicity': 0.000376795360352844, 'severe_tox...",e63d94b6-6f92-410f-b9d8-5cbd9367f2e3,ready_for_export,"{'name': ['+1', '-1'], 'count': [1, 1]}","{'name': ['spam', 'fails_task', 'lang_mismatch..."
71997,50416bbf-bf73-4022-833c-10c7ca062ad6,8a58e588-f3c4-4190-a1e4-538c86abb17b,0b53f16f-910a-453b-8101-dfe64c6dd30c,2023-02-06T06:23:21.276590+00:00,Do the Settlers of Catan expansions require di...,prompter,en,3,True,False,,False,,"{'toxicity': 0.0004958086065016687, 'severe_to...",e63d94b6-6f92-410f-b9d8-5cbd9367f2e3,ready_for_export,"{'name': ['_skip_reply', '_skip_ranking'], 'co...","{'name': ['spam', 'lang_mismatch', 'pii', 'not..."
72001,81ba6358-9ac8-45b2-a202-723b605b8c95,d4154bbe-d137-4f3c-9382-8a3b306402f8,6a4224c2-6b7f-4981-85d3-5b26a75a5324,2023-02-06T08:25:07.775154+00:00,What are good long term strategies for getting...,prompter,en,3,True,False,,False,,"{'toxicity': 0.00035797784221358597, 'severe_t...",e63d94b6-6f92-410f-b9d8-5cbd9367f2e3,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_ranking'...","{'name': ['spam', 'lang_mismatch', 'pii', 'not..."
71999,ed3fe459-77eb-4dd3-9672-46e5dcddbd2e,50416bbf-bf73-4022-833c-10c7ca062ad6,224c3d09-c385-46cd-93e2-5f3682ac2624,2023-02-06T08:43:38.464529+00:00,"The expansions in ""Settlers of Catan"" can have...",assistant,en,3,True,False,1.0,False,,"{'toxicity': 0.00036860740510746837, 'severe_t...",e63d94b6-6f92-410f-b9d8-5cbd9367f2e3,ready_for_export,"{'name': ['+1'], 'count': [1]}","{'name': ['spam', 'fails_task', 'lang_mismatch..."
71998,0936e043-4cbb-4441-adaa-e5443c4cd99d,50416bbf-bf73-4022-833c-10c7ca062ad6,03d8200e-9023-423c-a3f0-47828a5b4189,2023-02-06T08:58:01.473538+00:00,"Yes, the expansions for Settlers of Catan can ...",assistant,en,3,True,False,0.0,False,,"{'toxicity': 0.00033097557025030255, 'severe_t...",e63d94b6-6f92-410f-b9d8-5cbd9367f2e3,ready_for_export,"{'name': ['_skip_labeling'], 'count': [1]}","{'name': ['spam', 'fails_task', 'lang_mismatch..."
71995,43431488-6f0f-481e-9001-8407ab25b6cf,4c27698c-01c7-42db-ac25-a2d6af5da4f4,ecaa4818-48da-45a8-bab5-ae6de585ad8a,2023-02-06T10:19:04.483254+00:00,Is there an online community? Can i play it on...,prompter,en,3,True,False,,False,,"{'toxicity': 0.000970872410107404, 'severe_tox...",e63d94b6-6f92-410f-b9d8-5cbd9367f2e3,ready_for_export,,"{'name': ['spam', 'lang_mismatch', 'pii', 'not..."
72003,c0e39213-4fd0-4612-ab34-f5bc2b74300e,81ba6358-9ac8-45b2-a202-723b605b8c95,8a9d78b7-5b39-4033-8829-1b5486fdba76,2023-02-06T11:53:46.248417+00:00,A good long term strategy for getting the long...,assistant,en,3,True,False,1.0,False,,"{'toxicity': 0.0003667851269710809, 'severe_to...",e63d94b6-6f92-410f-b9d8-5cbd9367f2e3,ready_for_export,"{'name': ['+1', '_skip_labeling'], 'count': [3...","{'name': ['spam', 'fails_task', 'lang_mismatch..."
