# Getting started with OpenAssistant OASST1 data

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/master/notebooks/openassistant-oasst1/getting-started.ipynb)

In [1]:
# Running in Colab? Uncomment the line below and run this cell to install the dependencies.
# (%pip installs into the environment of the running kernel.)
# %pip install datasets pandas

In [2]:
import pandas as pd
from datasets import load_dataset

# Raise pandas' display limits (rows, columns, line width) for the outputs below.
for option_name, option_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the OASST1 dataset by its repository id; a locally cached copy is reused when present.
ds = load_dataset(path="OpenAssistant/oasst1")

Found cached dataset parquet (C:/Users/andre/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2e4c5fea546c88d6/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 2/2 [00:00<00:00, 28.65it/s]


In [4]:
# Take the "train" split and materialise it as a pandas DataFrame.
train_split = ds["train"]
df = train_split.to_pandas()

In [5]:
# Summarise the frame: every column with its dtype and non-null count, plus memory usage.
df.info(
    verbose=True,
    memory_usage=True,
    show_counts=True,
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84437 entries, 0 to 84436
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   message_id       84437 non-null  object 
 1   parent_id        74591 non-null  object 
 2   user_id          84437 non-null  object 
 3   created_date     84437 non-null  object 
 4   text             84437 non-null  object 
 5   role             84437 non-null  object 
 6   lang             84437 non-null  object 
 7   review_count     84437 non-null  int32  
 8   review_result    83732 non-null  object 
 9   deleted          84437 non-null  bool   
 10  rank             48730 non-null  float64
 11  synthetic        84437 non-null  bool   
 12  model_name       0 non-null      object 
 13  detoxify         72297 non-null  object 
 14  message_tree_id  84437 non-null  object 
 15  tree_state       84437 non-null  object 
 16  emojis           71496 non-null  object 
 17  labels      

In [6]:
# Draw one random row and flip it into a {row_index: {column: value}} dict that is easy to read.
random_row = df.sample(n=1)
random_row.T.to_dict()

{31758: {'message_id': '42d50c60-2e31-437b-bdce-6f0bce91f650',
  'parent_id': '6c8de248-d3e1-48f2-802e-73d5c2b49a6e',
  'user_id': 'd2365416-7edb-4790-a657-29b6ed51d8f3',
  'created_date': '2023-02-05T00:51:09.925052+00:00',
  'text': 'Yes. Fashion and personal style choices are a form of self-expression, and there are no gender-based restrictions on what a person can wear.',
  'role': 'assistant',
  'lang': 'en',
  'review_count': 3,
  'review_result': True,
  'deleted': False,
  'rank': 1.0,
  'synthetic': False,
  'model_name': None,
  'detoxify': {'toxicity': 0.00031964911613613367,
   'severe_toxicity': 3.6933553928975016e-05,
   'obscene': 0.00021374155767261982,
   'identity_attack': 0.00012134548887843266,
   'insult': 0.00014460098464041948,
   'threat': 4.0733481000643224e-05,
   'sexual_explicit': 3.8567017327295616e-05},
  'message_tree_id': '10e186a5-7a66-4154-984e-82e5ff05dfd3',
  'tree_state': 'ready_for_export',
  'emojis': None,
  'labels': {'name': array(['spam', 'fai

In [7]:
# Pick the tree id of one randomly drawn message and show it.
sampled_tree_ids = df["message_tree_id"].sample(n=1)
message_tree_id = sampled_tree_ids.iloc[0]
print(message_tree_id)

9dd36dd3-99e9-4f42-ac30-709ac6fc600e


In [8]:
# Show every message that belongs to the selected tree, ordered by creation date.
# A boolean mask compares against the id directly instead of splicing it into a
# query string, so an id containing quote characters cannot break the expression.
# NOTE(review): created_date is stored as ISO-8601 strings, so this sort is
# lexicographic; it matches chronological order only while all values share the
# same UTC offset — confirm.
tree_messages = df.loc[df["message_tree_id"] == message_tree_id]
tree_messages.sort_values("created_date")

Unnamed: 0,message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,rank,synthetic,model_name,detoxify,message_tree_id,tree_state,emojis,labels
50265,9dd36dd3-99e9-4f42-ac30-709ac6fc600e,,56ad4815-b98a-4d50-a990-650bc9b199ba,2023-02-10T03:06:19.644719+00:00,¿Por qué los colores no existen?,prompter,es,3,True,False,,False,,"{'toxicity': 0.0006504451157525182, 'severe_to...",9dd36dd3-99e9-4f42-ac30-709ac6fc600e,ready_for_export,"{'name': ['+1', '_skip_reply'], 'count': [3, 3]}","{'name': ['spam', 'lang_mismatch', 'pii', 'not..."
50268,ede665ec-9c8f-4a73-bd68-20b64ebcea82,9dd36dd3-99e9-4f42-ac30-709ac6fc600e,1d5ad424-3710-4cdf-894f-82adbb00ba52,2023-02-10T08:23:00.270735+00:00,El color no está en las cosas sino en la luz q...,assistant,es,3,True,False,2.0,False,,"{'toxicity': 0.0010371938114985824, 'severe_to...",9dd36dd3-99e9-4f42-ac30-709ac6fc600e,ready_for_export,"{'name': ['-1'], 'count': [1]}","{'name': ['spam', 'fails_task', 'lang_mismatch..."
50267,49799c87-477c-4ffd-98d8-1d5af4e6d196,9dd36dd3-99e9-4f42-ac30-709ac6fc600e,1823c6f8-e4ae-424a-b9a5-6d8e21ada7da,2023-02-10T08:23:37.635484+00:00,El color es una propiedad de la materia que se...,assistant,es,3,True,False,1.0,False,,"{'toxicity': 0.000575148151256144, 'severe_tox...",9dd36dd3-99e9-4f42-ac30-709ac6fc600e,ready_for_export,"{'name': ['+1', '_skip_labeling'], 'count': [1...","{'name': ['spam', 'fails_task', 'lang_mismatch..."
50266,ab023931-340e-4909-a8e1-a07a64bf8b45,9dd36dd3-99e9-4f42-ac30-709ac6fc600e,c68ad8c7-1269-4137-9eda-417eec9c46cc,2023-02-10T10:20:50.321351+00:00,Los colores son subjetivos y dependen de la pe...,assistant,es,3,True,False,0.0,False,,"{'toxicity': 0.0004009304684586823, 'severe_to...",9dd36dd3-99e9-4f42-ac30-709ac6fc600e,ready_for_export,"{'name': ['_skip_labeling'], 'count': [2]}","{'name': ['spam', 'fails_task', 'lang_mismatch..."
