# Data Exploration

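In this notebook, we explore `CarperAI/openai_summarize_comparisons`, the dataset used for training the RLHF model. We examine its structure, plot the distribution of prompt and summary lengths, check for missing values, and inspect a few random samples to understand the features available for training.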

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset

# Name of the dataset to fetch, kept in one place so it is easy to spot and change
DATASET_ID = "CarperAI/openai_summarize_comparisons"
dataset = load_dataset(DATASET_ID)

# Print the loaded object to show how the dataset is structured
print(dataset)

In [2]:
# Explore the training dataset.
# Convert the split with the `datasets` library's own `to_pandas()` method
# rather than `pd.DataFrame(train_data)`, which has to build the frame by
# iterating over the split one row at a time.
train_data = dataset["train"]
train_data_df = train_data.to_pandas()

# Display the first few rows of the training dataset
train_data_df.head()

In [3]:
# Visualize the distribution of the lengths of prompts and summaries.
# Lengths are character counts. `.str.len()` is vectorized and yields NaN for
# a missing entry, whereas `.apply(len)` raises TypeError on None/NaN -- and
# the notebook only checks for missing values in a later cell.
train_data_df['prompt_length'] = train_data_df['prompt'].str.len()
train_data_df['chosen_length'] = train_data_df['chosen'].str.len()

# Use the explicit figure/axes interface instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(12, 6))
# Drop NaN lengths so rows with missing text cannot interfere with the binning.
ax.hist(train_data_df['prompt_length'].dropna(), bins=50, alpha=0.5, label='Prompt Length')
ax.hist(train_data_df['chosen_length'].dropna(), bins=50, alpha=0.5, label='Chosen Summary Length')
ax.set_title('Distribution of Prompt and Summary Lengths')
ax.set_xlabel('Length')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()

In [4]:
# Count the null entries in every column of the training frame,
# then print only the columns that actually contain some.
missing_values = train_data_df.isna().sum()
print(missing_values.loc[missing_values.gt(0)])

In [5]:
# Sample some random examples from the dataset.
# A fixed `random_state` makes the draw reproducible, so the same five rows
# are shown after every "Restart Kernel -> Run All".
sampled_data = train_data_df.sample(n=5, random_state=42)
sampled_data[['prompt', 'chosen']]