# Basic Characterization of BC Datasets
This notebook reads three Parquet datasets and summarizes their structure.

In [1]:
%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

def read_parquet_local(path: str) -> pd.DataFrame:
    """Read a local Parquet file into a DataFrame, trying several readers.

    The readers are attempted in this order, and the first one that succeeds
    wins:

    1. ``pd.read_parquet(path)`` with pandas' default engine selection,
    2. ``pd.read_parquet(path, engine='fastparquet')``,
    3. ``pyarrow.parquet.read_table(path).to_pandas()``.

    Args:
        path: Location of the Parquet file.

    Returns:
        The file contents as a ``pd.DataFrame``.

    Raises:
        RuntimeError: If every reader fails. The message lists the failure of
            each attempt (not just the last one), and the last underlying
            exception is attached as ``__cause__``.
    """
    def _read_default() -> pd.DataFrame:
        return pd.read_parquet(path)

    def _read_fastparquet() -> pd.DataFrame:
        return pd.read_parquet(path, engine='fastparquet')

    def _read_pyarrow() -> pd.DataFrame:
        # Imported lazily so a missing pyarrow only fails this one attempt.
        import pyarrow.parquet as pq
        return pq.read_table(path).to_pandas()

    attempts = (
        ("pandas default engine", _read_default),
        ("fastparquet engine", _read_fastparquet),
        ("pyarrow direct", _read_pyarrow),
    )

    failures = []
    last_error = None
    for label, reader in attempts:
        try:
            return reader()
        except Exception as exc:
            # Keep every failure: the last one alone is often just a missing
            # optional dependency and hides the real cause (e.g. a bad path).
            failures.append(f"{label}: {type(exc).__name__}: {exc}")
            last_error = exc

    raise RuntimeError(
        f"Failed to read parquet file {path}: " + "; ".join(failures)
    ) from last_error

def human_bytes(n: int) -> str:
    """Format a byte count as a human-readable string with two decimals.

    The value is divided by 1024 until its magnitude drops below 1024 or the
    largest unit is reached, e.g. ``1536 -> "1.50 KB"``. Negative values
    (such as memory deltas) are scaled by magnitude and keep their sign.

    Args:
        n: Number of bytes.

    Returns:
        A string such as ``"654.47 KB"``; values of 1024 TB or more are
        reported in ``PB``.
    """
    sign = "-" if n < 0 else ""
    # Scale on the magnitude: comparing a negative number to 1024 would
    # always succeed and leave it stuck in plain bytes.
    size = abs(n)
    for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
        if size < 1024:
            return f"{sign}{size:.2f} {unit}"
        size /= 1024
    return f"{sign}{size:.2f} PB"

def characterize_df(name: str, df: pd.DataFrame, max_cols: int = 40):
    """Print and display a structural overview of one DataFrame.

    Emits, in order: a header with the dataset name, the row and column
    counts, the deep memory footprint, the first ``max_cols`` column names,
    the dtype table, a ``describe()`` summary with histograms for numeric
    columns (only when numeric columns exist), and the first five rows.
    Missing-value and unique-count tables are not produced.

    Args:
        name: Label used in the header and in the histogram title.
        df: Frame to summarize.
        max_cols: Maximum number of column names to print.

    Returns:
        None. Everything goes to stdout / the notebook display.
    """
    print(f"\n=== Dataset: {name} ===")
    row_count, col_count = len(df), df.shape[1]
    print(f"Rows: {row_count:,}, Columns: {col_count:,}")
    footprint = df.memory_usage(deep=True).sum()
    print(f"Approx. memory (deep): {human_bytes(footprint)}")
    print("Columns (truncated):", list(df.columns)[:max_cols])

    print("\nDtypes:")
    dtype_table = df.dtypes.to_frame('dtype')
    display(dtype_table)

    numeric = df.select_dtypes(include=[np.number])
    if not numeric.empty:
        print("\nNumeric summary:")
        display(numeric.describe().T)
        try:
            # Histograms cover at most the first 12 numeric columns and are
            # only drawn for frames with at least 10 rows.
            shown = min(numeric.shape[1], 12)
            if shown > 0 and row_count >= 10:
                fig_width = min(3 * shown, 24)
                numeric.iloc[:, :shown].hist(figsize=(fig_width, 12), bins=30)
                plt.suptitle(f"{name} - Histograms (first {shown} numeric columns)")
                plt.tight_layout()
                plt.show()
        except Exception as err:
            # Plotting is best-effort: report the problem and carry on.
            print("Histogram plotting skipped:", err)

    print("\nSample rows:")
    display(df.head(5))


In [3]:
# Directory holding the three parquet files. Set the BC_DATA_DIR environment
# variable to point at a different location; the default is the original path.
DATA_DIR = os.environ.get("BC_DATA_DIR", "/opt/tiger/verl_context_folding")
BCPLUS = os.path.join(DATA_DIR, "bc+debug.parquet")
BC_TRAIN = os.path.join(DATA_DIR, "bc_train_mh.parquet")
BC_TEST = os.path.join(DATA_DIR, "bc_test_emh.parquet")

# Fail fast, and name every missing file at once rather than only the first.
missing = [p for p in (BCPLUS, BC_TRAIN, BC_TEST) if not os.path.exists(p)]
assert not missing, f"File not found: {', '.join(missing)}"

df_plus = read_parquet_local(BCPLUS)
df_train = read_parquet_local(BC_TRAIN)
df_test = read_parquet_local(BC_TEST)
print("Loaded:", { 'bc+debug': df_plus.shape, 'bc_train_mh': df_train.shape, 'bc_test_emh': df_test.shape })


Loaded: {'bc+debug': (830, 6), 'bc_train_mh': (680, 6), 'bc_test_emh': (150, 6)}


In [6]:
# Run the same structural summary over each loaded frame, in load order.
for label, frame in (
    ('BCPLUS (bc+debug.parquet)', df_plus),
    ('BC_TRAIN (bc_train_mh.parquet)', df_train),
    ('BC_TEST (bc_test_emh.parquet)', df_test),
):
    characterize_df(label, frame)



=== Dataset: BCPLUS (bc+debug.parquet) ===
Rows: 830, Columns: 6
Approx. memory (deep): 654.47 KB
Columns (truncated): ['data_source', 'prompt', 'ability', 'reward_model', 'extra_info', 'answer']

Dtypes:


Unnamed: 0,dtype
data_source,object
prompt,object
ability,object
reward_model,object
extra_info,object
answer,object



Sample rows:


Unnamed: 0,data_source,prompt,ability,reward_model,extra_info,answer
0,browsecomp-plus,[{'content': 'You are an expert research agent...,LocalSearch,"{'ground_truth': None, 'style': 'agent_env'}","{'answer': 'Queen Arwa University', 'evidence_...",Queen Arwa University
1,browsecomp-plus,[{'content': 'You are an expert research agent...,LocalSearch,"{'ground_truth': None, 'style': 'agent_env'}","{'answer': 'Laura Lojo-Rodriguez', 'evidence_d...",Laura Lojo-Rodriguez
2,browsecomp-plus,[{'content': 'You are an expert research agent...,LocalSearch,"{'ground_truth': None, 'style': 'agent_env'}","{'answer': 'Vakkorama', 'evidence_docs': [{'do...",Vakkorama
3,browsecomp-plus,[{'content': 'You are an expert research agent...,LocalSearch,"{'ground_truth': None, 'style': 'agent_env'}","{'answer': 'Secretary', 'evidence_docs': [{'do...",Secretary
4,browsecomp-plus,[{'content': 'You are an expert research agent...,LocalSearch,"{'ground_truth': None, 'style': 'agent_env'}","{'answer': 'Red', 'evidence_docs': [{'docid': ...",Red



=== Dataset: BC_TRAIN (bc_train_mh.parquet) ===
Rows: 680, Columns: 6
Approx. memory (deep): 536.02 KB
Columns (truncated): ['data_source', 'prompt', 'ability', 'reward_model', 'extra_info', 'answer']

Dtypes:


Unnamed: 0,dtype
data_source,object
prompt,object
ability,object
reward_model,object
extra_info,object
answer,object



Sample rows:


Unnamed: 0,data_source,prompt,ability,reward_model,extra_info,answer
0,bc_train_meduim,[{'content': 'You are an expert research agent...,LocalSearch,"{'ground_truth': None, 'style': 'agent_env'}","{'answer': 'Queen Arwa University', 'evidence_...",Queen Arwa University
1,bc_train_hard,[{'content': 'You are an expert research agent...,LocalSearch,"{'ground_truth': None, 'style': 'agent_env'}","{'answer': 'Laura Lojo-Rodriguez', 'evidence_d...",Laura Lojo-Rodriguez
2,bc_train_meduim,[{'content': 'You are an expert research agent...,LocalSearch,"{'ground_truth': None, 'style': 'agent_env'}","{'answer': 'Vakkorama', 'evidence_docs': [{'do...",Vakkorama
3,bc_train_hard,[{'content': 'You are an expert research agent...,LocalSearch,"{'ground_truth': None, 'style': 'agent_env'}","{'answer': 'Secretary', 'evidence_docs': [{'do...",Secretary
4,bc_train_meduim,[{'content': 'You are an expert research agent...,LocalSearch,"{'ground_truth': None, 'style': 'agent_env'}","{'answer': 'Red', 'evidence_docs': [{'docid': ...",Red



=== Dataset: BC_TEST (bc_test_emh.parquet) ===
Rows: 150, Columns: 6
Approx. memory (deep): 117.83 KB
Columns (truncated): ['data_source', 'prompt', 'ability', 'reward_model', 'extra_info', 'answer']

Dtypes:


Unnamed: 0,dtype
data_source,object
prompt,object
ability,object
reward_model,object
extra_info,object
answer,object



Sample rows:


Unnamed: 0,data_source,prompt,ability,reward_model,extra_info,answer
0,bc_test_meduim,[{'content': 'You are an expert research agent...,LocalSearch,"{'ground_truth': None, 'style': 'agent_env'}",{'answer': 'Emmanuel Kwesi Danso Arthur Junior...,Emmanuel Kwesi Danso Arthur Junior
1,bc_test_easy,[{'content': 'You are an expert research agent...,LocalSearch,"{'ground_truth': None, 'style': 'agent_env'}","{'answer': 'Lebo', 'evidence_docs': [{'docid':...",Lebo
2,bc_test_easy,[{'content': 'You are an expert research agent...,LocalSearch,"{'ground_truth': None, 'style': 'agent_env'}","{'answer': 'Irving', 'evidence_docs': [{'docid...",Irving
3,bc_test_easy,[{'content': 'You are an expert research agent...,LocalSearch,"{'ground_truth': None, 'style': 'agent_env'}","{'answer': 'One Red Rose', 'evidence_docs': [{...",One Red Rose
4,bc_test_easy,[{'content': 'You are an expert research agent...,LocalSearch,"{'ground_truth': None, 'style': 'agent_env'}","{'answer': 'Rudy Cox', 'evidence_docs': [{'doc...",Rudy Cox


Notes:
- Reading Parquet requires either `pyarrow` or `fastparquet`.
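- If the imports fail, install the engines from inside the notebook with `%pip install pyarrow fastparquet` (so they go into the kernel's environment), then restart the kernel and re-run all cells.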

In [4]:
# Compare the prompt nested inside `extra_info` with the top-level `prompt`
# column for the first row; the comparison is element-wise.
first_row = df_plus.iloc[0]
first_row['extra_info']['prompt'] == first_row['prompt']

array([ True,  True])