# 02_prepare_training_data

In this notebook we:
1. Load the merged dataset from `data/merged_dataverse`  
2. Compute a `topics_list` column for each post (the names of the topic columns whose value is a positive number)  
3. Filter out any posts with no topics  
4. Preview the resulting DataFrame  

This prepares our data for instruction–response pair generation.


In [None]:
# Cell 1: imports & load
import numbers
from pathlib import Path

import pandas as pd
from datasets import load_from_disk

# 1) Load merged dataset (path is relative to this notebook's directory)
ROOT = Path("..")
ds = load_from_disk(ROOT / "data" / "merged_dataverse")
df = ds.to_pandas()

# 2) Identify all topic columns: every column other than "id" and "text"
topic_cols = [c for c in df.columns if c not in ("id", "text")]


def _active_topics(row):
    """Return the names of the columns in `row` whose value is a positive number.

    Non-numeric cells (e.g. strings) are skipped instead of being compared,
    because comparing a str with 0 raises TypeError. `numbers.Real` is used
    rather than `(int, float)` so that NumPy scalars such as `np.int64`,
    which are not subclasses of Python `int`, still count as numeric.
    NaN never compares greater than 0, so missing values are excluded too.

    Parameters
    ----------
    row : pandas.Series
        One row of the topic columns (index = column names).

    Returns
    -------
    list
        Column names with a real-valued entry strictly greater than 0,
        in the order they appear in `row`.
    """
    return [
        col
        for col, value in row.items()
        if isinstance(value, numbers.Real) and value > 0
    ]


# 3) Build topics_list for each row
df["topics_list"] = df[topic_cols].apply(_active_topics, axis=1)

# 4) Filter out posts with no topics
has_topics = df["topics_list"].map(len) > 0
df = df[has_topics].reset_index(drop=True)

# 5) Preview
print(f"Total training examples: {len(df)}")
df.head(5)


TypeError: '>' not supported between instances of 'str' and 'int'