Notebook 01: Data

In [12]:
from datasets import load_from_disk
import pandas as pd
from ace_tools_open import display_dataframe_to_user

# --- Load -------------------------------------------------------------------
# Read the merged InteroSight dataset from disk and materialise it as a
# pandas DataFrame.
ds = load_from_disk("../data/merged_dataverse")
df = ds.to_pandas()

# --- Topic columns ----------------------------------------------------------
# Everything except the 'id' and 'text' columns is treated as a topic column.
topic_cols = [name for name in df.columns if name != "id" and name != "text"]

# Coerce each topic column to a numeric dtype. Values that cannot be parsed
# become NaN under errors="coerce", and every NaN (whether originally missing
# or produced by coercion) is then replaced with 0.
df[topic_cols] = (
    df[topic_cols]
    .apply(lambda column: pd.to_numeric(column, errors="coerce"))
    .fillna(0)
)

# --- Inspect ----------------------------------------------------------------
# List the column names, then show the first 100 rows.
print("Dataset Columns:", df.columns.tolist())
display_dataframe_to_user("First 100 Rows of InteroSight Dataset", df.head(100))

# --- Topic presence ---------------------------------------------------------
# For each topic column, count the rows whose score is strictly positive and
# order the columns from most to fewest such rows.
presence_counts = df[topic_cols].gt(0).sum().sort_values(ascending=False)

# Turn the counts Series into a two-column table: the former index becomes
# "Topic Column" and the counts become "Posts with >0 Score".
presence_df = (
    presence_counts
    .rename_axis("Topic Column")
    .reset_index(name="Posts with >0 Score")
)

display_dataframe_to_user("Topic Presence Counts", presence_df)



Dataset Columns: ['id', 'text', 'gpt4o_relation', 'gpt4o_protein', 'gpt4o_ed', 'gpt4o_exercise', 'gpt4o_crave', 'gpt4o_restrict', 'gpt4o_binge', 'gpt4o_loss', 'gpt4o_gain', 'gpt4o_calorie', 'gpt4o_idealbody', 'gpt4o_bodyhate', 'gpt4o_feargain', 'gpt4o_fearfood', 'gpt4o_depressedmood', 'gpt4o_relation01', 'gpt4o_protein01', 'gpt4o_ed01', 'gpt4o_exercise01', 'gpt4o_crave01', 'gpt4o_restrict01', 'gpt4o_binge01', 'gpt4o_loss01', 'gpt4o_gain01', 'gpt4o_calorie01', 'gpt4o_idealbody01', 'gpt4o_bodyhate01', 'gpt4o_feargain01', 'gpt4o_fearfood01', 'gpt4o_depressedmood01', 'Llama-3.1-8B-Instruct_relation', 'Llama-3.1-8B-Instruct_protein', 'Llama-3.1-8B-Instruct_ed', 'Llama-3.1-8B-Instruct_exercise', 'Llama-3.1-8B-Instruct_crave', 'Llama-3.1-8B-Instruct_restrict', 'Llama-3.1-8B-Instruct_binge', 'Llama-3.1-8B-Instruct_loss', 'Llama-3.1-8B-Instruct_gain', 'Llama-3.1-8B-Instruct_calorie', 'Llama-3.1-8B-Instruct_idealbody', 'Llama-3.1-8B-Instruct_bodyhate', 'Llama-3.1-8B-Instruct_feargain', 'Llama-3.

id,text,gpt4o_relation,gpt4o_protein,gpt4o_ed,gpt4o_exercise,gpt4o_crave,gpt4o_restrict,gpt4o_binge,gpt4o_loss,gpt4o_gain,gpt4o_calorie,gpt4o_idealbody,gpt4o_bodyhate,gpt4o_feargain,gpt4o_fearfood,gpt4o_depressedmood,gpt4o_relation01,gpt4o_protein01,gpt4o_ed01,gpt4o_exercise01,gpt4o_crave01,gpt4o_restrict01,gpt4o_binge01,gpt4o_loss01,gpt4o_gain01,gpt4o_calorie01,gpt4o_idealbody01,gpt4o_bodyhate01,gpt4o_feargain01,gpt4o_fearfood01,gpt4o_depressedmood01,Llama-3.1-8B-Instruct_relation,Llama-3.1-8B-Instruct_protein,Llama-3.1-8B-Instruct_ed,Llama-3.1-8B-Instruct_exercise,Llama-3.1-8B-Instruct_crave,Llama-3.1-8B-Instruct_restrict,Llama-3.1-8B-Instruct_binge,Llama-3.1-8B-Instruct_loss,Llama-3.1-8B-Instruct_gain,Llama-3.1-8B-Instruct_calorie,Llama-3.1-8B-Instruct_idealbody,Llama-3.1-8B-Instruct_bodyhate,Llama-3.1-8B-Instruct_feargain,ensemble_relation,ensemble_protein,ensemble_ed,ensemble_exercise,ensemble_crave,ensemble_restrict,ensemble_binge,ensemble_loss,ensemble_gain,ensemble_calorie,ensemble_idealbody,ensemble_bodyhate,ensemble_feargain,ensemble_fearfood,ensemble_depressedmood,ensemble_relation01,ensemble_protein01,ensemble_ed01,ensemble_exercise01,ensemble_crave01,ensemble_restrict01,ensemble_binge01,ensemble_loss01,ensemble_gain01,ensemble_calorie01,ensemble_idealbody01,ensemble_bodyhate01,ensemble_feargain01,ensemble_fearfood01,ensemble_depressedmood01,human_relation01,human_protein01,human_ed01,human_exercise01,human_crave01,human_restrict01,human_binge01,human_loss01,human_gain01,human_calorie01,human_idealbody01,human_bodyhate01,human_feargain01,human_fearfood01,human_depressedmood01
Loading ITables v2.3.0 from the internet... (need help?),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


Topic Presence Counts


Topic Column,Posts with >0 Score
Loading ITables v2.3.0 from the internet... (need help?),


Of all the "topic" columns, 27 had no posts with a score greater than 0 (indicating no LLM- or human-coded occurrences). Notably, all 27 of these columns came from the Mistral-7B-Instruct-v0.3 model. Note that missing scores were filled with 0 before counting, so these all-zero columns may reflect missing Mistral scores rather than true zero scores; this should be confirmed against the raw data. Among the remaining columns, the top 10 "topics", in order, were: Vicuna-7b-v1.5_relation01, Vicuna-7b-v1.5_relation, ensemble_loss, ensemble_restrict, ensemble_exercise, Llama-3.1-8B-Instruct_loss01, Llama-3.1-8B-Instruct_loss, Llama-3.1-8B-Instruct_exercise01, ensemble_protein, Llama-3.1-8B-Instruct_exercise.