*This notebook applies Llama 3 (Meta-Llama-3-8B) classification to the Child Objectification Dataset.*

Install libraries

In [None]:
!pip install transformers accelerate bitsandbytes sentencepiece


Load Model


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer # Huggingface Login might be necessary depending on your setup
import torch

# Hugging Face checkpoint used for both the tokenizer and the causal LM.
# The same identifier is later written to the "model" column and the output file name.
model_name = "meta-llama/Meta-Llama-3-8B"

# Tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Half-precision weights; device_map="auto" lets the library decide device placement.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")


Load Data, Define Prompts, and Run Classification

In [None]:
import pandas as pd
import os
import re
from tqdm import tqdm

# Load comments: one classification input per row of the "text" column.
df = pd.read_excel("/INSERT-DATA-PATH.xlsx")
comments = df["text"].tolist()

# classify_batch tokenizes whole batches with padding=True, which raises if the
# tokenizer has no padding token. The tokenizer of this checkpoint may ship
# without one, so fall back to the EOS token as padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Pad on the left so that, for every row of a batch, the generated tokens
# directly follow the end of the prompt rather than a run of padding.
tokenizer.padding_side = "left"
model.config.pad_token_id = tokenizer.eos_token_id

# Define prompts
# Maps a prompt name to a prompt template. Each template contains a single
# "{comment}" placeholder that classify_batch fills via str.format, and ends by
# asking for a 0/1 answer. The dict key is also used as the prefix of the
# output file name in the run loop below.
# NOTE: the template text is sent to the model verbatim, so wording (including
# typos such as "Take not") is part of the experimental setup; editing it
# changes the model input.
prompt_dict = {
    # Appearance-description task (positive/neutral tone), instructions only.
    "neutral_zero": """Extract all comments that describe the physical appearance of the child in the video, including those that consist only of emojis.
    The comments should specifically refer to the child shown and include observations about their appearance. Only include comments that are positive
    or neutral in tone. Ignore any comments that do not reference the child’s physical appearance or visual traits.
    Include comments that can be used to describe appearance. Take not that these are TikTok comments, so be aware of TikTok specific language.
    - **1** = Describes physical appearance or body parts or outfit-related things.
    - **0** = Does not describe physical appearance (e.g., behavior, voice, actions).

    Comment: "{comment}"
    Classification (ONLY return 0 or 1):""",

    # Same appearance-description task, with three labelled examples added.
    "neutral_few": """Extract all comments that describe the physical appearance of the child in the video, including those that consist only of emojis.
    The comments should specifically refer to the child shown and include observations about their appearance. Only include comments that are positive
    or neutral in tone. Ignore any comments that do not reference the child’s physical appearance or visual traits.
    Include comments that can be used to describe appearance. Take not that these are TikTok comments, so be aware of TikTok specific language.
    - **1** = Describes physical appearance or body parts or outfit-related things.
    - **0** = Does not describe physical appearance (e.g., behavior, voice, actions).

    Examples:
    - "She is cute" → 1
    - "I keep wondering who he looks like. He looks like Harry Potter." → 1
    - "The way she says ball is so cute!" → 0

    Comment: "{comment}"
    Classification (ONLY return 0 or 1):""",

    # Objectifying/inappropriate-language task, instructions only.
    "negative_zero": """Identify comments that describe the physical appearance of the child in the video, including those consisting only of emojis.
    Focus on language that may be objectifying, overly personal, possessive, sexualizing, diminishing, or otherwise inappropriate in tone.
    Take not that these are TikTok comments, so be aware of TikTok specific language. Exclude comments that are unrelated to the child.
    - **1** = Contains objectifying or inappropriate language.
    - **0** = Does not.

    Comment: "{comment}"
    Classification (ONLY return 0 or 1):""",

    # Same objectifying/inappropriate-language task, with three labelled examples added.
    "negative_few": """Identify comments that describe the physical appearance of the child in the video, including those consisting only of emojis.
    Focus on language that may be objectifying, overly personal, possessive, sexualizing, diminishing, or otherwise inappropriate in tone.
    Take not that these are TikTok comments, so be aware of TikTok specific language. Exclude comments that are unrelated to the child.
    - **1** = Contains objectifying or inappropriate language.
    - **0** = Does not.

    Examples:
    - "What a cute little princess" → 1
    - "I Love You . SO Much.@" → 1
    - "She looks adorable today!" → 0

    Comment: "{comment}"
    Classification (ONLY return 0 or 1):"""
}

# Define batch classification function
def classify_batch(comments, prompt):
    """Classify a batch of comments as "0" or "1" with the loaded language model.

    Each comment is inserted into ``prompt`` with ``str.format``, the batch is
    tokenized with padding, and at most two new tokens are generated greedily
    for every prompt. Only the newly generated tokens are inspected for the
    label.

    Args:
        comments: List of comment texts to classify.
        prompt: Prompt template containing a ``{comment}`` placeholder.

    Returns:
        A list with one entry per comment, in input order: "0" or "1", or
        "Error" when the generated continuation contains neither digit.
    """
    # Nothing to tokenize or generate for an empty batch.
    if not comments:
        return []

    input_texts = [prompt.format(comment=c) for c in comments]
    # Move the inputs to wherever the model lives instead of assuming "cuda".
    inputs = tokenizer(
        input_texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=1024,
    ).to(model.device)

    with torch.no_grad():
        # Greedy decoding: sampling parameters (temperature, top_k) are not
        # used when do_sample=False, so they are not passed.
        outputs = model.generate(**inputs, max_new_tokens=2, do_sample=False)

    # generate() returns prompt + continuation for decoder-only models. The
    # prompt itself contains the digits 0 and 1 (and ends with "0 or 1"), so
    # decoding the full sequence would make the digit search below match the
    # prompt whenever the model produced no digit. Keep the new tokens only.
    prompt_length = inputs["input_ids"].shape[1]
    decoded_outputs = [
        tokenizer.decode(o[prompt_length:], skip_special_tokens=True).strip()
        for o in outputs
    ]

    # Print debug info to check the final classification extraction
    print("\n--- DEBUG: Model Outputs ---")
    for comment, decoded in zip(comments[:5], decoded_outputs[:5]):
        print(f"Input: {comment}")
        print(f"Model Output: {decoded}")
        print("-" * 30)

    # Extract the last 0/1 digit of the continuation - this is our classification result
    classifications = []
    for decoded in decoded_outputs:
        digits = re.findall(r"[01]", decoded)
        classifications.append(digits[-1] if digits else "Error")

    return classifications


# Directory that receives one Excel file per prompt; created if it does not exist.
output_dir = "/INSERT-PATH"
os.makedirs(output_dir, exist_ok=True)

# Number of comments sent to the model per call.
batch_size = 32

# The model identifier contains "/", which must not end up in a file name.
safe_model_name = model_name.replace('/', '_')

# Run every prompt over the full comment list and save the labels per prompt.
for name, prompt in prompt_dict.items():
    print(f"Processing '{name}' prompt...")

    results = []
    with tqdm(total=len(comments), desc=f"Processing {name}") as pbar:
        for i in range(0, len(comments), batch_size):
            batch = comments[i : i + batch_size]
            classifications = classify_batch(batch, prompt)

            # One record per comment, pairing it with its predicted label.
            for c, cls in zip(batch, classifications):
                results.append({"comment": c, "model": model_name, "classification": cls})

            pbar.update(len(batch))

    # Write this prompt's results to "<prompt name>_<model name>.xlsx".
    df_results = pd.DataFrame(results)
    output_file = os.path.join(output_dir, f"{name}_{safe_model_name}.xlsx")
    df_results.to_excel(output_file, index=False)

