*This notebook applies Mistral classification to the Child Objectification Dataset.*

Install Mistral Inference and Load Model

In [None]:
!pip install mistral_inference


In [None]:
from huggingface_hub import snapshot_download # Huggingface Login might be necessary depending on your setup
from pathlib import Path

# Local folder under the current user's home directory that holds the model files
mistral_models_path = Path.home() / "mistral_models" / "7B-Instruct-v0.3"
mistral_models_path.mkdir(parents=True, exist_ok=True)

# Restrict the download to the listed files instead of pulling the whole repo
required_files = ["params.json", "consolidated.safetensors", "tokenizer.model.v3"]
snapshot_download(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    allow_patterns=required_files,
    local_dir=mistral_models_path,
)


MULTI PROMPT SETUP

In [None]:
import pandas as pd
import os
import re
from tqdm import tqdm

from mistral_inference.transformer import Transformer
from mistral_inference.generate import generate
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest

# Load Mistral Model
# Resolve the model folder from the current user's home directory so it matches
# the location the download cell writes to (Path.home()/mistral_models/...).
# When the home directory is /root this yields the same string as the previous
# hard-coded value; the path is intentionally kept as a plain str.
mistral_models_path = os.path.join(
    os.path.expanduser("~"), "mistral_models", "7B-Instruct-v0.3"
)
tokenizer = MistralTokenizer.from_file(
    os.path.join(mistral_models_path, "tokenizer.model.v3")
)
model = Transformer.from_folder(mistral_models_path)

# Load comments
# The workbook must contain a "text" column; every row becomes one comment.
# NOTE(review): empty cells in "text" would presumably come through as NaN
# floats rather than strings — confirm the sheet has none before running.
df = pd.read_excel("/INSERT-DATA-PATH.xlsx")
comments = df["text"].tolist()

# Define classification prompts
# Maps a prompt name to a prompt template. Each template contains a single
# "{comment}" placeholder that classify_comment() fills in via str.format, and
# each ends by asking the model to return only 0 or 1. The dictionary keys are
# also used as the prefix of the per-prompt output file names.
prompt_dict = {
    # Appearance-description task, instructions only (no worked examples)
    "neutral_zero": """Extract all comments that describe the physical appearance of the child in the video, including those that consist only of emojis.
    The comments should specifically refer to the child shown and include observations about their appearance. Only include comments that are positive
    or neutral in tone. Ignore any comments that do not reference the child’s physical appearance or visual traits.
    Include comments that can be used to describe appearance. Take note that these are TikTok comments, so be aware of TikTok-specific language.
    - **1** = Describes physical appearance or body parts or outfit-related things.
    - **0** = Does not describe physical appearance (e.g., behavior, voice, actions).

    Comment: "{comment}"
    Classification (ONLY return 0 or 1):""",

    # Same appearance-description task, with three labelled examples appended
    "neutral_few": """Extract all comments that describe the physical appearance of the child in the video, including those that consist only of emojis.
    The comments should specifically refer to the child shown and include observations about their appearance. Only include comments that are positive
    or neutral in tone. Ignore any comments that do not reference the child’s physical appearance or visual traits.
    Include comments that can be used to describe appearance. Take note that these are TikTok comments, so be aware of TikTok-specific language.
    - **1** = Describes physical appearance or body parts or outfit-related things.
    - **0** = Does not describe physical appearance (e.g., behavior, voice, actions).

    Examples:
    - "She is cute" → 1
    - "I keep wondering who he looks like. He looks like Harry Potter." → 1
    - "The way she says ball is so cute!" → 0

    Comment: "{comment}"
    Classification (ONLY return 0 or 1):""",

    # Objectifying/inappropriate-language task, instructions only (no worked examples)
    "negative_zero": """Identify comments that describe the physical appearance of the child in the video, including those consisting only of emojis.
    Focus on language that may be objectifying, overly personal, possessive, sexualizing, diminishing, or otherwise inappropriate in tone.
    Take note that these are TikTok comments, so be aware of TikTok-specific language. Exclude comments that are unrelated to the child.
    - **1** = Contains objectifying or inappropriate language.
    - **0** = Does not.

    Comment: "{comment}"
    Classification (ONLY return 0 or 1):""",

    # Same objectifying/inappropriate-language task, with three labelled examples appended
    "negative_few": """Identify comments that describe the physical appearance of the child in the video, including those consisting only of emojis.
    Focus on language that may be objectifying, overly personal, possessive, sexualizing, diminishing, or otherwise inappropriate in tone.
    Take note that these are TikTok comments, so be aware of TikTok-specific language. Exclude comments that are unrelated to the child.
    - **1** = Contains objectifying or inappropriate language.
    - **0** = Does not.

    Examples:
    - "What a cute little princess" → 1
    - "I Love You . SO Much.@" → 1
    - "She looks adorable today!" → 0

    Comment: "{comment}"
    Classification (ONLY return 0 or 1):"""
}

# Single-Comment Classification Function
def classify_comment(comment, prompt, *, max_tokens=2, temperature=0.1):
    """Classify a single comment using the loaded Mistral model.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        comment: The comment to classify; substituted into the template.
        prompt: Prompt template containing a ``{comment}`` placeholder.
        max_tokens: Maximum number of tokens to generate. Defaults to 2,
            the value previously hard-coded here.
        temperature: Sampling temperature passed to ``generate``. Defaults
            to 0.1, the value previously hard-coded here.

    Returns:
        ``"0"`` or ``"1"`` as a string (the last such digit found in the
        decoded model output), or ``"Error"`` when the output contains
        neither digit.
    """
    # Build a single-turn chat request from the filled-in template
    input_text = prompt.format(comment=comment)
    request = ChatCompletionRequest(messages=[UserMessage(content=input_text)])

    # Tokenize input
    tokens = tokenizer.encode_chat_completion(request).tokens

    # Generate output for a batch containing just this one prompt
    raw_tokenizer = tokenizer.instruct_tokenizer.tokenizer
    out_tokens, _ = generate(
        [tokens],
        model,
        max_tokens=max_tokens,
        temperature=temperature,
        eos_id=raw_tokenizer.eos_id,
    )

    # Decode the only sequence in the batch and trim surrounding whitespace
    decoded_output = raw_tokenizer.decode(out_tokens[0]).strip()

    # Extract classification: if several 0/1 digits appear, the last one wins
    digits = re.findall(r"[01]", decoded_output)
    return digits[-1] if digits else "Error"

# Folder that receives one Excel file per prompt; created if it does not exist
output_dir = "/INSERT-PATH"
os.makedirs(output_dir, exist_ok=True)

# Run classification for all prompts
for name, prompt in prompt_dict.items():
    print(f"\n Processing '{name}' prompt...")

    results = []
    progress = tqdm(enumerate(comments), total=len(comments), desc=f"Processing {name}")
    for idx, comment in progress:
        classification = classify_comment(comment, prompt)
        results.append({"comment": comment, "classification": classification})

        # Only the first 10 results are echoed, as a quick debugging aid
        if idx >= 10:
            continue
        print(
            f"Comment: {comment}",
            f"Classification: {classification}",
            "-" * 30,
            sep="\n",
        )

    # Write this prompt's results to "<name>_Mistral.xlsx" in the output folder
    output_file = os.path.join(output_dir, f"{name}_Mistral.xlsx")
    df_results = pd.DataFrame(results)
    df_results.to_excel(output_file, index=False)

