<a href="https://colab.research.google.com/github/GeehanAli/Grooming-Detection-Academic-Study/blob/main/notebooks/02_generate_synthetic_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import random


In [2]:
# Message template pools for the synthetic chat generator (extend these lists
# to add more variety). Each grooming-stage list supplies the candidate texts
# for one stage; the generator picks one entry at random per stage.
# NOTE(review): several strings use the typographic apostrophe ’ (U+2019)
# rather than ASCII ' — confirm downstream tokenisation handles both forms.

# Candidate texts for the "friendship_forming" stage.
friendship_templates = [
    "hey, you seem cool, what games do you usually play?",
    "gg! want to play again tomorrow?",
    "you’re really good at this, how long have you been playing?",
]

# Candidate texts for the "trust_building" stage.
trust_building_templates = [
    "you can tell me anything, I’m not like other people.",
    "don’t worry, I always have your back.",
    "you seem different from other players, like I actually get you.",
]

# Candidate texts for the "isolation" stage.
isolation_templates = [
    "don’t tell your parents about our chats, they wouldn’t understand.",
    "you shouldn’t listen to your friends, they’re just jealous.",
    "we should keep this just between us, it’s our secret.",
]

# Candidate texts for the "escalation" stage.
escalation_templates = [
    "can you send me a pic just for me? I won’t show anyone.",
    "let’s talk somewhere more private, not in this chat.",
    "I feel really close to you, do you feel the same way about me?",
]

# Neutral chatter: used for every turn labelled "non_grooming" (stage "none"),
# both inside grooming conversations and in fully benign ones.
non_grooming_templates = [
    "brb, need to finish homework.",
    "my internet is lagging again omg.",
    "nice shot! that was crazy.",
    "anyone wanna join my team?",
    "gotta go, see you tomorrow.",
]


In [3]:
def generate_grooming_conversation(conv_id, min_length=6, max_length=12, rng=None):
    """Build one synthetic conversation that passes through every grooming stage.

    The conversation always has the same shape:

    1. two neutral opening turns (random speaker, label ``"non_grooming"``),
    2. one adult turn per stage, in the fixed order ``friendship_forming`` ->
       ``trust_building`` -> ``isolation`` -> ``escalation`` (label
       ``"grooming"``),
    3. neutral trailing turns that pad the conversation up to a target length
       drawn uniformly from ``[min_length, max_length]``.

    Steps 1 and 2 always emit six turns, so six is the effective minimum
    length: when the drawn target is below six no padding is added and the
    stage turns are never truncated.

    Args:
        conv_id: Identifier copied into every message's ``conversation_id``.
        min_length: Inclusive lower bound for the target number of turns.
        max_length: Inclusive upper bound for the target number of turns.
        rng: Optional random source exposing ``randint`` and ``choice`` (for
            example a seeded ``random.Random``). When ``None`` the module-level
            ``random`` generator is used, exactly as before this parameter
            existed.

    Returns:
        list[dict]: One dict per turn with the keys ``conversation_id``,
        ``turn_id`` (0-based position), ``speaker_role``, ``text``, ``label``
        and ``stage``.

    Raises:
        ValueError: If ``min_length`` is greater than ``max_length``.
    """
    if min_length > max_length:
        raise ValueError(
            f"min_length ({min_length}) must not exceed max_length ({max_length})"
        )

    rng = random if rng is None else rng
    target_length = rng.randint(min_length, max_length)

    stage_sequence = [
        ("friendship_forming", friendship_templates),
        ("trust_building", trust_building_templates),
        ("isolation", isolation_templates),
        ("escalation", escalation_templates),
    ]

    messages = []

    def add_message(speaker_role, text, label, stage):
        # turn_id is simply the position of the message in the conversation.
        messages.append({
            "conversation_id": conv_id,
            "turn_id": len(messages),
            "speaker_role": speaker_role,
            "text": text,
            "label": label,
            "stage": stage,
        })

    def add_neutral_message():
        # Draw the speaker before the text so the sequence of random draws
        # (and therefore seeded output) matches the original implementation.
        speaker_role = rng.choice(["adult", "child"])
        text = rng.choice(non_grooming_templates)
        add_message(speaker_role, text, "non_grooming", "none")

    # Start with some neutral chat.
    for _ in range(2):
        add_neutral_message()

    # One adult message per grooming stage, in stage order.
    for stage_name, templates in stage_sequence:
        add_message("adult", rng.choice(templates), "grooming", stage_name)

    # Pad with neutral chat up to the target length; a non-positive remainder
    # yields an empty range, so short targets simply add nothing.
    for _ in range(target_length - len(messages)):
        add_neutral_message()

    return messages


def generate_non_grooming_conversation(conv_id, min_length=6, max_length=12, rng=None):
    """Build one synthetic conversation made up solely of neutral chat.

    The number of turns is drawn uniformly from ``[min_length, max_length]``.
    Every turn has a randomly chosen speaker and a text drawn from
    ``non_grooming_templates``, with label ``"non_grooming"`` and stage
    ``"none"``.

    Args:
        conv_id: Identifier copied into every message's ``conversation_id``.
        min_length: Inclusive lower bound for the number of turns.
        max_length: Inclusive upper bound for the number of turns.
        rng: Optional random source exposing ``randint`` and ``choice`` (for
            example a seeded ``random.Random``). When ``None`` the module-level
            ``random`` generator is used, exactly as before this parameter
            existed.

    Returns:
        list[dict]: One dict per turn with the keys ``conversation_id``,
        ``turn_id`` (0-based position), ``speaker_role``, ``text``, ``label``
        and ``stage``. Empty when the drawn length is zero or negative.

    Raises:
        ValueError: If ``min_length`` is greater than ``max_length``.
    """
    if min_length > max_length:
        raise ValueError(
            f"min_length ({min_length}) must not exceed max_length ({max_length})"
        )

    rng = random if rng is None else rng
    length = rng.randint(min_length, max_length)

    # Dict values are evaluated in order, so the speaker is drawn before the
    # text for each turn; this keeps seeded output identical to the original.
    return [
        {
            "conversation_id": conv_id,
            "turn_id": turn_id,
            "speaker_role": rng.choice(["adult", "child"]),
            "text": rng.choice(non_grooming_templates),
            "label": "non_grooming",
            "stage": "none",
        }
        for turn_id in range(length)
    ]


In [4]:
# Seed the global generator so that "Restart Kernel -> Run All" rebuilds the
# exact same dataset (both generator functions draw from `random`).
SEED = 42
random.seed(SEED)

# Number of conversations to generate for each class.
grooming_convs = 100
non_grooming_convs = 100

all_messages = []

# Conversation ids are consecutive: grooming conversations first, then the
# non-grooming ones.
conv_id = 0
for _ in range(grooming_convs):
    all_messages.extend(generate_grooming_conversation(conv_id))
    conv_id += 1

for _ in range(non_grooming_convs):
    all_messages.extend(generate_non_grooming_conversation(conv_id))
    conv_id += 1

# Sanity checks: every conversation got its own id and only known labels occur.
assert len({m["conversation_id"] for m in all_messages}) == grooming_convs + non_grooming_convs
assert {m["label"] for m in all_messages} <= {"grooming", "non_grooming"}

# One row per message; built once from the list of records.
df = pd.DataFrame(all_messages)
df.head()


Unnamed: 0,conversation_id,turn_id,speaker_role,text,label,stage
0,0,0,child,my internet is lagging again omg.,non_grooming,none
1,0,1,child,my internet is lagging again omg.,non_grooming,none
2,0,2,adult,gg! want to play again tomorrow?,grooming,friendship_forming
3,0,3,adult,"you can tell me anything, I’m not like other p...",grooming,trust_building
4,0,4,adult,"we should keep this just between us, it’s our ...",grooming,isolation


In [6]:
# Write the raw (unprocessed) synthetic chats to disk; index=False leaves the
# DataFrame's row index out of the CSV.
raw_csv_path = "synthetic_chats_raw.csv"
df.to_csv(raw_csv_path, index=False)

# A "processed" version of the dataset can optionally be created later.
