In [1]:
import json
import random
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm.notebook import trange, tqdm
from datetime import datetime
from uuid import uuid4
import ollama


In [None]:
pip install ollama


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [9]:

# # Load LLaMA model
def load_llama_model(model_name: str = "/home/g4/Llama-3.2-3B-Instruct"):
    """Load a causal-LM checkpoint and its tokenizer and move the model to a device.

    Args:
        model_name: Path or identifier passed to ``from_pretrained`` for both the
            model and the tokenizer. Defaults to the local Llama-3.2-3B-Instruct
            checkpoint used by this notebook.

    Returns:
        A ``(model, tokenizer, device)`` tuple. ``model`` has already been moved
        to ``device``, which is CUDA when available and CPU otherwise.
    """
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Fall back to EOS only when no pad token is configured. Compare against
    # None instead of using `or`: token id 0 is falsy but is still a valid id
    # and must not be overwritten.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    if model.config.pad_token_id is None:
        eos_token_id = model.config.eos_token_id
        # The config may carry several EOS ids; a pad id has to be a single
        # value, so use the first one.
        if isinstance(eos_token_id, (list, tuple)):
            eos_token_id = eos_token_id[0] if eos_token_id else None
        model.config.pad_token_id = eos_token_id

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    return model.to(device), tokenizer, device

#  response = ollama.chat(
#         model='deepseek-r1:14b',
#         messages=[{
#             'role': 'system',
#             'content': (
#                 "Answer ONLY 'yes' or 'no' to this question about thought patterns. "
#                 "Use the following definitions to be able to reason with them: All or Nothing Thinking/Polarized Thinking, "
#                 "Fortune telling (catastrophizing), Emotional reasoning, Labeling/Global Labeling, Mental Filter, Mind reading, "
#                 "Overgeneralization, Personalization, Should statements, Blaming, What if?, Discounting the positive, "
#                 "Magnification/minimization, and Jumping to conclusions."
#             )
#         }, {
#             'role': 'user',
#             'content': f"Thought: {thought}\nQuestion: {current_node['question']}"
#         }],
#         options={'temperature': 0}
#     )




In [10]:
# Load the model once up front; generate_response reads `model`, `tokenizer`
# and `device` as module-level globals.
model, tokenizer, device = load_llama_model()

# Demographic / emotional pools sampled by generate_random_seed
sex_options = [
    "Male",
    "Female",
]
ages = [*range(18, 80)]  # 18 through 79 inclusive
occupations = [
    "Teacher",
    "Software Engineer",
    "Doctor",
    "Nurse",
    "Artist",
    "Retail Worker",
    "Scientist",
    "Chef",
    "Police Officer",
    "Journalist",
]
relationship_statuses = [
    "Single",
    "Married",
    "Divorced",
    "Widowed",
    "In a Relationship",
]
negative_emotions = [
    "Anger", "Anxiety", "Bitterness", "Contempt",
    "Despair", "Disappointment", "Disgust", "Embarrassment",
    "Envy", "Fear", "Frustration", "Grief",
    "Guilt", "Hatred", "Helplessness", "Hopelessness",
    "Humiliation", "Insecurity", "Irritation", "Jealousy",
    "Loneliness", "Melancholy", "Misery", "Neglect",
    "Panic", "Paranoia", "Rage", "Regret",
    "Rejection", "Remorse", "Resentment", "Sadness",
    "Shame", "Sorrow", "Spite", "Stress",
    "Suffering", "Unhappiness", "Vengefulness", "Worry",
]

# Writing-style pools sampled by generate_prompt to diversify the output
tones = [
    "angry",
    "calm",
    "frustrated",
    "sarcastic",
    "venting",
    "blunt",
    "sad",
    "anxious",
    "direct",
]
structures = [
    "structured",
    "scattered",
    "nuanced",
    "blunt",
    "sarcastic",
]
intensities = [
    "mild",
    "intense",
    "slightly annoyed",
    "highly emotional",
    "casual",
]
use_of_language = [
    "use cuss words",
    "avoid cuss words",
    "use formal language",
    "use casual language",
]

# System prompt sent with every request in generate_response
system_message = """
You are an AI assistant trained to generate realistic triggering events based on human demographic and emotional "seeds."
Each event should reflect the individual's characteristics and emotional state subtly, creating a compelling and lifelike narrative.
Ensure variability in writing style, structure, and intensity. Do not directly state that the person is experiencing a cognitive distortion.
"""


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using device: cuda


In [11]:


def generate_response(text: str, count: int) -> str:
    """Generate one chat completion for ``text`` under the notebook's system prompt.

    Reads the module-level ``model``, ``tokenizer``, ``device`` and
    ``system_message``.

    Args:
        text: Content of the user turn.
        count: Upper bound on newly generated tokens (passed as ``max_new_tokens``).

    Returns:
        The decoded completion only: prompt tokens are sliced off and special
        tokens (e.g. the end-of-turn marker) are removed from the text.
    """
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": text},
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        return_dict=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # generate() returns prompt + completion; remember where the prompt ends.
    prompt_length = len(inputs["input_ids"][0])

    outputs = model.generate(
        **inputs,
        max_new_tokens=count,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    completion_ids = outputs[0][prompt_length:]
    # skip_special_tokens=True keeps the trailing EOS / end-of-turn marker out
    # of the returned text; without it the marker is decoded as literal text.
    return tokenizer.decode(completion_ids, skip_special_tokens=True)
    # NOTE: an ollama.chat-based variant (model "llama3.2:3b-instruct-fp16")
    # was previously sketched here as commented-out code.




In [4]:
# Generate a random "seed"
def generate_random_seed():
    """Draw one random demographic/emotional profile.

    Returns:
        dict with the keys "Sex", "Age", "Occupation", "Relationship Status"
        and "Negative Emotion", each picked with ``random.choice`` from the
        matching module-level pool.
    """
    # (output key, pool) pairs; the order fixes both the dict key order and
    # the order in which the random generator is consumed.
    attribute_pools = (
        ("Sex", sex_options),
        ("Age", ages),
        ("Occupation", occupations),
        ("Relationship Status", relationship_statuses),
        ("Negative Emotion", negative_emotions),
    )
    return {label: random.choice(pool) for label, pool in attribute_pools}



In [5]:
# Generate a writing prompt based on the demographic "seed"
def generate_prompt(seed):
    """Turn a demographic seed into a prompt and return the generated scenario.

    Args:
        seed: Mapping with the keys "Sex", "Age", "Occupation",
            "Relationship Status" and "Negative Emotion".

    Returns:
        The whitespace-stripped text returned by ``generate_response`` for the
        assembled prompt (requested with a budget of 800 new tokens).
    """
    # Bullet list describing the person, one line per seed attribute.
    profile = "".join(
        f"- {label}: {seed[key]}\n"
        for label, key in (
            ("Sex", "Sex"),
            ("Age", "Age"),
            ("Occupation", "Occupation"),
            ("Relationship Status", "Relationship Status"),
            ("Experiencing the emotion", "Negative Emotion"),
        )
    )

    # Style modifiers, drawn in a fixed order: tone, structure, intensity, language.
    tone = random.choice(tones)
    structure = random.choice(structures)
    intensity = random.choice(intensities)
    language_rule = random.choice(use_of_language)

    user_message = (
        "Generate a triggering event scenario for a person with the following characteristics:\n"
        + profile
        + "\n"
        + f"The scenario should be written in a {tone} tone, with a {structure} structure, "
        + f"expressing a {intensity} level of intensity, and {language_rule}. "
        + "The response should be a self-contained short story reflecting the triggering event. "
        + "You should output the event only without any commentary."
    )

    scenario = generate_response(user_message, 800)
    return scenario.strip()



In [12]:
# Generate data
output_dir = "/home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_llama_new"
os.makedirs(output_dir, exist_ok=True)

num_samples = 2000  # Total samples to generate
batch_size = 10  # Samples per output file (generation itself runs one sample at a time)

# Ceiling division: a trailing partial batch is still generated when
# num_samples is not a multiple of batch_size (floor division would drop it).
num_batches = (num_samples + batch_size - 1) // batch_size

generated_data = []
for batch_index in trange(num_batches, desc="Generating Data"):
    # The last batch may be smaller than batch_size.
    samples_in_batch = min(batch_size, num_samples - batch_index * batch_size)

    batch_data = []
    for _ in range(samples_in_batch):
        seed = generate_random_seed()
        try:
            scenario = generate_prompt(seed)
        except Exception as e:
            # Best effort: report the failure and move on to the next sample.
            print(f"Error generating scenario: {e}")
            continue
        batch_data.append({
            "Sex": seed["Sex"],
            "Age": seed["Age"],
            "Occupation": seed["Occupation"],
            "Relationship Status": seed["Relationship Status"],
            "Negative Emotion": seed["Negative Emotion"],
            "Generated Scenario": scenario
        })

    generated_data.extend(batch_data)

    if not batch_data:
        # Every sample in this batch failed; do not leave an empty file behind.
        print("No samples generated in this batch; nothing saved.")
        continue

    # Save each batch to its own file so an interruption loses at most one batch.
    # The uuid suffix keeps names unique even if two batches share a timestamp.
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    file_name = f"{output_dir}/{timestamp}_{uuid4().hex}.json"
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(batch_data, f, indent=4)
    print(f"Saved {len(batch_data)} samples to {file_name}")

Generating Data:   0%|          | 0/200 [00:00<?, ?it/s]

Saved 10 samples to /home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_llama_new/2025-04-19_17-53-17_a7486593f0c54dfdb8ed2675a267c41e.json
Saved 10 samples to /home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_llama_new/2025-04-19_17-54-53_3cee485380d142c19948ddef2b9a7f5c.json
Saved 10 samples to /home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_llama_new/2025-04-19_17-56-31_38edc579c0a741da8ccc6df953d06215.json
Saved 10 samples to /home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_llama_new/2025-04-19_17-58-02_63d8b329b74e447fb17cc5936840ed42.json
Saved 10 samples to /home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_llama_new/2025-04-19_17-59-50_6da3c54c1df949f49fe46fce556154dc.json
Saved 10 samples to /home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_llama_new/2025-04-19_18-01-24_1eef47326a114088aef7c7f8220b8d85.json
Saved 10 samples to /home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_llama_new/2025-04-19_18-02-59_dc97695dc