In [1]:
import json
import random
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm.notebook import trange, tqdm
from datetime import datetime
from uuid import uuid4
import ollama


In [2]:
pip install ollama


Note: you may need to restart the kernel to use updated packages.


In [4]:

# Load LLaMA model
def load_llama_model(model_name: str = "/home/g4/Llama-3.2-3B-Instruct"):
    """Load a causal language model and its tokenizer and move the model to a device.

    Args:
        model_name: Path or identifier passed to ``from_pretrained`` for both
            the model and the tokenizer. Defaults to the local checkpoint
            this notebook was written against.

    Returns:
        A ``(model, tokenizer, device)`` tuple, where ``model`` has already
        been moved to ``device`` (CUDA when available, otherwise CPU).
    """
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Compare against None explicitly: a token id of 0 is a legitimate id and
    # must not be replaced just because it is falsy.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    # Mirror the tokenizer's pad id on the model so both agree. The model
    # config's eos_token_id may be a list of ids in some configs, which is
    # not usable as a single pad id.
    if model.config.pad_token_id is None:
        model.config.pad_token_id = tokenizer.pad_token_id

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    return model.to(device), tokenizer, device

#  response = ollama.chat(
#         model='deepseek-r1:14b',
#         messages=[{
#             'role': 'system',
#             'content': (
#                 "Answer ONLY 'yes' or 'no' to this question about thought patterns. "
#                 "Use the following definitions to be able to reason with them: All or Nothing Thinking/Polarized Thinking, "
#                 "Fortune telling (catastrophizing), Emotional reasoning, Labeling/Global Labeling, Mental Filter, Mind reading, "
#                 "Overgeneralization, Personalization, Should statements, Blaming, What if?, Discounting the positive, "
#                 "Magnification/minimization, and Jumping to conclusions."
#             )
#         }, {
#             'role': 'user',
#             'content': f"Thought: {thought}\nQuestion: {current_node['question']}"
#         }],
#         options={'temperature': 0}
#     )




In [5]:
# Load the model once; the generation helpers in later cells read these globals.
model, tokenizer, device = load_llama_model()

# ---------------------------------------------------------------------------
# Pools of "seed" attributes sampled when building a scenario request
# ---------------------------------------------------------------------------
sex_options = ["Male", "Female"]
ages = [age for age in range(18, 80)]  # 18 through 79 inclusive
occupations = [
    "Teacher",
    "Software Engineer",
    "Doctor",
    "Nurse",
    "Artist",
    "Retail Worker",
    "Scientist",
    "Chef",
    "Police Officer",
    "Journalist",
]
relationship_statuses = [
    "Single",
    "Married",
    "Divorced",
    "Widowed",
    "In a Relationship",
]
negative_emotions = [
    "Anger", "Anxiety", "Bitterness", "Contempt", "Despair",
    "Disappointment", "Disgust", "Embarrassment", "Envy", "Fear",
    "Frustration", "Grief", "Guilt", "Hatred", "Helplessness",
    "Hopelessness", "Humiliation", "Insecurity", "Irritation", "Jealousy",
    "Loneliness", "Melancholy", "Misery", "Neglect", "Panic",
    "Paranoia", "Rage", "Regret", "Rejection", "Remorse",
    "Resentment", "Sadness", "Shame", "Sorrow", "Spite",
    "Stress", "Suffering", "Unhappiness", "Vengefulness", "Worry",
]

# ---------------------------------------------------------------------------
# Pools of writing-style variants, sampled to diversify the generated text
# ---------------------------------------------------------------------------
tones = [
    "angry",
    "calm",
    "frustrated",
    "sarcastic",
    "venting",
    "blunt",
    "sad",
    "anxious",
    "direct",
]
structures = [
    "formatted",
    "scattered",
    "nuanced",
    "blunt",
    "sarcastic",
]
intensities = [
    "mild",
    "intense",
    "slightly annoyed",
    "highly emotional",
    "casual",
]
use_of_language = [
    "use cuss words",
    "avoid cuss words",
    "use informal language",
    "use casual language",
]

# System prompt sent with every request; it begins and ends with a newline.
system_message = (
    "\n"
    'You are an AI assistant trained to generate realistic triggering events based on human demographic and emotional "seeds."\n'
    "Each event should reflect the individual's characteristics and emotional state subtly, creating a compelling and lifelike narrative.\n"
    "Ensure variability in writing style, structure, and intensity. Do not directly state that the person is experiencing a cognitive distortion.\n"
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using device: cuda


In [6]:


def generate_response(text: str, count: int) -> str:
    """Generate a completion for ``text`` with the globally loaded model.

    The request is a two-message chat (the module-level ``system_message``
    followed by ``text`` as the user turn) rendered with the tokenizer's chat
    template. Sampling is enabled, so repeated calls may return different text.

    Args:
        text: User message to send to the model.
        count: Maximum number of new tokens to generate.

    Returns:
        The decoded completion only (the prompt tokens are sliced off), with
        special tokens removed.
    """
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": text},
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        return_dict=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    outputs = model.generate(
        **inputs,
        max_new_tokens=count,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # The generated sequence starts with the prompt; keep only what follows it.
    prompt_length = inputs["input_ids"].shape[1]
    completion_ids = outputs[0][prompt_length:]

    # skip_special_tokens drops end-of-turn / padding markers that would
    # otherwise end up inside the saved scenario text.
    return tokenizer.decode(completion_ids, skip_special_tokens=True)
    # response = ollama.chat(
    #     model="mistral-nemo",
    #     messages=[{
    #         'role': 'system',
    #         'content': (system_message)
    #     }, {
    #         'role': 'user',
    #         'content': (text)
    #     }],
    #     options={'temperature': 0.7, 'top_p':0.9, 'repetition_penalty':1.1, 'do_sample':True, 'max_new_tokens': count}
    # )

    # response_text = response['message']['content'].strip()

    # return response_text




In [7]:
# Generate a random "seed"
def generate_random_seed():
    """Sample one random profile: sex, age, occupation, relationship status, emotion.

    Returns:
        dict with the keys "Sex", "Age", "Occupation", "Relationship Status"
        and "Negative Emotion", each drawn independently from its
        module-level option list.
    """
    # (key, pool) pairs in the order the values are drawn.
    field_pools = (
        ("Sex", sex_options),
        ("Age", ages),
        ("Occupation", occupations),
        ("Relationship Status", relationship_statuses),
        ("Negative Emotion", negative_emotions),
    )
    return {field: random.choice(pool) for field, pool in field_pools}



In [8]:
def generate_random_structure():
    """Sample one random writing style: tone, structure, intensity, language.

    Returns:
        dict with the keys "Tone", "Structure", "Intensity" and "Language",
        each drawn independently from its module-level option list.
    """
    # (key, pool) pairs in the order the values are drawn.
    style_pools = (
        ("Tone", tones),
        ("Structure", structures),
        ("Intensity", intensities),
        ("Language", use_of_language),
    )
    return {aspect: random.choice(pool) for aspect, pool in style_pools}


In [7]:
# Generate a writing prompt based on the demographic "seed"
def generate_prompt(seed):
    """Ask the model for a triggering-event story matching ``seed``.

    The writing style (tone, structure, intensity, language use) is drawn at
    random inside this function from the module-level style lists.

    Args:
        seed: dict with the keys "Sex", "Age", "Occupation",
            "Relationship Status" and "Negative Emotion".

    Returns:
        The model's response (up to 800 new tokens) with surrounding
        whitespace stripped.
    """
    # Persona section: one bullet per seed attribute.
    profile = "".join(
        (
            "Generate a triggering event scenario for a person with the following characteristics:\n",
            "- Sex: {}\n".format(seed["Sex"]),
            "- Age: {}\n".format(seed["Age"]),
            "- Occupation: {}\n".format(seed["Occupation"]),
            "- Relationship Status: {}\n".format(seed["Relationship Status"]),
            "- Experiencing the emotion: {}\n\n".format(seed["Negative Emotion"]),
        )
    )

    # Style draws happen after the persona is built, in the order they
    # appear in the instruction text.
    tone = random.choice(tones)
    structure = random.choice(structures)
    intensity = random.choice(intensities)
    language = random.choice(use_of_language)

    style = (
        "The scenario should be written in a {} tone, with a {} structure, ".format(tone, structure)
        + "expressing a {} level of intensity, and {}. ".format(intensity, language)
        + "The response should be a self-contained short story reflecting the triggering event. You should output the event only without any commentary."
    )

    raw = generate_response(profile + style, 800)
    return raw.strip()



In [8]:
# Generate a writing prompt based on the demographic "seed"
def generate_prompt(seed, struct):
    """Ask the model for a concise triggering-event scenario in a given style.

    Note: this notebook defines ``generate_prompt`` in more than one cell;
    whichever definition is executed last is the one later cells call.

    Args:
        seed: dict with the keys "Sex", "Age", "Occupation",
            "Relationship Status" and "Negative Emotion".
        struct: dict with the keys "Tone", "Structure", "Intensity" and
            "Language" describing the requested writing style.

    Returns:
        The model's response (up to 400 new tokens) with surrounding
        whitespace stripped.
    """
    # Adjacent string literals are concatenated with no separator, so each
    # sentence carries its own trailing space.
    user_message = (
        f"Generate a concise triggering event scenario for a person with the following characteristics:\n"
        f"- Sex: {seed['Sex']}\n"
        f"- Age: {seed['Age']}\n"
        f"- Occupation: {seed['Occupation']}\n"
        f"- Relationship Status: {seed['Relationship Status']}\n"
        f"- Experiencing the emotion: {seed['Negative Emotion']}\n\n"
        f"Write a self-contained short realistic scenario that clearly describes a realistic triggering event. Do not use full names. "
        # The "Language" values are already imperative phrases (e.g.
        # "use informal language"), so they are appended as-is.
        f"Use a {struct['Tone']} tone, a {struct['Structure']} structure, and a {struct['Intensity']} level of intensity, and {struct['Language']}. "
        f"Avoid overly theatrical narration or dramatic embellishments—focus solely on the event itself without additional commentary."
    )

    return generate_response(user_message, 400).strip()



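**Note:** the cell below defines the version of `generate_prompt` that is actually used for data generation; it overrides the earlier definitions above.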

In [9]:
# Generate a writing prompt based on the demographic "seed"
def generate_prompt(seed, struct=None):
    """Ask the model for a concise triggering-event scenario for ``seed``.

    Note: this notebook defines ``generate_prompt`` in more than one cell;
    whichever definition is executed last is the one later cells call.

    Args:
        seed: dict with the keys "Sex", "Age", "Occupation",
            "Relationship Status" and "Negative Emotion".
        struct: Accepted for call compatibility but not used by this
            version of the prompt. Optional, so both ``generate_prompt(seed)``
            and ``generate_prompt(seed, struct)`` work.

    Returns:
        The model's response (up to 400 new tokens) with surrounding
        whitespace stripped.
    """
    # Adjacent string literals are concatenated with no separator, so each
    # sentence carries its own trailing space.
    user_message = (
        f"Generate a concise triggering event scenario for a person with the following characteristics:\n"
        f"- Sex: {seed['Sex']}\n"
        f"- Age: {seed['Age']}\n"
        f"- Occupation: {seed['Occupation']}\n"
        f"- Relationship Status: {seed['Relationship Status']}\n"
        f"- Experiencing the emotion: {seed['Negative Emotion']}\n\n"
        f"Write a self-contained short realistic scenario that clearly describes a realistic triggering event. Do not use full names. "
        f"Avoid theatrical narration or dramatic embellishments—focus solely on the event itself without additional commentary. Just provide the scenario."
    )

    return generate_response(user_message, 400).strip()



In [None]:
# Generate data: sample a random seed per scenario, ask the model for a
# scenario, and write each batch to its own JSON file.
output_dir = "/home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_llama_new/new_800"
os.makedirs(output_dir, exist_ok=True)

num_samples = 8000  # Total samples to generate
batch_size = 10  # Number of scenarios attempted per saved file

generated_data = []
for batch_index in trange(num_samples // batch_size, desc="Generating Data"):
    batch_data = []
    for _ in range(batch_size):
        seed = generate_random_seed()
        struct = generate_random_structure()
        try:
            generated_prompt = generate_prompt(seed, struct)
            batch_data.append({
                "Sex": seed["Sex"],
                "Age": seed["Age"],
                "Occupation": seed["Occupation"],
                "Relationship Status": seed["Relationship Status"],
                "Negative Emotion": seed["Negative Emotion"],
                "Generated Scenario": generated_prompt,
                # Style fields are intentionally not recorded:
                # "Tone" : struct['Tone'],
                # "Structure" : struct['Structure'],
                # "Intensity" : struct['Intensity'],
                # "Language" : struct['Language']
            })
        except Exception as e:
            # Best effort: a failed sample is reported and dropped, so a
            # batch may end up with fewer than batch_size entries.
            print(f"Error generating scenario: {e}")

    if not batch_data:
        # Every sample in this batch failed; don't leave an empty JSON file behind.
        print("No scenarios generated in this batch; nothing saved.")
        continue

    generated_data.extend(batch_data)

    # Save periodically to avoid data loss. The uuid suffix keeps file names
    # unique even when two batches finish within the same second.
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    file_name = f"{output_dir}/{timestamp}_{uuid4().hex}.json"
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(batch_data, f, indent=4)
    print(f"Saved {len(batch_data)} samples to {file_name}")

Generating Data:   0%|          | 0/800 [00:00<?, ?it/s]

Saved 10 samples to /home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_llama_new/new_800/2025-05-06_17-35-42_36c4f24167c94f9bb975b47c90a00d37.json
Saved 10 samples to /home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_llama_new/new_800/2025-05-06_17-36-23_0b88f3a0e99b495f98d84cd5b893737d.json
Saved 10 samples to /home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_llama_new/new_800/2025-05-06_17-37-07_16e32d90c5f54cfa9fc47cd62050f442.json
Saved 10 samples to /home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_llama_new/new_800/2025-05-06_17-37-54_04f0d8118d074100873856012594324c.json
Saved 10 samples to /home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_llama_new/new_800/2025-05-06_17-38-42_0f84fe7177f7478491e8ef3c2dae9de7.json
Saved 10 samples to /home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_llama_new/new_800/2025-05-06_17-39-35_ad6c896aada04fc1aa446b0d67caa6c7.json
Saved 10 samples to /home/g4/Mindwell/DataGenerationCBTApproach/trigge

: 

In [None]:
# Generate data: sample a random seed per scenario, ask the model for a
# scenario, and write each batch to its own JSON file.
# NOTE(review): the directory name suggests these samples were produced with
# the mistral-nemo backend that is commented out in generate_response; as the
# notebook stands, generate_response uses the locally loaded model. Confirm
# which backend is intended before re-running this cell.
output_dir = "/home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_nemo"
os.makedirs(output_dir, exist_ok=True)

num_samples = 2000  # Total samples to generate
batch_size = 10  # Number of scenarios attempted per saved file

generated_data = []
for batch_index in trange(num_samples // batch_size, desc="Generating Data"):
    batch_data = []
    for _ in range(batch_size):
        seed = generate_random_seed()
        # The generate_prompt definition in effect takes (seed, struct);
        # calling it with only `seed` raises a TypeError that the except
        # below would swallow, leaving every batch empty.
        struct = generate_random_structure()
        try:
            generated_prompt = generate_prompt(seed, struct)
            batch_data.append({
                "Sex": seed["Sex"],
                "Age": seed["Age"],
                "Occupation": seed["Occupation"],
                "Relationship Status": seed["Relationship Status"],
                "Negative Emotion": seed["Negative Emotion"],
                "Generated Scenario": generated_prompt
            })
        except Exception as e:
            # Best effort: a failed sample is reported and dropped, so a
            # batch may end up with fewer than batch_size entries.
            print(f"Error generating scenario: {e}")

    if not batch_data:
        # Every sample in this batch failed; don't leave an empty JSON file behind.
        print("No scenarios generated in this batch; nothing saved.")
        continue

    generated_data.extend(batch_data)

    # Save periodically to avoid data loss. The uuid suffix keeps file names
    # unique even when two batches finish within the same second.
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    file_name = f"{output_dir}/{timestamp}_{uuid4().hex}.json"
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(batch_data, f, indent=4)
    print(f"Saved {len(batch_data)} samples to {file_name}")

Generating Data:   0%|          | 0/200 [00:00<?, ?it/s]

Saved 10 samples to /home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_nemo/2025-03-20_14-17-33_9a827b1a24204e3db6e969e1b3e25e7a.json
Saved 10 samples to /home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_nemo/2025-03-20_14-18-16_fdbefd9d86e74dd38c4e260292817e49.json
Saved 10 samples to /home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_nemo/2025-03-20_14-19-16_a7f976bb9bfa4030a5a6a3e34a2a15cf.json
Saved 10 samples to /home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_nemo/2025-03-20_14-20-04_a933f0b713a3444b9cf53965fab73c04.json
Saved 10 samples to /home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_nemo/2025-03-20_14-20-50_a7d9ceccaeb44516a5ae1726e0eb897b.json
Saved 10 samples to /home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_nemo/2025-03-20_14-21-33_7aa99acee1af40cebe9ec60c01685217.json
Saved 10 samples to /home/g4/Mindwell/DataGenerationCBTApproach/trigger_events_nemo/2025-03-20_14-22-12_75f47da33a2047a3a8b3b33213467854.json
Saved 

: 