In [None]:
import json
import random
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm.notebook import trange, tqdm
from datetime import datetime

In [None]:
def load_llama_model(model_name: str = "/home/g4/Llama-3.2-3B-Instruct"):
    """Load a causal language model and its tokenizer, and move the model to a device.

    Args:
        model_name: Path or identifier handed to both ``from_pretrained`` calls.
            Defaults to the local Llama 3.2 3B Instruct checkpoint so existing
            zero-argument calls behave as before.

    Returns:
        A ``(model, tokenizer, device)`` tuple. ``device`` is CUDA when
        ``torch.cuda.is_available()`` is true, otherwise CPU; the returned
        model has already been moved to it.
    """
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Fall back to the EOS token for padding when the checkpoint defines none.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    if model.config.pad_token_id is None:
        # Reuse the tokenizer's (single) pad id so model and tokenizer agree.
        # Copying model.config.eos_token_id instead can store a list of ids,
        # because some checkpoints declare several EOS tokens in their config.
        model.config.pad_token_id = tokenizer.pad_token_id

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    model = model.to(device)

    return model, tokenizer, device

# Load once per kernel; generate_response reads `model` and `tokenizer` as notebook globals.
model, tokenizer, device = load_llama_model()

In [None]:
# Maps each cognitive-distortion name to a first-person definition.
# generate_prompt looks the definition up by name, and the generation loop
# stores the name as the "cognitive_distortion" label of every saved record.
cognitive_distortions = {
    "All or Nothing Thinking": "I view a situation, a person or an event in “either-or” terms, fitting them into only two extreme categories instead of on a continuum.",
    "Fortune Telling": "I predict the future in negative terms and believe that what will happen will be so awful that I will not be able to stand it.",
    "Emotional Reasoning": "I believe my emotions reflect reality and let them guide my attitudes and judgments.",
    "Labeling/Global Labeling": "I put a fixed, usually negative, global label on myself or others.",
    "Mental Filter": "I pay attention to one or a few details and fail to see the whole picture.",
    "Mind Reading": "I believe that I know the thoughts or intentions of others (or that they know my thoughts or intentions) without sufficient evidence.",
    "Overgeneralization": "I take isolated negative cases and generalize them, using words like “always,” “never,” “whole,” “entire,” etc.",
    "Personalization": "I assume that others’ behaviors and external events concern myself without considering other plausible explanations.",
    "Should Statements": "I tell myself that events, people’s behaviors, and my own attitudes “should” be the way I expected, not as they are.",
    "Blaming": "I direct blame to others for my negative feelings or take responsibility for others' behaviors and attitudes.",
    "What if?": "I keep asking questions like “what if something happens?” focusing on negative outcomes.",
    "Discounting the Positive": "I disqualify positive experiences or events, insisting that they don’t count.",
    "Magnification/Minimization": "I emphasize the negatives or downplay positives in myself, others, or situations.",
    "Jumping to Conclusions": "I draw conclusions from little or no confirmatory evidence.",
    "Unfair Comparisons": "I compare myself with others who seem better and place myself at a disadvantage."
}

# Style pools for varied output: generate_prompt draws one entry from each list
# at random for every prompt it builds. Note that "blunt" and "sarcastic" appear
# in both `tones` and `structures`, so a prompt can request the same trait twice.
tones = ["angry", "calm", "frustrated", "sarcastic", "venting", "blunt", "sad", "anxious", "direct"]
structures = ["structured", "scattered", "nuanced", "blunt", "sarcastic"]
intensities = ["mild", "intense", "slightly annoyed", "highly emotional", "casual"]
use_of_language = ["use cuss words", "avoid cuss words", "use formal language", "use casual language"]

# Sent as the system turn of every chat request built by generate_response.
# The literal begins and ends with a newline because of the triple-quote layout.
system_message = """
You are an expert assistant specializing in identifying and generating text examples that reflect common cognitive distortions.
Your task is to generate writing prompts and then create text from the perspective of people experiencing specific cognitive distortions without being aware of it.
Each time you are asked to generate text, remember that the person should not be aware of their cognitive distortion.
Make sure the situation, writing style, and personality of the person are different for each response.
The final text should demonstrate the cognitive distortion in a subtle, human, realistic way.
"""

In [None]:
def generate_response(
    text: str,
    count: int,
    *,
    temperature: float = 0.1,
    top_p: float = 0.9,
    repetition_penalty: float = 1.1,
) -> str:
    """Run one system+user chat turn through the global model and return the reply text.

    Relies on the notebook globals ``model``, ``tokenizer`` and ``system_message``.

    Args:
        text: Content of the user message.
        count: Upper bound on newly generated tokens (``max_new_tokens``).
        temperature: Sampling temperature. The default 0.1 is the value this
            notebook has always used; pass a higher value for more varied text.
        top_p: Nucleus-sampling cutoff forwarded to ``model.generate``.
        repetition_penalty: Repetition penalty forwarded to ``model.generate``.

    Returns:
        The decoded continuation only (prompt tokens are sliced off). Special
        tokens are NOT stripped by the decode call, so the string may carry an
        end-of-turn marker that the caller has to remove.
    """
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": text},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        return_dict=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    # Move every input tensor to wherever the model lives.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    # Generated sequences start with the prompt; remember its length to cut it off.
    prompt_length = len(inputs["input_ids"][0])

    outputs = model.generate(
        **inputs,
        max_new_tokens=count,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        use_cache=True,
    )
    return tokenizer.decode(outputs[0][prompt_length:])

In [None]:
def generate_prompt(cognitive_distortion):
    """Ask the model for a writing prompt targeting one cognitive distortion.

    The distortion's definition is read from ``cognitive_distortions`` (an
    unknown name raises ``KeyError``). One tone, structure, intensity and
    language style are then drawn at random and embedded in the request,
    which is sent through ``generate_response`` with a 1000-token budget.
    """
    meaning = cognitive_distortions[cognitive_distortion]

    # Draw the four style traits in a fixed order: tone, structure, intensity, language.
    tone, structure, intensity, language = (
        random.choice(pool)
        for pool in (tones, structures, intensities, use_of_language)
    )

    request = (
        f"Generate a writing prompt for the cognitive distortion '{cognitive_distortion}'. "
        f"The definition of this cognitive distortion is: '{meaning}'. "
        f"Write as if the person experiencing this distortion has a {tone} tone, "
        f"with a {structure} structure, expressing a {intensity} level of intensity, "
        f"and {language}. "
        "The prompt should ask Llama 3.2 to write from the perspective of someone experiencing this distortion, "
        "and the person should not be aware of it. "
        "DO NOT PROVIDE AN EXAMPLE! ONLY RESPOND WITH THE PROMPT, DO NOT ADD ANY EXTRA CONTEXT!"
    )
    return generate_response(request, 1000)


def generate_text_from_prompt(prompt, cognitive_distortion):
    """Turn a writing prompt into one example text for the given distortion.

    Wraps ``prompt`` and the distortion name in a fixed instruction and sends
    it through ``generate_response`` with a 1000-token budget, returning
    whatever that call returns.
    """
    request = (
        "Using the following writing prompt, generate one example from the perspective of "
        f"someone experiencing the cognitive distortion '{cognitive_distortion}'. "
        "Make sure they are not aware of the distortion, "
        "and the text should demonstrate their behavior subtly. "
        f"Prompt: {prompt} "
        "RESPOND WITH ONLY THE EXAMPLE, NO EXTRA CONTEXT!"
    )
    return generate_response(request, 1000)

In [None]:
# tqdm layout shared by the three nested progress bars. Named `bar_format`
# rather than `format` so the builtin format() is not shadowed in later cells.
bar_format = "{desc:<30}{percentage:3.0f}%|{bar}{r_bar}"

# 1000 outer iterations x 5 samples each = up to 5000 samples per distortion
# in total. Every outer iteration collects its records in `data` and writes
# them to its own timestamped JSON file, so an interrupted run only loses the
# iteration in progress.
for i in trange(1000, leave=False, desc='Iteration', ascii=True, bar_format=bar_format):
    data = []
    for distortion in tqdm(cognitive_distortions, desc="Distortions", leave=False, bar_format=bar_format):
        for _ in trange(5, desc=distortion, leave=False, bar_format=bar_format):  # 5 samples for each distortion
            try:
                # Two-step generation: first a writing prompt, then the text
                # written from that prompt. The decoded output keeps special
                # tokens, so the trailing end-of-turn marker is stripped here.
                generated_prompt = generate_prompt(distortion).removesuffix('<|eot_id|>')
                generated_text = generate_text_from_prompt(generated_prompt, distortion).removesuffix('<|eot_id|>')

                data.append({
                    "cognitive_distortion": distortion,  # label the cognitive distortion
                    "generated_prompt": generated_prompt,
                    "generated_text": generated_text
                })

            except Exception as e:
                # Best effort: report the failed sample and carry on with the next one.
                print(f"Error generating text for {distortion}: {e}")

    # Build the timestamp outside the f-string: reusing the same quote character
    # inside an f-string replacement field is a SyntaxError before Python 3.12.
    timestamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    with open(f'/home/g4/Mindwell/data/{timestamp}.json', 'w') as f:
        json.dump(data, f, indent=4)