In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import (
    LoraConfig,
    get_peft_model,
)
import os, torch
from datasets import load_dataset
from trl import SFTTrainer
import json
import bitsandbytes as bnb
from sklearn.metrics import accuracy_score
import numpy as np
from tqdm.autonotebook import tqdm as notebook_tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

In [3]:
base_model = "/home/g4/Llama-3.2-3B-Instruct"
new_model="/home/g4/Llama-3.2-3B-Instruct-Finetuned-combined"
def load_llama_model():
    """Load LLaMA model and tokenizer"""
    model_name = base_model
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        quantization_config=bnb_config,
        device_map="auto",
        attn_implementation=attn_implementation
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    if model.config.pad_token_id is None:
        model.config.pad_token_id = model.config.eos_token_id

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    # model = model.to(device)


    return model, tokenizer, device

model, tokenizer, device = load_llama_model()

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.32s/it]


Using device: cuda


In [4]:
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [5]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model = get_peft_model(model, peft_config)

In [6]:
distortions = {
    "fortune_telling": "Fortune Telling/Catastrophizing",
    "what_if_statements": "What If Statements",
    "labeling": "Labeling/Global Labeling",
    "unfair_comparisons": "Unfair Comparisons",
    "mind_reading_conclusion": "Mind Reading",
    "should_statements_conclusion": "Should Statements",
    "overgeneralization": "Overgeneralization",
    "all_or_nothing_thinking": "All or Nothing Thinking",
    "blaming_conclusion": "Blaming",
    "emotional_reasoning": "Emotional Reasoning",
    "mental_filter": "Mental Filter",
    "discounting_the_positive": "Discounting the Positive",
    "magnification_minimization": "Magnification/Minimization",
    "personalization_conclusion": "Personalization",
    "jumping_to_conclusions": "Jumping to Conclusions",
    "no_distortion": "No Distortion"
}

name_to_node = {
    "Fortune Telling": "fortune_telling",
    "What if?": "what_if_statements",
    "Labeling/Global Labeling": "labeling",
    "Unfair Comparisons": "unfair_comparisons",
    "Mind Reading": "mind_reading_conclusion",
    "Should Statements": "should_statements_conclusion",
    "Overgeneralization": "overgeneralization",
    "All or Nothing Thinking": "all_or_nothing_thinking",
    "Blaming": "blaming_conclusion",
    "Emotional Reasoning": "emotional_reasoning",
    "Mental Filter": "mental_filter",
    "Discounting the Positive": "discounting_the_positive",
    "Magnification/Minimization": "magnification_minimization",
    "Personalization": "personalization_conclusion",
    "Jumping to Conclusions": "jumping_to_conclusions",
    "No Distortion": "no_distortion"
}

# Define the decision tree structure
decision_tree = {
    "root": {
        "question": "Is the thought pattern about future events or predictions?",
        "yes": "future",
        "no": "evidence_check"
    },
    "evidence_check": {
        "question": "Is the thought pattern about making conclusions or assumptions?",
        "yes": "evidence",
        "no": "evaluation_check"
    },
    "evaluation_check": {
        "question": "Is the thought pattern about evaluating or judging self/others/situations?",
        "yes": "evaluation",
        "no": "attention_check"
    },
    "attention_check": {
        "question": "Is the thought pattern about focusing attention on specific aspects?",
        "yes": "attention",
        "no": "responsibility_check"
    },
    "responsibility_check": {
        "question": "Is the thought pattern about assigning responsibility or causation?",
        "yes": "responsibility",
        "no": "no_distortion"
    },
    "future": {
        "question": "Are the predictions catastrophic or unbearable?",
        "yes": "fortune_telling",
        "no": "what_if"
    },
    "what_if": {
        "question": "Are there repeated 'what if' questions?",
        "yes": "what_if_statements",
        "no": "no_distortion"
    },
    "evidence": {
        "question": "Are conclusions drawn without sufficient evidence?",
        "yes": "jumping_to_conclusions",
        "no": "evidence_type"
    },
    "evidence_type": {
        "question": "Are emotions used as the primary evidence?",
        "yes": "emotional_reasoning",
        "no": "assumed_thoughts"
    },
    "assumed_thoughts": {
        "question": "Are others' thoughts/intentions assumed without evidence?",
        "yes": "mind_reading",
        "no": "no_distortion"
    },
    "evaluation": {
        "question": "Does the evaluation involve extreme categories?",
        "yes": "extremes",
        "no": "standards_check"
    },
    "extremes": {
        "question": "Does it use words like 'always', 'never', 'every time'?",
        "yes": "overgeneralization",
        "no": "two_options"
    },
    "two_options": {
        "question": "Is everything sorted into only two categories?",
        "yes": "all_or_nothing",
        "no": "no_distortion"
    },
    "standards_check": {
        "question": "Is it about how things 'should' be?",
        "yes": "should_statements",
        "no": "comparison_check"
    },
    "comparison_check": {
        "question": "Is it about comparing to others?",
        "yes": "unfair_comparisons",
        "no": "labels_check"
    },
    "labels_check": {
        "question": "Is it about applying fixed, global labels?",
        "yes": "labeling",
        "no": "no_distortion"
    },
    "attention": {
        "question": "Is there selective focus on specific aspects?",
        "yes": "mental_filter",
        "no": "positive_negative_check"
    },
    "positive_negative_check": {
        "question": "Is it about weighing positives versus negatives?",
        "yes": "weighing_check",
        "no": "no_distortion"
    },
    "weighing_check": {
        "question": "Is it exclusively focusing on negatives?",
        "yes": "magnification_minimization",
        "no": "positive_dismiss"
    },
    "positive_dismiss": {
        "question": "Are positive experiences being dismissed?",
        "yes": "discounting_the_positive",
        "no": "no_distortion"
    },
    "responsibility": {
        "question": "Is external behavior seen as personally directed?",
        "yes": "personalization",
        "no": "blame_check"
    },
    "blame_check": {
        "question": "Is there complete attribution of responsibility to self or others?",
        "yes": "blaming",
        "no": "no_distortion"
    }
}

distortion_definitions = """All or Nothing Thinking/Polarized Thinking: I view a situation, a person or an event in “either-or” terms, fitting them into only two extreme categories instead of on a continuum.
Fortune telling (also called catastrophizing): I predict the future in negative terms and believe that what will happen will be so awful that I will not be able to stand it.
Emotional reasoning:  I believe my emotions reflect reality and let them guide my attitudes and judgments.
Labeling/Global Labeling: I put a fixed, global label, usually negative, on myself or others.
Mental Filter(): I pay attention to one or a few details and fail to see the whole picture.
Mind reading: I believe that I know the thoughts or intentions of others (or that they know my thoughts or intentions) without having sufficient evidence
Overgeneralization: I take isolated negative cases and generalize them, transforming them in a never-ending pattern, by repeatedly using words such as “always”, “never”, “ever”, “whole”, “entire”, etc
Personalization: I assume that others’ behaviors and external events concern (or are directed to) myself without considering other plausible explanations.
Should statements (also “musts”, “oughts”, “have tos”): I tell myself that events, people’s behaviors, and my own attitudes “should” be the way I expected them to be and not as they really are.
Blaming (others or oneself): I direct my attention to others as sources of my negative feelings and experiences, failing to consider my own responsibility; or, conversely, I take responsibility for others’ behaviors and attitudes.
What if?: I keep asking myself questions such as “what if something happens?”
Discounting the positive: I disqualify positive experiences or events insisting that they do not count.
Magnification/minimization: I evaluate myself, others, and situations placing greater importance on the negatives and/or placing much less importance on the positives.
Jumping to conclusions (also called arbitrary inference): I draw conclusions (negative or positive) from little or no confirmatory evidence.
Unfair comparisons: I compare myself with others who seem to do better than I do and place myself in a disadvantageous position.
"""
system_message = """You are a helpful assistant that can only respond with "Yes" or "No". Follow these rules:
1. You must answer with exactly "Yes" or "No" (case-sensitive)
2. Do not add any other words or punctuation
3. If you are not completely certain, answer "No"
4. These are the only two valid responses: "Yes" or "No\""""

def is_leaf_node(node_id: str) -> bool:
    """Check if a node is a leaf node (final distortion)."""
    return isinstance(decision_tree[node_id], str)

def get_distortion_description(distortion: str) -> str:
    """Get the description of a specific distortion."""
    return distortions.get(distortion, "Unknown distortion")

In [7]:
def make_llama(text: str) -> str:
    return [{"role": "system", "content": system_message,}, {"role": "user", "content": text},]
def make_example(text: str, answer: str):
    return make_llama(text) + [{"role": "assistant", "content": answer}]
def make_question(text: str, question: str):
    return f'Given the following text:\n\n{text}\n\n Answer the question: {question}\n\n'

In [8]:
dataset = load_dataset('json', data_files='/home/g4/Mindwell/data/combined_decision_tree_training_data.json', split='train')
dataset = dataset.shuffle(seed=65)
def format_chat_template(entry):
    
    row_json = [{"role": "system", "content": system_message },
               {"role": "user", "content": make_question(entry['text'], entry['question'])},
               {"role": "assistant", "content": entry["answer"]}]
    
    entry["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return entry

dataset = dataset.map(
    format_chat_template,
    num_proc=4
)
dataset = dataset.train_test_split(test_size=0.1)

In [None]:
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=2,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to=[],  # Disables wandb
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
)

In [10]:
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False
)

model.config.use_cache = False


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Map: 100%|██████████| 124425/124425 [01:05<00:00, 1892.86 examples/s]
Map: 100%|██████████| 13825/13825 [00:07<00:00, 1869.74 examples/s]


In [11]:
trainer.train()

  0%|          | 0/31106 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 0 has a total capacity of 23.66 GiB of which 69.12 MiB is free. Including non-PyTorch memory, this process has 22.89 GiB memory in use. Of the allocated memory 21.96 GiB is allocated by PyTorch, and 641.31 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Save the model locally only
trainer.save_model(new_model)
tokenizer.save_pretrained(new_model)

('/home/g4/Llama-3.2-3B-Instruct-Finetuned/tokenizer_config.json',
 '/home/g4/Llama-3.2-3B-Instruct-Finetuned/special_tokens_map.json',
 '/home/g4/Llama-3.2-3B-Instruct-Finetuned/tokenizer.json')