In [1]:
%%capture
# Install Unsloth from PyPI, then replace it with a build from the GitHub
# repository. %%capture hides the pip output of this cell.
# NOTE(review): neither install is version-pinned, so a later re-run may pull a
# different Unsloth/Transformers than the versions shown in the cell outputs
# below — consider pinning for reproducibility.
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

## **Model**

In [2]:
from unsloth import FastLanguageModel
import torch
# Sequence-length cap; passed to the model loader here and reused by the
# SFTTrainer configuration later in the notebook.
max_seq_length = 3000
# Passed through to from_pretrained unchanged.
# NOTE(review): presumably None lets Unsloth choose the dtype — confirm in the Unsloth docs.
dtype = None
# Forwarded as `load_in_4bit` to the loader (requests 4-bit weights).
load_in_4bit = True

# Load the base checkpoint and its tokenizer. `model_original` is the un-adapted
# base model; `tokenizer` is used globally by the prompt-formatting and
# inference cells.
model_original, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9.post3: Fast Llama patching. Transformers = 4.45.1.
   \\   /|    GPU: NVIDIA GeForce RTX 4070 Ti SUPER. Max memory: 15.688 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu124. CUDA = 8.9. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth




In [3]:
# comments in this cell came from Unsloth tutorial
# Wrap the base model with LoRA adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down). The result, `model`, is the
# object that is trained and later used for inference.
model = FastLanguageModel.get_peft_model(
    model_original,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # fixed seed; the same value is used as the trainer seed
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.9.post3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


# Prepare Datasets for OPs and Pairs

In [4]:
# Alpaca-style prompt template with three positional slots, filled in order via
# str.format: instruction, input, response. Training fills all three; inference
# passes "" for the response so the model continues after "### Response:\n".
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token string; appended to every training text so the model
# learns where a response ends.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

In [5]:
# Task instruction for the OP task: classify an original post as "resistant" or
# "malleable" and briefly explain. Used (concatenated with knowledge_op) for
# both training prompts and test-time prompts, so the text must stay identical
# between the two.
op_instruction = "You're a semantic analyst. Now I will show you a person's opinion statement, who publicly announced his/her argument and encouraged other people to challenge it. Judging from his/her speeching style and lexical features, do you think he/she is resistant or malleable to persuasion? First answer with resistant or malleable, then very briefly explain your analysis."

# Task instruction for the pairs task: decide which of two replies persuaded
# the poster. Used (concatenated with knowledge_pairs) for both training and
# test-time prompts.
pairs_instruction = "This is a conversation from an online discussion community. The first was a poster who posted an opinion, and the next two replies were each trying to convince the poster to revise his opinion. The two responses were similar, but one managed to convince the poster and the other didn't. Now judge which response succeeded in persuading."

In [6]:
import json

from datasets import load_dataset

# GPT-written explanations for the OP training examples, one JSON value per
# line. They are assumed to be aligned 1:1, by row position, with the "train"
# split loaded below — TODO confirm against how the file was generated.
with open("finetune_datasets/op_gpt_explanations.jsonl", "r") as f:
    # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
    explanations = [json.loads(line) for line in f if line.strip()]

# Knowledge hints are appended to the instruction for all training in this notebook.
knowledge_op = "Hints: 1. Use of First-Person Pronouns: The use of first-person singular pronouns (e.g., I, me) is strongly correlated with malleability while the use of first-person plural pronouns (e.g., we, us) is more associated with resistant opinions. 2. Dominance in Language: Higher dominance in the language used by the OP correlates with malleability. 3. Calm Tone: calmer, less emotional language in the original post is associated with malleability. 4. Valence (Emotional Positivity): Higher valence, which reflects more positive emotional tone, indicates malleability. 5.Formatting: Posts that are well-organized, with more paragraphs and formatting such as bolds and bullet lists, are correlated with malleable opinions."

def formatting_prompts_func(dataset, indices=None):
    """Render one batch of OP examples into Alpaca-style training texts.

    Args:
        dataset: Batch dict as supplied by ``Dataset.map(batched=True)``;
            the "input" and "output" columns are read.
        indices: Dataset-level row index of every example in the batch, as
            supplied by ``Dataset.map(with_indices=True)``. When omitted the
            batch is treated as starting at row 0 (the previous behaviour).

    Returns:
        ``{"text": [...]}`` with one fully formatted prompt per example, each
        terminated by ``EOS_TOKEN``.

    Raises:
        IndexError: if an index has no corresponding entry in ``explanations``.
    """
    posts = dataset["input"]
    labels = dataset["output"]
    if indices is None:
        indices = range(len(posts))
    instruction = op_instruction + knowledge_op

    texts = []
    for row_idx, post, label in zip(indices, posts, labels):
        # Look the explanation up by dataset row, not by position inside the
        # batch: map() calls this function once per batch, so a positional zip
        # would restart at explanations[0] for every batch after the first.
        response = label + '\n' + explanations[row_idx]
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        texts.append(alpaca_prompt.format(instruction, post, response) + EOS_TOKEN)
    return { "text" : texts, }

op_dataset = load_dataset("json", data_files={"train": "op_train_alpaca.jsonl",
                                              "test": "op_test_alpaca.jsonl",})

op_dataset_train = op_dataset["train"]
# Fail early with a clear message instead of an IndexError deep inside map().
if len(explanations) < len(op_dataset_train):
    raise ValueError(
        f"Found {len(explanations)} explanations for "
        f"{len(op_dataset_train)} OP training examples."
    )
op_dataset_train = op_dataset_train.map(formatting_prompts_func, batched=True, with_indices=True)

In [7]:
import json

from datasets import load_dataset

# GPT-written explanations for the pairs training examples, one JSON value per
# line. They are assumed to be aligned 1:1, by row position, with the "train"
# split loaded below — TODO confirm against how the file was generated.
# NOTE: this rebinds the global `explanations` (and `formatting_prompts_func`
# below) for the pairs task.
with open("finetune_datasets/pairs_gpt_explanations.jsonl", "r") as f:
    # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
    explanations = [json.loads(line) for line in f if line.strip()]

# Knowledge hints are appended to the instruction for all training in this notebook.
knowledge_pairs = "Hint: 1. Reply Length: Longer replies tend to be more persuasive, as they can convey more information and elaborate on points effectively. 2. Language Dissimilarity with Original Post: Persuasive replies use different content words but match in stopwords. 3. Links and Evidence: Including links as evidence in an argument increases the chances of persuasion. 4. Calmer Tone: Replies that use calmer, less intense language are more likely to persuade, as they come across as more composed. 5. Positive Emotion and Sentiment: Persuasive replies include a mix of positive and negative sentiment."

def formatting_prompts_func(dataset, indices=None):
    """Render one batch of pairs examples into Alpaca-style training texts.

    Args:
        dataset: Batch dict as supplied by ``Dataset.map(batched=True)``;
            the "input" and "output" columns are read.
        indices: Dataset-level row index of every example in the batch, as
            supplied by ``Dataset.map(with_indices=True)``. When omitted the
            batch is treated as starting at row 0 (the previous behaviour).

    Returns:
        ``{"text": [...]}`` with one fully formatted prompt per example, each
        terminated by ``EOS_TOKEN``.

    Raises:
        IndexError: if an index has no corresponding entry in ``explanations``.
    """
    conversations = dataset["input"]
    labels = dataset["output"]
    if indices is None:
        indices = range(len(conversations))
    instruction = pairs_instruction + knowledge_pairs

    texts = []
    for row_idx, conversation, label in zip(indices, conversations, labels):
        # Look the explanation up by dataset row, not by position inside the
        # batch: map() calls this function once per batch, so a positional zip
        # would restart at explanations[0] for every batch after the first.
        response = label + '\n' + explanations[row_idx]
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        texts.append(alpaca_prompt.format(instruction, conversation, response) + EOS_TOKEN)
    return { "text" : texts, }

pairs_dataset = load_dataset("json", data_files={"train": "pairs_train_alpaca.jsonl",
                                                 "test": "pairs_test_alpaca.jsonl",})

pairs_dataset_train = pairs_dataset["train"]
# Fail early with a clear message instead of an IndexError deep inside map().
if len(explanations) < len(pairs_dataset_train):
    raise ValueError(
        f"Found {len(explanations)} explanations for "
        f"{len(pairs_dataset_train)} pairs training examples."
    )
pairs_dataset_train = pairs_dataset_train.map(formatting_prompts_func, batched=True, with_indices=True)

# Training

In [8]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Decide the mixed-precision mode once: bf16 where the GPU supports it,
# otherwise fp16.
use_bf16 = is_bfloat16_supported()

# Optimisation settings. Per-device batch 2 x 4 accumulation steps gives an
# effective batch of 8; max_steps caps the run and takes precedence over any
# epoch count.
training_args = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    # num_train_epochs = 1
    max_steps = 60,
    learning_rate = 2e-4,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
)

# Supervised fine-tuning on the pre-formatted "text" column. The OP training
# set is used by default; swap the train_dataset line to train on pairs data.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = op_dataset_train,
    # train_dataset = pairs_dataset_train, # Uncomment this to finetune with pairs data
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = training_args,
)

Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [9]:
# Run fine-tuning with the configured trainer; the value returned by train()
# is kept in `trainer_stats` for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.2274
2,2.2706
3,2.2786
4,2.2105
5,2.1345
6,2.0615
7,1.8819
8,1.8394
9,1.709
10,1.5664


In [10]:
from tqdm import tqdm
def infer(dataset, model, instruction, max_new_tokens = 64):
    """Generate a prediction for every example and return its first character.

    Each example's "input" field is placed in the Alpaca prompt with an empty
    response slot, the model continues the prompt, and the first character of
    the continuation (lower-cased) is recorded as the predicted label.

    Relies on the notebook-level globals ``tokenizer`` and ``alpaca_prompt``
    and sends the inputs to the "cuda" device.

    Args:
        dataset: Iterable of examples; each must provide ``line["input"]``.
        model: Model exposing ``generate``.
        instruction: Instruction text placed in the prompt's instruction slot.
        max_new_tokens: Generation budget per example (default 64, as before).
            Only the first character of the output is kept, so a smaller value
            speeds inference up.

    Returns:
        List with one string per example: the first generated character in
        lower case, or "" when nothing (or only special tokens) was generated.
    """
    results = []
    for line in tqdm(dataset, desc="Inferring on test set: "):
        prompt = alpaca_prompt.format(instruction, line["input"], "")
        inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")
        outputs = model.generate(**inputs, max_new_tokens = max_new_tokens, use_cache = True)

        # The generated sequence starts with the prompt tokens. Cut them off by
        # length rather than searching the decoded text for "### Response:\n":
        # a text search picks the first occurrence, which may sit inside the
        # example's own input and would yield a wrong character.
        prompt_length = inputs["input_ids"].shape[1]
        completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens = True)

        results.append(completion[:1].lower())
    return results

# Inferences on Test Set

In [11]:
# OP model inferences
op_test_dataset = op_dataset["test"]
# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model) 

# Prompt with the same instruction + hints string the training texts were built with.
op_ft_results = infer(op_test_dataset, model, op_instruction + knowledge_op)

Inferring on test set: 100%|██████████| 200/200 [04:06<00:00,  1.23s/it]


In [11]:
# Pairs model inferences
# NOTE(review): `model` is whatever the training cell last fine-tuned. As
# written, the trainer uses op_dataset_train (the pairs line is commented out),
# so these results only describe a pairs-tuned model if training was re-run
# with the pairs dataset — confirm before reporting.
pairs_test_dataset = pairs_dataset["test"]
# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Prompt with the same instruction + hints string the training texts were built with.
pairs_ft_results = infer(pairs_test_dataset, model, pairs_instruction + knowledge_pairs)

Inferring on test set: 100%|██████████| 200/200 [04:47<00:00,  1.44s/it]


In [12]:
def accuracy(results, targets):
    """Compute accuracy over the predictions that are in a recognised format.

    Predictions and targets are compared on their lower-cased first character.
    A prediction is well-formed when that character is one of 'f', 's', 'm'
    or 'r'. Malformed predictions are reported via ``print`` and left out of
    both the numerator and the denominator.

    Args:
        results: Predicted strings, one per example.
        targets: Gold strings, aligned with ``results``; only the first
            character of each is used.

    Returns:
        correct / scored as a float, where ``scored`` is the number of
        well-formed predictions. Returns 0.0 when there is nothing to score
        (empty input or every prediction malformed).

    Raises:
        ValueError: if ``targets`` has fewer entries than ``results``.
    """
    if len(targets) < len(results):
        raise ValueError(
            f"Got {len(results)} predictions but only {len(targets)} targets."
        )

    valid_labels = {'f', 's', 'm', 'r'}
    preds = [result.lower()[:1] for result in results]
    # Normalise targets the same way as predictions so case or trailing text
    # in the gold strings cannot cause spurious misses.
    golds = [target.lower()[:1] for target in targets]

    # check if all answers are generated in correct format
    n_mismatches = sum(1 for pred in preds if pred not in valid_labels)
    if n_mismatches:
        print(f"{n_mismatches}/{len(preds)} "
              f"of the predictions are not in correct format! "
              f"They will not be included in counting accuracy.")

    n_scored = len(preds) - n_mismatches
    if n_scored == 0:
        # Nothing scorable; avoid dividing by zero.
        return 0.0

    # Malformed predictions are skipped here, so they must NOT be subtracted
    # from the numerator again — doing so would penalise them twice.
    correct_count = sum(
        1 for pred, gold in zip(preds, golds)
        if pred in valid_labels and pred == gold
    )
    return correct_count / n_scored

In [13]:
# Accuracy of OP model
# Gold label = first character of each test example's "output" field, matching
# the single-character predictions returned by infer().
# NOTE: `truths` and `accu_ft` are reused (overwritten) by the pairs accuracy cell.
truths= [line["output"][:1] for line in op_test_dataset]

accu_ft = accuracy(op_ft_results, truths)
print(f"The finetuned model accuracy on test set is {accu_ft:.3f}")

The finetuned model accuracy on test set is 0.565


In [13]:
# Accuracy of Pairs model
# Gold label = first character of each test example's "output" field, matching
# the single-character predictions returned by infer().
# NOTE: this overwrites the `truths` and `accu_ft` values computed for the OP model.
truths= [line["output"][:1] for line in pairs_test_dataset]

accu_ft = accuracy(pairs_ft_results, truths)
print(f"The finetuned model accuracy on test set is {accu_ft:.3f}")

The finetuned model accuracy on test set is 0.530
