## Installation

In [1]:
#%%capture
%pip install unsloth
# Also get the latest nightly Unsloth!
%pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

Note: you may need to restart the kernel to use updated packages.
Found existing installation: unsloth 2024.11.10
Uninstalling unsloth-2024.11.10:
  Successfully uninstalled unsloth-2024.11.10
Collecting git+https://github.com/unslothai/unsloth.git
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-req-build-al7jp1im
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-req-build-al7jp1im
  Resolved https://github.com/unslothai/unsloth.git to commit 8558bc92b06f9128499484ef737fa71b966ffc23
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25ldone
[?25h  Created wheel for unsloth: filename=unsloth-2024.11.10-py3-none-any.whl size=166794 sha256=288d71c76e32eace6692acc6714bb77b1a6c48341617680b34131f45e4f6eea

In [2]:
%pip install tf_keras

Note: you may need to restart the kernel to use updated packages.


## Setup

In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 4bit for 405b!
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!

    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm
2024-11-28 16:27:07.997542: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-28 16:27:08.005836: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732829228.016142   47804 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732829228.019060   47804 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-28 16:27:08.029893: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorF

🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.11.10: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA GeForce RTX 4080. Max memory: 15.992 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [2]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.11.10 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


## Dataset Prep

In [3]:
from datasets import Dataset
import pandas as pd

# Define paths to your CSV files
csv_path1 = "./Dataset/good_papers_with_topics.csv"  # Replace with the path to the first CSV
csv_path2 = "./Dataset/bad_papers_with_topics.csv"  # Replace with the path to the second CSV

# Load both CSV files into pandas DataFrames
df1 = pd.read_csv(csv_path1)
df2 = pd.read_csv(csv_path2)

# Concatenate the DataFrames
combined_df = pd.concat([df1, df2], ignore_index=True)

# Convert the combined DataFrame to a Hugging Face dataset
dataset = Dataset.from_pandas(combined_df)

In [4]:
#Transform .csv data into conversational template
def to_conversational_format(batch):
    # Format the input for the User (Title, Topic, Abstract)
    user_prompts = [
        f"For the given Topic: {identifier}\nAsnwer if the following academic paper is good or bad\nTitle: {title}\nAbstract: {abstract}"
        for title, identifier, abstract in zip(batch["Title"], batch["Identifier"], batch["Abstract"])
    ]
    
    # Format the Assistant response with the Type (good/bad)
    assistant_responses = batch["Type"]
    
    # Build conversation in HuggingFace generic format
    conversations = [
        [
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": response}
        ]
        for user_prompt, response in zip(user_prompts, assistant_responses)
    ]
    
    return {"conversations": conversations}

# Apply the transformation
dataset = dataset.map(to_conversational_format, batched=True,)

Map: 100%|██████████| 9995/9995 [00:00<00:00, 90860.57 examples/s]


In [5]:
#Transform conversational template into llama template
from unsloth.chat_templates import get_chat_template

# Initialize the tokenizer (replace this with your specific tokenizer setup)
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.1",
)

# Format the prompts using the chat template
def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
    return {"text": texts}

# Format the dataset with the template
dataset = dataset.map(formatting_prompts_func, batched=True)

# Split the dataset into 75% train and 25% test
splits = dataset.train_test_split(test_size=0.25, seed=42)

# Access the train and test sets
train_dataset_split = splits["train"]
test_dataset_split = splits["test"]

# Print the size of each split to verify
print(f"Train dataset size: {len(train_dataset_split)}")
print(f"Test dataset size: {len(test_dataset_split)}")

# Test if dataset looks good
print(train_dataset_split[2]["text"])

Map: 100%|██████████| 9995/9995 [00:00<00:00, 27584.23 examples/s]

Train dataset size: 7496
Test dataset size: 2499
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

For the given Topic: Acute cardiac injury in patients suffering from COVID-19 infection
Asnwer if the following academic paper is good or bad
Title: SARS CoV-2 Organotropism Associated Pathogenic Relationship of Gut-Brain Axis and Illness.
Abstract: COVID-19 has resulted in a pandemic after its first appearance in a pneumonia patient in China in early December 2019. As per WHO, this global outbreak of novel COVID-19 has resulted in 28,329,790 laboratory-confirmed cases and 911,877 deaths which have been reported from 210 countries as on 12<|eot_id|><|start_header_id|>assistant<|end_header_id|>

bad<|eot_id|>





## Model Training

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset_split,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Map (num_proc=2): 100%|██████████| 7496/7496 [00:04<00:00, 1733.24 examples/s]
max_steps is given, it will override any value given in num_train_epochs


In [7]:
#Using Unsloth's `train_on_completions` method to only train on the assistant outputs and ignore the loss on the user's inputs
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map: 100%|██████████| 7496/7496 [00:01<00:00, 7272.39 examples/s]


In [8]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4080. Max memory = 15.992 GB.
2.725 GB of memory reserved.


In [9]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 7,496 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 48,627,712


Step,Training Loss
1,7.5187
2,7.9028
3,7.7863
4,7.0809
5,4.0325
6,1.7843
7,1.0011
8,0.3075
9,0.3829
10,0.2265


In [10]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

97.6339 seconds used for training.
1.63 minutes used for training.
Peak reserved memory = 3.848 GB.
Peak reserved memory for training = 1.123 GB.
Peak reserved memory % of max memory = 24.062 %.
Peak reserved memory for training % of max memory = 7.022 %.


## Test Inference

### Manual Test

In [43]:
print("Size of test dataset: ", len(test_dataset_split))
i = 21
#print(test_dataset_split[i]["conversations"])
conversation = test_dataset_split[i]["conversations"]

# Extract the "user" message
user_message = next(
    (message["content"] for message in conversation if message["role"] == "user"), 
    None
)

# Extract the "assistant" response
assistant_response = next(
    (message["content"] for message in conversation if message["role"] == "assistant"), 
    None
)

print("User Message:", user_message)
print("Expected Assistant Response:", assistant_response)

Size of test dataset:  2499
User Message: For the given Topic: The divergent protective effects of angiotensin-converting enzyme inhibitors and angiotensin receptor blockers on clinical outcomes of coronavirus disease 2019 (COVID-19)
Asnwer if the following academic paper is good or bad
Title: Renin-Angiotensin-Aldosterone System Inhibitors and Risk of Covid-19.
Abstract: BACKGROUND: There is concern about the potential of an increased risk related to medications that act on the renin-angiotensin-aldosterone system in patients exposed to coronavirus disease 2019 (Covid-19), because the viral receptor is angiotensin-converting enzyme 2 (ACE2). METHODS: We assessed the relation between previous treatment with ACE inhibitors, angiotensin-receptor blockers, beta-blockers, calcium-channel blockers, or thiazide diuretics and the likelihood of a positive or negative result on Covid-19 testing as well as the likelihood of severe illness (defined as intensive care, mechanical ventilation, or de

In [44]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": user_message},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                         temperature = 1.5, min_p = 0.1)
tokenizer.batch_decode(outputs)

['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nFor the given Topic: The divergent protective effects of angiotensin-converting enzyme inhibitors and angiotensin receptor blockers on clinical outcomes of coronavirus disease 2019 (COVID-19)\nAsnwer if the following academic paper is good or bad\nTitle: Renin-Angiotensin-Aldosterone System Inhibitors and Risk of Covid-19.\nAbstract: BACKGROUND: There is concern about the potential of an increased risk related to medications that act on the renin-angiotensin-aldosterone system in patients exposed to coronavirus disease 2019 (Covid-19), because the viral receptor is angiotensin-converting enzyme 2 (ACE2). METHODS: We assessed the relation between previous treatment with ACE inhibitors, angiotensin-receptor blockers, beta-blockers, calcium-channel blockers, or thiazide diuretics and the likelihood of a po

### Test Full Dataset

In [69]:
from unsloth.chat_templates import get_chat_template
print("Size of test dataset: ", len(test_dataset_split))
# Size is 2499
correct_ans = 0
tries = 250
for index in range(tries):
  #print(test_dataset_split[i]["conversations"])
  conversation = test_dataset_split[index]["conversations"]

  # Extract the user message
  user_message = next(
      (message["content"] for message in conversation if message["role"] == "user"), 
      None
  )

  # Extract expected assistant response
  assistant_response = next(
      (message["content"] for message in conversation if message["role"] == "assistant"), 
      None
  )

#   print("User Message:", user_message)
#   print("Expected Assistant Response:", assistant_response)

  tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
  )
  FastLanguageModel.for_inference(model) # Enable native 2x faster inference

  messages = [
      {"role": "user", "content": user_message},
  ]
  inputs = tokenizer.apply_chat_template(
      messages,
      tokenize = True,
      add_generation_prompt = True, # Must add for generation
      return_tensors = "pt",
  ).to("cuda")

  outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                          temperature = 1.5, min_p = 0.1)
  model_response = str(tokenizer.batch_decode(outputs))
  ans = model_response.split("<|start_header_id|>assistant<|end_header_id|>")[1].split("<|eot_id|>")[0].strip()
  

  if assistant_response in ans:
    correct_ans += 1
  
print("Number of correct answers: ", correct_ans)
print("Number of wrong answers: ", tries-correct_ans)
print("# of tests: ", tries)

Size of test dataset:  2499
Number of correct answers:  160
Number of wrong answers:  90
# of tests:  250


## Save Model

In [70]:
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')