In [9]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [1]:
import unsloth
import torch
from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel
from datasets import Dataset
from unsloth import is_bfloat16_supported

# Saving model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Warnings
import warnings
warnings.filterwarnings("ignore")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.6.0+cu124 with CUDA 1204 (you have 2.6.0+cu118)
    Python  3.10.11 (you have 3.10.0)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
max_seq_length = 1024
dtype = None
load_in_4bit = True
model_name = "unsloth/Llama-3.2-3B-bnb-4bit"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit
)


model = FastLanguageModel.get_peft_model(
    model,
    r=8,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", 
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=8,
    lora_dropout=0,
    bias = "none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None
)
print(model.print_trainable_parameters())

==((====))==  Unsloth 2025.3.18: Fast Llama patching. Transformers: 4.50.0.
   \\   /|    NVIDIA GeForce GTX 1660 Ti. Num GPUs = 1. Max memory: 6.0 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.6.0+cu118. CUDA: 7.5. CUDA Toolkit: 11.8. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.3.18 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


trainable params: 12,156,928 || all params: 3,224,906,752 || trainable%: 0.3770
None


In [3]:
import pandas as pd
import json
from datasets import Dataset, DatasetDict

# Step 1: Load the CSV file
data = pd.read_csv("data/train-eng.csv")
data['Context_length'] = data['post'].apply(len)
filtered_data = data[data['Context_length'] <= 1500]




In [None]:
data_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token
def formatting_prompt(examples):
    instruction = "You are an AI assistant designed to extract claims from a given passage of text. Keep it short and return the claim in the text. Only return the big idea and exclude unneeded details."
    inputs = examples["post"]
    outputs = examples["normalized claim"]
    texts = []
    for input_, output in zip(inputs, outputs):
        text = data_prompt.format(instruction, input_, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

In [5]:
from ftfy import fix_text

filtered_data['post'] = filtered_data['post'].apply(fix_text)
filtered_data['normalized claim'] = filtered_data['normalized claim'].apply(fix_text)


training_data = Dataset.from_pandas(filtered_data)
training_data = training_data.map(formatting_prompt, batched=True)

Map:   0%|          | 0/10546 [00:00<?, ? examples/s]

In [6]:
# Print a few examples after formatting to verify
for i in range(3):
    print(f"Example {i}:\n{training_data[i]['text']}\n")

Example 0:
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an AI assistant designed to extract claims from a given passage of text. Keep it short and return the claim in the text. Only return the big idea and exclude unneeded details.

### Input:
Lieutenant Retired General Asif Mumtaz appointed as Chairman Pakistan Medical Commission PMC Lieutenant Retired General Asif Mumtaz appointed as Chairman Pakistan Medical Commission PMC Lieutenant Retired General Asif Mumtaz appointed as Chairman Pakistan Medical Commission PMC None

### Response:
Pakistani government appoints former army general to head medical regulatory body<|end_of_text|>

Example 1:
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an AI assistant designed 

In [7]:
dataset_dict = DatasetDict({
    'train': training_data.shuffle(seed=42).select(range(int(len(training_data) * 0.9))),
    'validation': training_data.shuffle(seed=42).select(range(int(len(training_data) * 0.9), len(training_data)))
})

In [8]:
from transformers import EarlyStoppingCallback

trainer=SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=training_data,
    #eval_dataset=dataset_dict['validation'],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=1,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        #per_device_eval_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=100,
        num_train_epochs=1,
        learning_rate=2e-4,        
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="output",

        #evaluation_strategy="steps",
        #eval_steps=10,  # Evaluate every 10 steps
        #save_strategy="steps",
        #save_steps=10,  # Save every 10 steps
        #load_best_model_at_end=True,  # Load the best model when training ends
        #metric_for_best_model="eval_loss",  # Use loss as the metric for selecting best model
        #greater_is_better=False,
    ),
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()


Map:   0%|          | 0/10546 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,546 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 12,156,928/3,000,000,000 (0.41% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,0.3374
2,0.2751
3,0.2735
4,0.2046
5,0.1067
6,0.0951
7,0.0827
8,0.0874
9,0.0538
10,0.046


TrainOutput(global_step=100, training_loss=0.02193390419241041, metrics={'train_runtime': 8210.8187, 'train_samples_per_second': 0.097, 'train_steps_per_second': 0.012, 'total_flos': 2963990793891840.0, 'train_loss': 0.02193390419241041, 'epoch': 0.07585814526834819})

In [None]:
instruction = "You are an AI assistant designed to extract claims from a given passage of text. Keep it short and return the claim in the text. Only return the big idea and exclude unneeded details."
text="Money is the root of all evil. One time my friend abandoned me just to make a quick buck."

In [None]:
from unsloth import FastLanguageModel

model = FastLanguageModel.for_inference(model)

inference_prompt = """<|begin_of_text|>
### Instruction:
{}

### Input:
{}

### Response:
"""

inference_prompt = inference_prompt.format(instruction, text)

inputs = tokenizer(
[
    inference_prompt
], return_tensors = "pt").to("cuda")

outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        use_cache=True,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id else tokenizer.eos_token_id
    )
answer = tokenizer.batch_decode(outputs)[0]
answer = answer.split("### Response:")[-1].strip()
print("Answer of the question is:", answer)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.6.0+cu124 with CUDA 1204 (you have 2.6.0+cu118)
    Python  3.10.11 (you have 3.10.0)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!


NameError: name 'model' is not defined

In [12]:
model.save_pretrained("model-good/3B_finetuned_llama3.2")
tokenizer.save_pretrained("model-good/3B_finetuned_llama3.2")

('model-good/3B_finetuned_llama3.2\\tokenizer_config.json',
 'model-good/3B_finetuned_llama3.2\\special_tokens_map.json',
 'model-good/3B_finetuned_llama3.2\\tokenizer.json')

## Load model

In [None]:
from unsloth import FastLanguageModel

max_seq_length = 1024
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="model-good/3B_finetuned_llama3.2",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit
)

model = FastLanguageModel.for_inference(model)

  GPU_BUFFERS = tuple([torch.empty(2*256*2048, dtype = dtype, device = f"cuda:{i}") for i in range(n_gpus)])


==((====))==  Unsloth 2025.3.18: Fast Llama patching. Transformers: 4.50.0.
   \\   /|    NVIDIA GeForce GTX 1660 Ti. Num GPUs = 1. Max memory: 6.0 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.6.0+cu118. CUDA: 7.5. CUDA Toolkit: 11.8. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.3.18 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [5]:
instruction = "You are an AI assistant designed to extract claims from a given passage of text. Keep it short and return the claim in the text. Only return the big idea and exclude unneeded details."
text="The Karnofsky Jewish family, who immigrated to the United States from Lithuania, employed a 7-year-old boy and adopted (so to speak) him into their home.  He was originally given homework to get food because he was a starving kid.  He remained under the Jewish families employ, until he was 12  Karnofsky gave him money to buy his first instrument, which was a common instrument in Jewish families.  They really admired his musical talent.Later, when he became a professional … See More The Karnofsky Jewish family, who immigrated to the United States from Lithuania, employed a 7-year-old boy and adopted (so to speak) him into their home.  He was originally given homework to get food because he was a starving kid.  He remained under the Jewish families employ, until he was 12  Karnofsky gave him money to buy his first instrument, which was a common instrument in Jewish families.  They really admired his musical talent.Later, when he became a professional … See More The Karnofsky Jewish family, who immigrated to the United States from Lithuania, employed a 7-year-old boy and adopted (so to speak) him into their home.  He was originally given homework to get food because he was a starving kid.  He remained under the Jewish families employ, until he was 12  Karnofsky gave him money to buy his first instrument, which was a common instrument in Jewish families.  They really admired his musical talent.Later, when he became a professional … See More None"

In [None]:
from transformers import StoppingCriteria, StoppingCriteriaList
import torch

class StopOnTokens(StoppingCriteria):
    def __init__(self, stop_ids):
        self.stop_ids = stop_ids
    
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_id in self.stop_ids:
            if input_ids[0][-1] == stop_id:
                return True
        return False

inference_prompt = """<|begin_of_text|>
### Instruction:
{}

### Input:
{}

### Response:
"""

inference_prompt = inference_prompt.format(instruction, text)

inputs = tokenizer(
[
    inference_prompt
], return_tensors = "pt").to("cuda")

outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        use_cache=True,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id else tokenizer.eos_token_id,
        stopping_criteria=[
            StoppingCriteriaList([
                StopOnTokens(
                    stop_ids=[
                        tokenizer.eos_token_id, 
                        tokenizer.convert_tokens_to_ids("### Input:"),
                        tokenizer.convert_tokens_to_ids("### Instruction:")
                    ]
                )
            ])
        ]
    )
answer = tokenizer.batch_decode(outputs)[0]
answer = answer.split("### Response:")[-1].strip()
print("Answer of the question is:", answer)

Answer of the question is: The Karnofsky Jewish family, who immigrated to the United States from Lithuania, employed a 7-year-old boy and adopted (so to speak) him into their home.  He was originally given homework to get food because he was a starving kid.  He remained under the Jewish families employ, until he was 12  Karnofsky gave him money to buy his first instrument, which was a common instrument in Jewish families.  They really admired his musical talent.Later, when he became a professional … See More The Karnofsky Jewish family, who immigrated to the United States from Lithuania, employed a 7-year
