# Bengali Empathetic Conversations Finetuner

**Install required dependencies**

In [1]:
%%capture
import os
os.environ["UNSLOTH_VLLM_STANDBY"] = "1" # [NEW] Extra 30% context lengths!
!pip install --upgrade -qqq uv
try: import numpy, PIL; get_numpy = f"numpy=={numpy.__version__}"; get_pil = f"pillow=={PIL.__version__}"
except: get_numpy = "numpy"; get_pil = "pillow"
try: import subprocess; is_t4 = "Tesla T4" in str(subprocess.check_output(["nvidia-smi"]))
except: is_t4 = False
get_vllm, get_triton = ("vllm==0.9.2", "triton==3.2.0") if is_t4 else ("vllm==0.10.2", "triton")
!uv pip install -qqq --upgrade     unsloth {get_vllm} {get_numpy} {get_pil} torchvision bitsandbytes xformers
!uv pip install -qqq --upgrade --no-cache-dir --no-deps unsloth_zoo
!uv pip install -qqq {get_triton}
!uv pip install transformers==4.56.2
!uv pip install --no-deps trl==0.22.2

**Load pre-trained LLM and use LoRA adapter**

In [3]:
import unsloth
from unsloth import FastLanguageModel
import torch
max_seq_length = 1024 # Can increase for longer reasoning traces
lora_rank = 32 # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/meta-Llama-3.1-8B-Instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.9, # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
    ], 
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407,
)

ЁЯже Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-11-29 12:08:58.552386: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764418138.787113      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764418138.852325      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


INFO 11-29 12:09:29 [__init__.py:244] Automatically detected platform cuda.
ERROR 11-29 12:09:31 [fa_utils.py:57] Cannot use FA version 2 is not supported due to FA2 is only supported on devices with compute capability >= 8
ЁЯже Unsloth Zoo will now patch everything to make training faster!
INFO 11-29 12:09:45 [vllm_utils.py:702] Unsloth: Patching vLLM v1 graph capture
INFO 11-29 12:09:45 [vllm_utils.py:732] Unsloth: Patching vLLM v0 graph capture
==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.56.2. vLLM: 0.9.2.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/meta-llama-3.1-8b-instruct-unsloth-bnb-4bit with actua

`torch_dtype` is deprecated! Use `dtype` instead!


INFO 11-29 12:10:04 [config.py:1472] Using max model len 1024
INFO 11-29 12:10:06 [config.py:2285] Chunked prefill is enabled with max_num_batched_tokens=2048.
Unsloth: vLLM Bitsandbytes config using kwargs = {'load_in_8bit': False, 'load_in_4bit': True, 'bnb_4bit_compute_dtype': 'float16', 'bnb_4bit_quant_storage': 'uint8', 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'llm_int8_skip_modules': ['lm_head', 'multi_modal_projector', 'merger', 'modality_projection', 'model.layers.1.mlp'], 'llm_int8_threshold': 6.0}
INFO 11-29 12:10:06 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.2) with config: model='unsloth/meta-llama-3.1-8b-instruct-unsloth-bnb-4bit', speculative_config=None, tokenizer='unsloth/meta-llama-3.1-8b-instruct-unsloth-bnb-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dty

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

INFO 11-29 12:10:09 [cuda.py:311] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 11-29 12:10:09 [cuda.py:360] Using XFormers backend.
INFO 11-29 12:10:10 [parallel_state.py:1076] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
INFO 11-29 12:10:10 [model_runner.py:1171] Starting to load model unsloth/meta-llama-3.1-8b-instruct-unsloth-bnb-4bit...


[W1129 12:10:10.556900138 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W1129 12:10:10.557614908 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3


INFO 11-29 12:10:11 [bitsandbytes_loader.py:499] Loading weights with BitsAndBytes quantization. May take a while ...
INFO 11-29 12:10:11 [weight_utils.py:292] Using model weights format ['*.safetensors']


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

INFO 11-29 12:10:33 [weight_utils.py:308] Time spent downloading weights for unsloth/meta-llama-3.1-8b-instruct-unsloth-bnb-4bit: 21.891124 seconds
INFO 11-29 12:10:33 [weight_utils.py:345] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 11-29 12:10:38 [punica_selector.py:19] Using PunicaWrapperGPU.
INFO 11-29 12:10:40 [model_runner.py:1203] Model loading took 5.7659 GiB and 27.962490 seconds
INFO 11-29 12:10:58 [worker.py:294] Memory profiling takes 17.05 seconds
INFO 11-29 12:10:58 [worker.py:294] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.89) = 13.14GiB
INFO 11-29 12:10:58 [worker.py:294] model weights take 5.77GiB; non_torch_memory takes 0.03GiB; PyTorch activation peak memory takes 0.90GiB; the rest of the memory reserved for KV Cache is 6.45GiB.
INFO 11-29 12:10:58 [executor_base.py:113] # cuda blocks: 3302, # CPU blocks: 2048
INFO 11-29 12:10:58 [executor_base.py:118] Maximum concurrency for 1024 tokens per request: 51.59x
INFO 11-29 12:11:01 [vllm_utils.py:737] Unsloth: Running patched vLLM v0 `capture_model`.
INFO 11-29 12:11:01 [model_runner.py:1513] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To ru

Capturing CUDA graph shapes:   0%|          | 0/27 [00:00<?, ?it/s]

INFO 11-29 12:11:28 [model_runner.py:1671] Graph capturing finished in 28 secs, took 0.59 GiB
INFO 11-29 12:11:28 [vllm_utils.py:744] Unsloth: Patched vLLM v0 graph capture finished in 28 secs.
INFO 11-29 12:11:30 [llm_engine.py:428] init engine (profile, create kv cache, warmup model) took 49.90 seconds
Unsloth: Just some info: will skip parsing ['q_norm', 'layer_norm2', 'pre_feedforward_layernorm', 'post_attention_layernorm', 'attention_norm', 'norm2', 'ffn_norm', 'post_layernorm', 'norm', 'layer_norm1', 'norm1', 'input_layernorm', 'k_norm', 'post_feedforward_layernorm']
Performing substitution for additional_keys=set()
Unsloth: Just some info: will skip parsing ['q_norm', 'layer_norm2', 'pre_feedforward_layernorm', 'post_attention_layernorm', 'attention_norm', 'norm2', 'ffn_norm', 'post_layernorm', 'cross_attn_post_attention_layernorm', 'norm', 'cross_attn_input_layernorm', 'layer_norm1', 'norm1', 'input_layernorm', 'k_norm', 'post_feedforward_layernorm']


tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2025.11.4 patched 32 layers with 32 QKV layers, 32 O layers and 0 MLP layers.


In [4]:
print(tokenizer.vocab_size)

128000


**Checking the template of tokenizer**

In [26]:
messages = [
    {'role':'user','content':"say something"},
    {'role':'assistant','content':"I am giving up on you"},
    {'role':'user','content':"sooo sad"},
]

inputs = tokenizer.apply_chat_template(messages, tokenize = False)
print(inputs)



<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

say something<|eot_id|><|start_header_id|>assistant<|end_header_id|>

I am giving up on you<|eot_id|><|start_header_id|>user<|end_header_id|>

sooo sad<|eot_id|>


**Data preprocessing and creating dataset**

In [22]:
from datasets import Dataset
import pandas as pd
import unicodedata
import re

df = pd.read_csv("/kaggle/input/bengali-empathetic-conversations-corpus/BengaliEmpatheticConversationsCorpus .csv")
df = pd.DataFrame(df)

def _normalize_text(text):
        text = unicodedata.normalize("NFC", text)
        text = re.sub(r"[\u0000-\u001F\u007F-\u009F]", "", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text

for column in df.columns:
    if df[column].dtype == 'object':  
        df[column] = df[column].astype(str).apply(_normalize_text)


#df.head()
dataset = Dataset.from_pandas(df)
dataset = dataset.rename_columns({"Question-Title": "Title"}) # '-' will casue error in sharegpt formatting

print(dataset.column_names)
#print(dataset[0])


['Topics', 'Title', 'Questions', 'Answers']


**merging columns so we have only 2 and sharegpt formatting**

In [23]:
from unsloth import to_sharegpt
dataset = to_sharegpt(
    dataset,
    merged_prompt = \
        "[[ржмрж┐рж╖ржпрж╝ рж╣рж▓ {Topics}]]"\
        "[[\nржкрзНрж░рж╢рзНржирзЗрж░ рж╢рж┐рж░рзЛржирж╛ржо рж╣рж▓ {Title}]]"\
        "[[\nржкрзНрж░рж╢рзНржи рж╣рж▓ {Questions}]]",
    conversation_extension = 5, 
    output_column_name = "Answers",
)

Merging columns:   0%|          | 0/38233 [00:00<?, ? examples/s]

Converting to ShareGPT:   0%|          | 0/38233 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/38233 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/38233 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/38233 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/38233 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/38233 [00:00<?, ? examples/s]

Extending conversations:   0%|          | 0/38233 [00:00<?, ? examples/s]

In [24]:
from pprint import pprint
pprint(dataset[0])

{'conversations': [{'from': 'human',
                    'value': 'ржмрж┐рж╖ржпрж╝ рж╣рж▓ ржкрж╛рж░рж┐ржмрж╛рж░рж┐ржХ ржжрзНржмржирзНржжрзНржм\n'
                             'ржкрзНрж░рж╢рзНржирзЗрж░ рж╢рж┐рж░рзЛржирж╛ржо рж╣рж▓ ржорж╛ ржУ рж╕рзНрждрзНрж░рзАрж░ ржоржзрзНржпрзЗ ржорждрж╛ржирзИржХрзНржп '
                             'ржмрзГржжрзНржзрж┐\n'
                             'ржкрзНрж░рж╢рзНржи рж╣рж▓ ржЖржорж╛рж░ рж╕рзНрждрзНрж░рзА ржПржмржВ ржорж╛ржпрж╝рзЗрж░ ржоржзрзНржпрзЗ ржЯрж╛ржиржЯрж╛ржи '
                             'ржорждржмрж┐рж░рзЛржз ржЪрж▓ржЫрзЗред ржЕрждрзАрждрзЗ, рждрж╛ржжрзЗрж░ ржоржзрзНржпрзЗ ржЫрзЛржЯржЦрж╛ржЯрзЛ '
                             'ржкрж╛рж░рзНржержХрзНржп ржЫрж┐рж▓ред ржЙржжрж╛рж╣рж░ржгрж╕рзНржмрж░рзВржк, ржЖржорж╛рж░ рж╕рзНрждрзНрж░рзА ржЖржорж╛рж░ '
                             'ржХрж╛ржЫрзЗ ржЕржнрж┐ржпрзЛржЧ ржХрж░ржмрзЗ ржпрзЗ ржЖржорж╛рж░ ржорж╛ ржЦрзБржм '
                             'ржХрж░рзНрждрзГрждрзНржмржкрзНрж░ржпрж╝рж╛рж╕рзА; 

**using standarize to keep the tags(user, assistant) correct **

In [25]:
from unsloth import standardize_sharegpt
dataset = standardize_sharegpt(dataset)

Unsloth: Standardizing formats (num_proc=4):   0%|          | 0/38233 [00:00<?, ? examples/s]

**Specifying chat templete (llama-3 in this case) and use custom system prompt for better fine-tuning**

In [28]:
SYSTEM_PROMPT = "ржЖржкржирж┐ ржПржХржЬржи рж╕рж╣рж╛ржирзБржнрзВрждрж┐рж╢рзАрж▓ ржмрж╛ржВрж▓рж╛ ржХрж╛ржЙржирзНрж╕рзЗрж▓рж░ред ржЖржкржирж┐ ржЦрзБржм ржзрзАрж░рзЗ, ржиржорзНрж░ржнрж╛ржмрзЗ ржПржмржВ рж╕ржорзНржорж╛ржиржЬржиржХ ржнржЩрзНржЧрж┐рждрзЗ ржЙрждрзНрждрж░ ржжрзЗржмрзЗржиред ржмрзНржпржХрзНрждрж┐рж░ ржЕржирзБржнрзВрждрж┐ржХрзЗ рж╕рзНржмрзАржХрж╛рж░ ржХрж░ржмрзЗржи, ржЖрж╢рзНржмрж╛рж╕ ржжрзЗржмрзЗржи ржПржмржВ ржкрзНрж░рзЯрзЛржЬржи рж╣рж▓рзЗ ржкрзЗрж╢рж╛ржжрж╛рж░ рж╕рж╛рж╣рж╛ржпрзНржп ржирзЗржУрзЯрж╛рж░ ржкрж░рж╛ржорж░рзНрж╢ ржжрзЗржмрзЗржи, ржХрж┐ржирзНрждрзБ ржХрзЛржи ржЪрж┐ржХрж┐рзОрж╕рж╛ ржмрж╛ ржЖржЗржирж┐ ржкрж░рж╛ржорж░рзНрж╢ ржжрзЗржмрзЗржи ржирж╛ред"

chat_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

SYSTEM_PROMPT<|eot_id|><|start_header_id|>user<|end_header_id|>

{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{OUTPUT}<|eot_id|>"""

from unsloth import apply_chat_template
dataset = apply_chat_template(
    dataset,
    tokenizer = tokenizer,
    chat_template = chat_template,
)

print(dataset.column_names)
print(dataset[0])

Map:   0%|          | 0/38233 [00:00<?, ? examples/s]

['conversations', 'text']
{'conversations': [{'content': 'ржмрж┐рж╖ржпрж╝ рж╣рж▓ ржкрж╛рж░рж┐ржмрж╛рж░рж┐ржХ ржжрзНржмржирзНржжрзНржм\nржкрзНрж░рж╢рзНржирзЗрж░ рж╢рж┐рж░рзЛржирж╛ржо рж╣рж▓ ржорж╛ ржУ рж╕рзНрждрзНрж░рзАрж░ ржоржзрзНржпрзЗ ржорждрж╛ржирзИржХрзНржп ржмрзГржжрзНржзрж┐\nржкрзНрж░рж╢рзНржи рж╣рж▓ ржЖржорж╛рж░ рж╕рзНрждрзНрж░рзА ржПржмржВ ржорж╛ржпрж╝рзЗрж░ ржоржзрзНржпрзЗ ржЯрж╛ржиржЯрж╛ржи ржорждржмрж┐рж░рзЛржз ржЪрж▓ржЫрзЗред ржЕрждрзАрждрзЗ, рждрж╛ржжрзЗрж░ ржоржзрзНржпрзЗ ржЫрзЛржЯржЦрж╛ржЯрзЛ ржкрж╛рж░рзНржержХрзНржп ржЫрж┐рж▓ред ржЙржжрж╛рж╣рж░ржгрж╕рзНржмрж░рзВржк, ржЖржорж╛рж░ рж╕рзНрждрзНрж░рзА ржЖржорж╛рж░ ржХрж╛ржЫрзЗ ржЕржнрж┐ржпрзЛржЧ ржХрж░ржмрзЗ ржпрзЗ ржЖржорж╛рж░ ржорж╛ ржЦрзБржм ржХрж░рзНрждрзГрждрзНржмржкрзНрж░ржпрж╝рж╛рж╕рзА; ржЖржорж╛рж░ ржорж╛ ржЕржнрж┐ржпрзЛржЧ ржХрж░ржмрзЗржи ржЖржорж╛рж░ рж╕рзНрждрзНрж░рзА ржЕрж▓рж╕ред рждржмрзЗ ржЗржжрж╛ржирзАржВ рждрж╛ рждрзАржмрзНрж░рждрж░ рж╣ржпрж╝рзЗржЫрзЗ ред ржЖржорж┐ ржоржирзЗ ржХрж░рж┐, ржПрж░ ржХрж╛рж░ржг р

**Train the model**

In [32]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=1024,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        warmup_ratio=0.03,
        max_steps = 60,
        #num_train_epochs=1,
        learning_rate=2e-4,
        fp16 = not is_bfloat16_supported(), # Use 16-bit floating point if bfloat16 isn't supported.
        bf16 = is_bfloat16_supported(),     #use bf16 if hardware support
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
)

# Train
trainer_stats = trainer.train()

Unsloth: Tokenizing ["text"] (num_proc=8):   0%|          | 0/38233 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 38,233 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 8 x 1) = 32
 "-____-"     Trainable parameters = 27,262,976 of 8,057,524,224 (0.34% trained)


Step,Training Loss
10,0.7614
20,0.598
30,0.577
40,0.5553
50,0.5531
60,0.5422
