# Tải thư viện

In [10]:
%%capture
!pip install unsloth
# !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git""
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
!pip install transformers dataset

# Setting up LLaMA 3.2 model

In [2]:
# Load the base model and its tokenizer
from unsloth import FastLanguageModel
import torch

# Context length in tokens (16384). The original note states that Unsloth
# applies RoPE scaling internally, so this value can be chosen freely.
max_seq_length = 8 * 2048
# 4-bit quantization lowers memory usage; may be set to False.
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    # Instruct variant; "unsloth/Llama-3.2-1B" is the base-version alternative.
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.11.9: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

In [3]:
# Wrap the loaded model with LoRA adapters.
# NOTE: this rebinds `model`, so the cell is not idempotent; reload the base
# model before running it a second time.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank: any value > 0, suggested 8, 16, 32, 64 or 128
    target_modules=[
        # attention projections
        "q_proj", "k_proj", "v_proj", "o_proj",
        # MLP projections
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,   # any value is supported, 0 is the optimized setting
    bias="none",      # any value is supported, "none" is the optimized setting
    # True or "unsloth"; per the original note "unsloth" uses about 30% less VRAM
    # and is meant for very long context.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,   # rank-stabilized LoRA is available but disabled
    loftq_config=None,  # LoftQ is available but disabled
)

Unsloth: Offloading input_embeddings to disk to save VRAM


  offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)


Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2024.11.9 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM


# Load data

In [4]:
from datasets import load_dataset

# Load the ChatDoctor HealthCareMagic dataset by its repository id.
alpaca_dataset = load_dataset(path="lavita/ChatDoctor-HealthCareMagic-100k")


README.md:   0%|          | 0.00/542 [00:00<?, ?B/s]

(…)-00000-of-00001-5e7cb295b9cff0bf.parquet:   0%|          | 0.00/70.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/112165 [00:00<?, ? examples/s]

In [5]:
# Prompt format
# Alpaca-style template written in Vietnamese. The three `{}` slots are filled
# positionally: task/instruction ("Nhiệm vụ"), input ("Đầu vào") and
# answer ("Câu trả lời").
alpaca_prompt = """Dưới đây là hướng dẫn mô tả một nhiệm vụ. Viết một phản hồi hoàn thành yêu cầu một cách thích hợp.

### Nhiệm vụ:
{}

### Đầu vào:
{}

### Câu trả lời:
{}"""

# End-of-sequence token of the loaded tokenizer; the formatting function
# appends it to every training text.
EOS_TOKEN = tokenizer.eos_token # end with eos
def formatting_prompts_func(examples):
    """Render a batch of examples into full training prompts.

    Args:
        examples: Batch mapping with the parallel sequences "instruction",
            "input" and "output".

    Returns:
        A dict with one key, "text", holding one formatted prompt per example.
        Each prompt ends with EOS_TOKEN so that generation learns where to stop.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # Without the trailing EOS_TOKEN the model would keep generating forever.
    return {
        "text": [
            alpaca_prompt.format(task, context, answer) + EOS_TOKEN
            for task, context, answer in rows
        ]
    }

# Run the formatter over the dataset in batches; it contributes the "text"
# column that the trainer reads. NOTE: rebinds `alpaca_dataset`.
alpaca_dataset = alpaca_dataset.map(formatting_prompts_func, batched = True)



Map:   0%|          | 0/112165 [00:00<?, ? examples/s]

In [6]:
# Sanity check: print one formatted training example.
print(alpaca_dataset["train"][11000])

{'instruction': "If you are a doctor, please answer the medical questions based on the patient's description.", 'input': 'Hello Doctor, i am a 24 year old Male. i had met with an accident couple of months ago, and had a severe fracture, my right hand radius and alna bone had broken. after the surgery my doctor told me to take GEMCAL NASAL SPRAY, my question is does it have any side effect with my fracture.??', 'output': 'Hi, It does not have any side effects involving the fractures. Gemcal nasal spray (Calcitonin) is commonly prescribed to treat osteoporosis in postmenopausal women and also in elderly men to make the bones stronger and lower the risk of fractures. Its common side effects include nasal congestion, rhinitis, diarrhea, and flushing. It is not known to have any harmful effects on the fractured bones. Hope I have answered your query. Let me know if I can assist you further.', 'text': "Dưới đây là hướng dẫn mô tả một nhiệm vụ. Viết một phản hồi hoàn thành yêu cầu một cách th

# Continual training

In [7]:
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments


# Put the model into training mode before building the trainer.
FastLanguageModel.for_training(model)

trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = alpaca_dataset['train'],
    dataset_text_field = "text",   # column produced by formatting_prompts_func
    max_seq_length = max_seq_length,
    dataset_num_proc = 8,

    args = UnslothTrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,   # total batch size = 2 * 8 = 16

        # Short, step-bounded run. `max_steps` overrides `num_train_epochs` and
        # `warmup_steps` overrides `warmup_ratio`, so only the effective pair is
        # set here. For a longer run, replace both with
        # `num_train_epochs = 1, warmup_ratio = 0.1`.
        max_steps = 120,
        warmup_steps = 10,

        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        learning_rate = 5e-5,
        embedding_learning_rate = 1e-5,

        # Use bf16 where the GPU supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        # report_to = "none", # Uncomment to disable W&B and other reporters
    ),
)
# SECURITY: never store the W&B API key in the notebook. Supply it through the
# WANDB_API_KEY environment variable or paste it at the interactive prompt.
# The key that used to be written here must be revoked in the W&B settings.

Map (num_proc=8):   0%|          | 0/112165 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [8]:
# Start training; whatever `train()` returns is kept in `trainer_stats`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 112,165 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 120
 "-____-"     Number of trainable parameters = 812,318,720


Unsloth: Setting lr = 1.00e-05 instead of 5.00e-05 for embed_tokens.
Unsloth: Setting lr = 1.00e-05 instead of 5.00e-05 for lm_head.


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


CompilationError: at 60:16:

    accum = tl.zeros((BLOCK_B, BLOCK_V), dtype=tl.float32)
    for d in range(0, tl.cdiv(D, BLOCK_D)):
        # Load the next block of A and B, generate a mask by checking the K dimension.
        # If it is out of bounds, set it to 0.
        if EVEN_D:
            e = tl.load(e_ptrs)
            c = tl.load(c_ptrs)
        else:
            e = tl.load(e_ptrs, mask=offs_d[None, :] < D - d * BLOCK_D, other=0.0)
            c = tl.load(c_ptrs, mask=offs_d[:, None] < D - d * BLOCK_D, other=0.0)
        accum = tl.dot(e, c, accum, input_precision=DOT_PRECISION)
                ^

test

In [None]:
# Quick inference check on a single hand-written question
from transformers import TextStreamer

# Switch the model to inference mode.
FastLanguageModel.for_inference(model)

inputs = tokenizer(
    [
        alpaca_prompt.format(
            # instruction
            "If you are a doctor, please answer the medical questions based on the patient's description.",
            # input (leave blank if there is none)
            "I woke up feel nauseous.",
            # output (must stay blank so the model generates it)
            "",
        )
    ],
    return_tensors="pt",
).to("cuda")

# Stream tokens to the cell output while they are generated.
text_streamer = TextStreamer(tokenizer)
outputs = model.generate(**inputs, streamer=text_streamer, max_new_tokens=200)

In [None]:
# Save the fine-tuned model
# Model and tokenizer are written to the same folder so they can be reloaded
# together from one path.
# NOTE(review): the path is under /content/drive; presumably Google Drive must
# be mounted first, and no mount step is visible in this notebook. Confirm.
model.save_pretrained('/content/drive/MyDrive/Colab Notebooks/fine_tuned_llama3b')
tokenizer.save_pretrained('/content/drive/MyDrive/Colab Notebooks/fine_tuned_llama3b')


# Connect to Telegram

In [None]:
%%capture
# Dependencies of the Telegram bot cell below.
# python-telegram-bot: provides the `telegram` / `telegram.ext` modules.
!pip install -U python-telegram-bot
# nest_asyncio: imported and applied by the bot cell before polling starts.
!pip install nest_asyncio
# bitsandbytes: upgraded to the latest release (presumably for the quantized model load, confirm).
!pip install -U bitsandbytes

In [None]:
from telegram import Update
from telegram.ext import Application, CommandHandler, MessageHandler, filters, ContextTypes
from transformers import AutoTokenizer, TextStreamer
from unsloth import FastLanguageModel
import torch
import nest_asyncio

# Work around the event-loop error: allow the bot's event loop to run inside
# the notebook's own loop.
nest_asyncio.apply()

# Device that the tokenized inputs are moved to
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Path to your fine-tuned model and tokenizer
model_name = '/content/drive/MyDrive/Colab Notebooks/fine_tuned_llama3b'

# Load the model and tokenizer
# NOTE(review): the recorded run of this cell failed with "Some modules are
# dispatched on the CPU or the disk". Presumably the training session still
# held the GPU memory; restart the runtime before running this cell. Confirm.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model, _ = FastLanguageModel.from_pretrained(model_name)
# Deliberately no `model.to(device)`: the loader already places the quantized
# weights (the training cell never moves the model either), and `.to()` is
# rejected for bitsandbytes-quantized models by some transformers/bitsandbytes
# version combinations.

FastLanguageModel.for_inference(model)

# Alpaca prompt template (must match the one used for training)
alpaca_prompt = """Dưới đây là hướng dẫn mô tả một nhiệm vụ. Viết một phản hồi hoàn thành yêu cầu một cách thích hợp.

### Nhiệm vụ:
{}

### Đầu vào:
{}

### Câu trả lời:
{}"""

# Build the bot's reply from the model
async def generate_response(user_input):
    """Return the bot's reply to one user message.

    Greetings, farewells and thanks (case-insensitive exact matches) get a
    canned reply without touching the model. Any other text is inserted into
    the Alpaca prompt and answered by the model.

    Only the newly generated tokens are decoded. Searching the decoded
    prompt + completion for the "Câu trả lời:" marker (the previous approach)
    returned part of the prompt whenever the user's own text contained that
    marker.

    Args:
        user_input: Text of the user's message.

    Returns:
        The reply text. A generic error message is returned when generation
        fails or produces no text, so the caller never sends an empty message.
    """
    error_reply = "Đã có lỗi xảy ra, vui lòng thử lại sau."
    try:
        lowered = user_input.lower()
        if lowered in ('bye', 'goodbye'):
            return 'See you later!'
        if lowered in ('hello', 'hi'):
            return 'Hi there! How can I assist you today?'
        if lowered in ('thanks', 'thank you'):
            return "You're welcome! If you have more questions, feel free to ask."

        inputs = tokenizer(
            [
                alpaca_prompt.format(
                    "If you are a doctor, please answer the medical questions based on the patient's description.",
                    user_input,
                    ""
                )
            ],
            return_tensors="pt"
        ).to(device)

        # The streamer echoes generated tokens to stdout (useful while debugging).
        text_streamer = TextStreamer(tokenizer)
        outputs = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128*2)

        # The output sequence starts with the prompt tokens; skip them so only
        # the model's answer is decoded.
        prompt_length = inputs["input_ids"].shape[1]
        answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

        # An empty reply cannot be sent as a chat message; fall back to the error text.
        return answer if answer else error_reply

    except Exception as e:
        print(f"Lỗi xảy ra: {e}")  # Print the error to make debugging easier
        return error_reply


# Handler for the /start command
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Greet the user who sent /start.

    Uses `update.effective_message` instead of `update.message`: command
    handlers also receive edited messages, for which `update.message` is None.

    Args:
        update: Incoming Telegram update.
        context: Handler context (unused).
    """
    message = update.effective_message
    if message is None:
        # Nothing to reply to.
        return
    await message.reply_text("Xin chào! Tôi là chatbot hỗ trợ tư vấn y tế. Hãy gửi câu hỏi của bạn.")

# Handler for plain text messages from the user
async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Answer a text message with the model's reply.

    Uses `update.effective_message` instead of `update.message`: message
    handlers also receive edited messages, for which `update.message` is None
    and the previous attribute access raised AttributeError.

    Args:
        update: Incoming Telegram update.
        context: Handler context (unused).
    """
    message = update.effective_message
    if message is None or not message.text:
        # No text to answer.
        return
    response = await generate_response(message.text)
    await message.reply_text(response)

# Entry point that starts the bot
def main():
    """Build the Telegram application, register the handlers and start polling.

    The bot token is read from the TELEGRAM_BOT_TOKEN environment variable so
    that it is never stored in the notebook. In Colab it can be set beforehand
    with `os.environ["TELEGRAM_BOT_TOKEN"] = getpass.getpass()`.
    SECURITY: the token that used to be hard-coded here has been exposed and
    must be revoked via @BotFather.

    Raises:
        RuntimeError: If TELEGRAM_BOT_TOKEN is missing or empty.
    """
    import os

    token = os.environ.get("TELEGRAM_BOT_TOKEN", "").strip()
    if not token:
        raise RuntimeError(
            "TELEGRAM_BOT_TOKEN is not set; export the bot token as an "
            "environment variable before starting the bot."
        )
    app = Application.builder().token(token).build()

    # Handler for the /start command
    app.add_handler(CommandHandler("start", start))

    # Handler for text messages that are not commands
    app.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message))

    # Start polling for updates (blocks until the bot is stopped)
    app.run_polling()

# Start the bot when this cell (or the exported script) is executed directly.
if __name__ == '__main__':
    main()


==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

# Colab & Github

#### Thêm, commit và đẩy code lên GitHub

In [None]:
# Stage all changes in the working directory
!git add .

# Commit the staged changes
!git commit -m "New commit"

# Push the commit to the `main` branch of the `origin` remote
!git push origin main