In [2]:
%%writefile VLSP_Qwen_LoRA_Medical_Summarization.ipynb
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# VLSP Qwen LoRA Notebook\n",
        "\n",
        "Fine-tuning Qwen với LoRA trên tập public_test.vi.txt (Medical Vietnamese)\n",
        "\n",
        "- **Tác giả:** Hoàng Minh\n",
        "- **Mục tiêu:** Tóm tắt văn bản y khoa tiếng Việt bằng Qwen + LoRA (PEFT)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 0. Cài đặt các thư viện cần thiết (chạy 1 lần)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "vscode": {
          "languageId": "shellscript"
        }
      },
      "outputs": [],
      "source": [
        "# %pip (unlike !pip) always installs into the environment of the running kernel.\n",
        "%pip install --upgrade pip\n",
        "# Use the default PyTorch wheels: the CPU-only index would leave a fresh GPU machine without CUDA support.\n",
        "%pip install torch torchvision torchaudio\n",
        "%pip install transformers accelerate peft datasets sentencepiece sacrebleu python-pptx sentence-transformers nltk"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 1. Import các thư viện"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import os\n",
        "import re\n",
        "import json\n",
        "import random\n",
        "import math\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "from pathlib import Path\n",
        "\n",
        "from datasets import Dataset\n",
        "import nltk\n",
        "from nltk.tokenize import sent_tokenize\n",
        "\n",
        "# Transformers & PEFT\n",
        "from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer\n",
        "from peft import LoraConfig, get_peft_model\n",
        "\n",
        "# Evaluation\n",
        "import sacrebleu\n",
        "from sentence_transformers import SentenceTransformer, util\n",
        "\n",
        "# PPTX\n",
        "from pptx import Presentation\n",
        "from pptx.util import Inches, Pt"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 2. Cấu hình đường dẫn"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# One fixed seed for every random number generator used in this notebook.\n",
        "SEED = 42\n",
        "for seed_fn in (random.seed, np.random.seed):\n",
        "    seed_fn(SEED)\n",
        "\n",
        "# Input corpus and the folder that receives every generated artifact.\n",
        "DATA_PATH = Path('public_test.vi.txt')\n",
        "OUTPUT_DIR = Path('qwen_lora_output')\n",
        "OUTPUT_DIR.mkdir(exist_ok=True)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 3. Đọc dữ liệu"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Read the corpus and normalise Windows line endings.\n",
        "text = DATA_PATH.read_text(encoding='utf-8').replace('\\r\\n', '\\n')\n",
        "\n",
        "# Records are paragraphs separated by one or more blank lines.\n",
        "records = []\n",
        "for chunk in re.split(r\"\\n\\s*\\n\", text):\n",
        "    chunk = chunk.strip()\n",
        "    if chunk:\n",
        "        records.append(chunk)\n",
        "\n",
        "print(f\"Số văn bản gốc: {len(records)}\")\n",
        "records[:3]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 4. Xem thử vài mẫu"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Print the first 800 characters of each of the first five records.\n",
        "for number, rec in enumerate(records[:5], start=1):\n",
        "    print(f\"---- RECORD {number} ----\")\n",
        "    print(rec[:800])\n",
        "    print(\"\\n\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 5. Tạo pseudo-target (dùng câu đầu làm tóm tắt ngắn)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Recent NLTK releases load the sentence splitter from 'punkt_tab', older ones from 'punkt'.\n",
        "# Request both so sent_tokenize works with whichever version pip installed.\n",
        "for resource in ('punkt', 'punkt_tab'):\n",
        "    nltk.download(resource, quiet=True)\n",
        "\n",
        "examples = []\n",
        "for rec in records:\n",
        "    # NLTK ships no Vietnamese model; the English one is used as an acceptable stand-in.\n",
        "    sents = sent_tokenize(rec, language='english')\n",
        "    # Keep only records with at least two sentences; the first one becomes the pseudo-summary.\n",
        "    if len(sents) >= 2:\n",
        "        examples.append({'text': rec, 'summary': sents[0].strip()})\n",
        "\n",
        "print(f\"Số mẫu sau khi tạo pseudo-label: {len(examples)}\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 6. Tạo HuggingFace Dataset + chia train/val"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Wrap the examples in a Dataset and hold out 10% of them for evaluation.\n",
        "ds = Dataset.from_list(examples)\n",
        "train_test = ds.train_test_split(test_size=0.1, seed=SEED)\n",
        "train_ds, eval_ds = train_test['train'], train_test['test']\n",
        "\n",
        "print(f\"Train: {len(train_ds)}, Eval: {len(eval_ds)}\")\n",
        "\n",
        "# Save the first five training examples for a quick manual check.\n",
        "preview_path = OUTPUT_DIR / 'train_preview.json'\n",
        "preview_json = json.dumps(train_ds[:5], ensure_ascii=False, indent=2)\n",
        "preview_path.write_text(preview_json, encoding='utf-8')"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 7. Load Qwen tokenizer & model (thay MODEL_NAME nếu cần)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "MODEL_NAME = os.getenv('QWEN_MODEL', 'Qwen/Qwen2.5-7B-Instruct')  # Replace with a model you have access to\n",
        "\n",
        "try:\n",
        "    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)\n",
        "except Exception as e:\n",
        "    # Best-effort demo fallback. Report the cause instead of hiding it, and point\n",
        "    # MODEL_NAME at the fallback so the model loaded later and the generated\n",
        "    # report both match the tokenizer that is actually in use.\n",
        "    print(f\"Không load được Qwen ({type(e).__name__}: {e}), fallback về gpt2 để test\")\n",
        "    MODEL_NAME = 'gpt2'\n",
        "    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
        "\n",
        "print(f\"Loaded tokenizer: {MODEL_NAME}\")\n",
        "\n",
        "# Some tokenizers (e.g. gpt2) define no padding token; reuse EOS so padding works.\n",
        "if tokenizer.pad_token is None:\n",
        "    tokenizer.pad_token = tokenizer.eos_token"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 8. Preprocessing"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "max_input_length = 1024\n",
        "max_target_length = 128\n",
        "\n",
        "# ChatML prompt wrapped around every document.\n",
        "PROMPT_PREFIX = \"<|im_start|>system\\nBạn là một trợ lý y khoa tiếng Việt.<|im_end|>\\n<|im_start|>user\\nTóm tắt đoạn văn y khoa sau bằng tiếng Việt:\\n\"\n",
        "PROMPT_SUFFIX = \"<|im_end|>\\n<|im_start|>assistant\\n\"\n",
        "IGNORE_INDEX = -100  # label value that the cross-entropy loss skips\n",
        "\n",
        "def preprocess_function(examples):\n",
        "    \"\"\"Turn a batch of {'text', 'summary'} rows into causal-LM training features.\n",
        "\n",
        "    A causal LM needs ``labels`` aligned one-to-one with ``input_ids``, so each\n",
        "    row becomes ``prompt + summary + EOS``, right-padded to a fixed length.\n",
        "    Prompt and padding positions are labelled ``IGNORE_INDEX`` so that only the\n",
        "    summary tokens contribute to the loss.\n",
        "\n",
        "    Args:\n",
        "        examples: batch dict with parallel lists under 'text' and 'summary'.\n",
        "\n",
        "    Returns:\n",
        "        dict with 'input_ids', 'attention_mask' and 'labels'; every entry is a\n",
        "        list of integer lists of the same length.\n",
        "    \"\"\"\n",
        "    # Never exceed the context window advertised by the tokenizer (1024 for the gpt2 fallback).\n",
        "    total_length = min(max_input_length + max_target_length, tokenizer.model_max_length)\n",
        "    prefix_ids = tokenizer(PROMPT_PREFIX, add_special_tokens=False)[\"input_ids\"]\n",
        "    suffix_ids = tokenizer(PROMPT_SUFFIX, add_special_tokens=False)[\"input_ids\"]\n",
        "    # Truncate only the document, so the assistant marker in the suffix always survives.\n",
        "    text_budget = max(total_length - max_target_length - len(prefix_ids) - len(suffix_ids), 0)\n",
        "    eos_ids = [tokenizer.eos_token_id] if tokenizer.eos_token_id is not None else []\n",
        "\n",
        "    text_batch = tokenizer(examples['text'], add_special_tokens=False)[\"input_ids\"]\n",
        "    summary_batch = tokenizer(examples['summary'], add_special_tokens=False)[\"input_ids\"]\n",
        "\n",
        "    features = {\"input_ids\": [], \"attention_mask\": [], \"labels\": []}\n",
        "    for text_ids, summary_ids in zip(text_batch, summary_batch):\n",
        "        prompt_ids = prefix_ids + text_ids[:text_budget] + suffix_ids\n",
        "        # Reserve one slot for EOS so the model learns where the summary stops.\n",
        "        target_ids = summary_ids[:max_target_length - len(eos_ids)] + eos_ids\n",
        "\n",
        "        n_real = len(prompt_ids) + len(target_ids)\n",
        "        n_pad = total_length - n_real\n",
        "        features[\"input_ids\"].append(prompt_ids + target_ids + [tokenizer.pad_token_id] * n_pad)\n",
        "        features[\"attention_mask\"].append([1] * n_real + [0] * n_pad)\n",
        "        features[\"labels\"].append([IGNORE_INDEX] * len(prompt_ids) + target_ids + [IGNORE_INDEX] * n_pad)\n",
        "    return features\n",
        "\n",
        "train_tokenized = train_ds.map(preprocess_function, batched=True, remove_columns=['text','summary'])\n",
        "eval_tokenized = eval_ds.map(preprocess_function, batched=True, remove_columns=['text','summary'])"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 9. Load model + LoRA"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "try:\n",
        "    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True, device_map=\"auto\")\n",
        "except Exception as e:\n",
        "    # Best-effort demo fallback; report the cause instead of hiding it.\n",
        "    print(f\"Không load được model lớn ({type(e).__name__}: {e}), dùng gpt2 để demo\")\n",
        "    model = AutoModelForCausalLM.from_pretrained('gpt2')\n",
        "\n",
        "# Grow the embedding matrix only when the tokenizer has ids the model lacks.\n",
        "# Some checkpoints (e.g. Qwen2.5) ship more embedding rows than the tokenizer has\n",
        "# tokens; resizing unconditionally would shrink the pretrained matrix for no benefit.\n",
        "if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:\n",
        "    model.resize_token_embeddings(len(tokenizer))\n",
        "\n",
        "# GPT-2 fuses its attention projections into one 'c_attn' layer and has no\n",
        "# q_proj/v_proj modules, so the fallback model needs different LoRA targets.\n",
        "is_gpt2 = model.config.model_type == \"gpt2\"\n",
        "lora_target_modules = [\"c_attn\"] if is_gpt2 else [\"q_proj\", \"v_proj\"]\n",
        "\n",
        "# LoRA config\n",
        "peft_config = LoraConfig(\n",
        "    task_type=\"CAUSAL_LM\",\n",
        "    inference_mode=False,\n",
        "    r=8,\n",
        "    lora_alpha=32,\n",
        "    lora_dropout=0.1,\n",
        "    target_modules=lora_target_modules,\n",
        "    fan_in_fan_out=is_gpt2,  # GPT-2's Conv1D layers store weights as (fan_in, fan_out)\n",
        ")\n",
        "\n",
        "model = get_peft_model(model, peft_config)\n",
        "model.print_trainable_parameters()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 10. Trainer"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# transformers renamed `evaluation_strategy` to `eval_strategy` and later dropped\n",
        "# the old name; use whichever keyword the installed version defines.\n",
        "eval_strategy_key = (\n",
        "    \"eval_strategy\"\n",
        "    if \"eval_strategy\" in TrainingArguments.__dataclass_fields__\n",
        "    else \"evaluation_strategy\"\n",
        ")\n",
        "\n",
        "training_args = TrainingArguments(\n",
        "    output_dir=str(OUTPUT_DIR / \"checkpoint\"),\n",
        "    per_device_train_batch_size=2,\n",
        "    per_device_eval_batch_size=2,\n",
        "    num_train_epochs=3,\n",
        "    logging_steps=50,\n",
        "    save_strategy=\"epoch\",\n",
        "    learning_rate=2e-5,\n",
        "    weight_decay=0.01,\n",
        "    fp16=False,\n",
        "    bf16=False,\n",
        "    push_to_hub=False,\n",
        "    save_total_limit=2,\n",
        "    report_to=[],  # turn off wandb and other loggers when not needed\n",
        "    **{eval_strategy_key: \"epoch\"},\n",
        ")\n",
        "\n",
        "trainer_kwargs = dict(\n",
        "    model=model,\n",
        "    args=training_args,\n",
        "    train_dataset=train_tokenized,\n",
        "    eval_dataset=eval_tokenized,\n",
        ")\n",
        "try:\n",
        "    # `processing_class` is the current name of the deprecated `tokenizer` argument.\n",
        "    trainer = Trainer(processing_class=tokenizer, **trainer_kwargs)\n",
        "except TypeError:\n",
        "    # Older transformers releases only know the `tokenizer` keyword.\n",
        "    trainer = Trainer(tokenizer=tokenizer, **trainer_kwargs)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 11. Huấn luyện (bỏ comment để chạy)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# trainer.train()\n",
        "# trainer.save_model(OUTPUT_DIR / \"final_model\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 12. Tạo báo cáo và slide"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "def write_report():\n",
        "    \"\"\"Write a Markdown summary of the data split and model setup to OUTPUT_DIR/report.md.\"\"\"\n",
        "    lines = [\n",
        "        \"# VLSP Medical Domain — Fine-tuning Qwen với LoRA\",\n",
        "        \"\",\n",
        "        \"## Dữ liệu\",\n",
        "        f\"- Số mẫu train: {len(train_ds)}\",\n",
        "        f\"- Số mẫu eval: {len(eval_ds)}\",\n",
        "        \"\",\n",
        "        \"## Model\",\n",
        "        f\"- Base model: {MODEL_NAME}\",\n",
        "        \"- LoRA: r=8, alpha=32\",\n",
        "        \"\",\n",
        "        \"## Kết quả\",\n",
        "        \"- Chưa chạy đánh giá (cần chạy trainer.train() trước)\",\n",
        "        \"\",  # keeps the trailing newline at the end of the file\n",
        "    ]\n",
        "    report_path = OUTPUT_DIR / \"report.md\"\n",
        "    report_path.write_text(\"\\n\".join(lines), encoding=\"utf-8\")\n",
        "\n",
        "def create_pptx():\n",
        "    \"\"\"Build a two-slide deck (title + overview) and save it to OUTPUT_DIR/slides.pptx.\"\"\"\n",
        "    deck = Presentation()\n",
        "    title_layout, content_layout = deck.slide_layouts[0], deck.slide_layouts[1]\n",
        "\n",
        "    # Slide 1: title and author line.\n",
        "    title_slide = deck.slides.add_slide(title_layout)\n",
        "    title_slide.shapes.title.text = \"VLSP Medical — Fine-tuning Qwen với LoRA\"\n",
        "    title_slide.placeholders[1].text = \"Hoàng Minh\\nAuto-generated\"\n",
        "\n",
        "    # Slide 2: dataset sizes and base model.\n",
        "    overview_slide = deck.slides.add_slide(content_layout)\n",
        "    overview_slide.shapes.title.text = \"Tổng quan\"\n",
        "    overview_text = f\"Train: {len(train_ds)} mẫu\\nEval: {len(eval_ds)} mẫu\\nModel: {MODEL_NAME}\"\n",
        "    overview_slide.shapes.placeholders[1].text_frame.text = overview_text\n",
        "\n",
        "    deck.save(OUTPUT_DIR / \"slides.pptx\")\n",
        "\n",
        "write_report()\n",
        "create_pptx()\n",
        "\n",
        "print(\"Đã tạo báo cáo và slide!\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Hoàn tất!\n",
        "Notebook đã sẵn sàng để chạy trên Google Colab, Kaggle hoặc máy có GPU.\n",
        "Chúc bạn đạt điểm cao VLSP!"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "version": "3.10.12"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}

Writing VLSP_Qwen_LoRA_Medical_Summarization.ipynb


In [3]:
import os
print(os.listdir('.'))

['.config', 'VLSP_Qwen_LoRA_Medical_Summarization.ipynb', 'sample_data']
