<a href="https://colab.research.google.com/github/MaSaKaIV/QAwithOwnLLM/blob/main/test_llama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Start up
---
### Installation

In [None]:
# Install the packages used by this notebook.
!pip install llama-index
!pip install transformers accelerate bitsandbytes
!pip install sentencepiece einops sentence_transformers
# peft is installed from its GitHub repository rather than from PyPI.
!pip install git+https://github.com/huggingface/peft.git
!pip install datasets

### LLM

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from langchain.llms import HuggingFacePipeline
import torch

# Identifier of the base model to load.
model_name = "stabilityai/japanese-stablelm-base-alpha-7b"

# Keyword arguments forwarded to `AutoModelForCausalLM.from_pretrained`.
model_kwargs = dict(
    trust_remote_code=True,
    device_map="auto",
    low_cpu_mem_usage=True,
    variant="int8",
    load_in_8bit=True,
)

# Prepare the tokenizer and the model.
tokenizer = AutoTokenizer.from_pretrained("novelai/nerdstash-tokenizer-v1", use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)

# Put the model into evaluation mode.
model.eval()

### Embedding Model

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index import LangchainEmbedding
from typing import Any, List

# HuggingFaceEmbeddings that adds the E5 role prefixes ("query: " / "passage: ")
class HuggingFaceQueryEmbeddings(HuggingFaceEmbeddings):
    """HuggingFaceEmbeddings that prepends the E5 role prefix to every text.

    E5 models are trained with a role prefix on each input: "query: " for
    search queries and "passage: " for the texts being searched. For a
    retrieval setup the indexed documents therefore get "passage: " and the
    question gets "query: ".
    """

    def __init__(self, **kwargs: Any):
        super().__init__(**kwargs)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed texts that will be indexed, each prefixed with "passage: ".

        Args:
            texts: Raw document chunks.

        Returns:
            One embedding vector per input text, in the same order.
        """
        return super().embed_documents(["passage: " + text for text in texts])

    def embed_query(self, text: str) -> List[float]:
        """Embed a search query, prefixed with "query: ".

        Args:
            text: Raw query text.

        Returns:
            The embedding vector of the prefixed query.
        """
        # Call the parent's embed_documents directly instead of its embed_query:
        # some LangChain releases implement embed_query as
        # `self.embed_documents([text])[0]`, which would dispatch back to the
        # override above and stack a second prefix in front of "query: ".
        return super().embed_documents(["query: " + text])[0]

# Prepare the embedding model: a HuggingFaceQueryEmbeddings instance for the
# multilingual E5 model, wrapped in LangchainEmbedding for use with LlamaIndex.
_e5_embeddings = HuggingFaceQueryEmbeddings(model_name="intfloat/multilingual-e5-large")
embed_model = LangchainEmbedding(_e5_embeddings)

# Instruction Tuning
---
### Tokenize datasets

In [None]:
import datasets

# Name of the instruction-tuning dataset passed to `datasets.load_dataset`.
dataset_name = "kunishou/databricks-dolly-15k-ja"
dataset = datasets.load_dataset(dataset_name)

data_max_length = 512 # limit to 512 tokens to reduce VRAM consumption

# Prompt template for samples that come with a context.
# Placeholders: {instruction}, {context}, {response}.
prompt_with_context_format = """The following text is the task instruction and the context for it.
Write a response that satisfies the instruction based on context.

### Instruction:
{instruction}

### Context:
{context}

### Response:
{response}
"""

# Prompt template for samples without a context.
# Placeholders: {instruction}, {response}.
# NOTE(review): the second sentence still says "based on context" although this
# template has no Context section — confirm whether that wording is intended.
prompt_no_context_format = """The following text is the task instruction.
Write a response that satisfies the instruction based on context.

### Instruction:
{instruction}

### Response:
{response}
"""

def tokenize(samples):
    """Build one training prompt per sample and tokenize the whole batch.

    Args:
        samples: A batch exposing parallel "instruction", "input" and
            "output" columns.

    Returns:
        The tokenizer output for the assembled prompts, truncated to
        `data_max_length` tokens and left unpadded.
    """

    def build_prompt(instruction, context, response):
        # Samples with a non-empty "input" use the template that has a
        # Context section; the others use the instruction-only template.
        if context:
            body = prompt_with_context_format.format(
                instruction=instruction, context=context, response=response
            )
        else:
            body = prompt_no_context_format.format(
                instruction=instruction, response=response
            )
        # Every prompt is terminated with the tokenizer's EOS token.
        return body + tokenizer.eos_token

    # Combine the instruction, input and output columns row by row.
    rows = zip(samples["instruction"], samples["input"], samples["output"])
    prompts = [build_prompt(*row) for row in rows]

    return tokenizer(prompts, padding=False, truncation=True, max_length=data_max_length)

# Tokenize the dataset in batches.
dataset = dataset.map(tokenize, batched=True)

### Tuning

In [None]:
# Freeze the base model.
for weight in model.parameters():
    weight.requires_grad_(False)
    # One-dimensional parameters are additionally cast to float32.
    if weight.dim() == 1:
        weight.data = weight.data.float()

# Adjustments intended to reduce VRAM consumption.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

class CastOutputToFloat(torch.nn.Sequential):
    """Sequential container whose forward output is converted to float32."""

    def forward(self, x):
        """Run the wrapped modules on `x` and return the result as float32."""
        result = super().forward(x)
        return result.float()

# Wrap the model's `embed_out` module so that its output is cast to float32.
model.embed_out = CastOutputToFloat(model.embed_out)

# Print the model structure.
print(model)

### Train

In [None]:
# LoRA training configuration using PEFT

import peft
from transformers import TrainingArguments, DataCollatorForLanguageModeling, Trainer

# LoRA settings; the adapters target the "query_key_value" modules.
lora_settings = {
    "task_type": peft.TaskType.CAUSAL_LM,
    "target_modules": ["query_key_value"],
    "r": 32,
    "lora_alpha": 64,
    "lora_dropout": 0.05,
    "bias": "none",
    "fan_in_fan_out": False,
}
peft_config = peft.LoraConfig(**lora_settings)

# Wrap the model with PEFT and report how many parameters are trainable.
model = peft.get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Training

training_args = TrainingArguments(
    # Output locations
    output_dir=".checkpoints",
    logging_dir="logs",
    push_to_hub=False,
    # Batching and memory
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True,
    # Schedule
    learning_rate=2e-4,
    warmup_steps=20,
    max_steps=200,
    num_train_epochs=1,
    # Checkpointing, logging and evaluation cadence
    save_strategy="steps",
    save_steps=50,
    save_total_limit=10,
    logging_steps=25,
    evaluation_strategy="no",
)

# Collator with masked-language-modelling disabled.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    data_collator=data_collator,
)

model.config.use_cache = False

# To resume training from a checkpoint, put the checkpoint name here.
checkpoint = None
# checkpoint = "checkpoint-100"

trainer.train(resume_from_checkpoint=checkpoint)

# Save the trained model to the "output" directory.
model.save_pretrained("output")

### Wrap the model in a LlamaIndex custom LLM class

In [None]:
import torch
from transformers import pipeline
from typing import Optional, List, Mapping, Any

from llama_index import (
    ServiceContext,
    SimpleDirectoryReader,
    LangchainEmbedding,
    ListIndex
)
from llama_index.callbacks import CallbackManager
from llama_index.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.llms.base import llm_completion_callback

# Context window size, reported through OurLLM.metadata.
context_window = 2048
# Number of output tokens, reported through OurLLM.metadata.
num_output = 256

class OurLLM(CustomLLM):
    """LlamaIndex LLM adapter around the module-level `model` and `tokenizer`.

    Only blocking completion is supported; streaming is not implemented.
    """

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=context_window,
            num_output=num_output,
            model_name=model_name
        )

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """Generate a sampled completion for `prompt`.

        Args:
            prompt: The text to continue.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            A CompletionResponse whose text is the newly generated
            continuation only (the prompt is not echoed back).
        """
        # Tokenize the actual prompt. Passing anything other than the encoded
        # prompt (e.g. a list containing a literal string) cannot be turned
        # into a tensor and makes generation fail.
        input_ids = tokenizer.encode(
            prompt,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(device=model.device)

        tokens = model.generate(
            input_ids=input_ids,
            max_new_tokens=128,
            temperature=0.8,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
        )

        # generate() returns the prompt followed by the continuation; keep only
        # the continuation so the response does not repeat the whole prompt,
        # and drop special tokens such as EOS from the decoded text.
        prompt_length = input_ids.shape[-1]
        generated = tokens[0][prompt_length:]
        text = tokenizer.decode(generated, skip_special_tokens=True)
        return CompletionResponse(text=text)

    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
        """Streaming completion is not supported.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError()

# Instantiate the custom LLM wrapper.
llm = OurLLM()

# Predict
---
### Load Documents

In [None]:
import os
from langchain.document_loaders import PyPDFLoader

# Folder that holds the source documents.
data_folder = "./data/"

# Load sample.pdf from that folder and split it into documents.
pdf_path = os.path.join(data_folder, "sample.pdf")
loader = PyPDFLoader(pdf_path)
documents = loader.load_and_split()

### Node Parser

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_index.node_parser import SimpleNodeParser

# Chunk splitting
# NOTE(review): the "- 7" presumably leaves room for a prefix prepended to each
# chunk before embedding — confirm against the embedding class.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=514-7,  # maximum number of characters per chunk
    chunk_overlap=20,  # maximum number of overlapping characters
)

# Prepare the node parser
node_parser = SimpleNodeParser(text_splitter=text_splitter)

### Service Context

In [None]:
from llama_index import ServiceContext

# Prepare the service context from the custom LLM, the embedding model and
# the node parser.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    node_parser=node_parser,
)

### Make Index

In [None]:
from llama_index import Document, VectorStoreIndex

# `documents` is produced by a LangChain loader, whereas `from_documents`
# works on LlamaIndex Document objects, so convert any LangChain documents
# first. Objects that already are LlamaIndex Documents are passed through.
llama_documents = [
    doc if isinstance(doc, Document) else Document.from_langchain_format(doc)
    for doc in documents
]

# Build the index
index = VectorStoreIndex.from_documents(
    llama_documents,
    service_context=service_context,
)

### QA Template

In [None]:
from llama_index.prompts.prompts import QuestionAnswerPrompt

# Prepare the QA template.
# Placeholders: {query_str} (the question) and {context_str} (the retrieved text).
# The Japanese preamble reads roughly: "Below is an instruction that describes a
# task, paired with an input that provides context. Write a response that
# appropriately satisfies the request."
# NOTE(review): the section headers here are Japanese, while the fine-tuning
# prompts use English "### Instruction:" / "### Context:" / "### Response:"
# headers — confirm that the mismatch is intended.
qa_template = QuestionAnswerPrompt("""以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。

### 指示:
{query_str}

### 入力:
{context_str}

### 応答:
""")

### Question and Answer session

In [None]:
# Create the query engine
query_engine = index.as_query_engine(
    similarity_top_k=3,
    text_qa_template=qa_template,
)

# Question answering
# (the query asks: "Summarize cloud security in 50 characters")
query_engine.query("クラウドセキュリティについて50文字でまとめて")