In [1]:
!pip install -qqq torch==2.0.1 --progress-bar off
!pip install -qqq transformers==4.32.1 --progress-bar off
!pip install -qqq datasets==2.14.4 --progress-bar off
!pip install -qqq peft==0.5.0 --progress-bar off
!pip install -qqq bitsandbytes==0.41.1 --progress-bar off
!pip install -qqq trl==0.7.1 --progress-bar off

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for lit (pyproject.toml) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.1.0+cu118 requires torch==2.1.0, but you have torch 2.0.1 which is incompatible.
torchdata 0.7.0 requires torch==2.1.0, but you have torch 2.0.1 which is incompatible.
torchtext 0.16.0 requires torch==2.1.0, but you have torch 2.0.1 which is incompatible.
torchvision 0.16.0+cu118 requires torch==2.1.0, but you have torch 2.0.1 which is incompatible.[0m[31m
[0m

In [2]:
import re
import json
from pprint import pprint

import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import login
from peft import LoraConfig, PeftModel
from transformers import(
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from trl import SFTTrainer

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"

In [3]:
base_url = "https://raw.githubusercontent.com/NLP7-LegalEagle/LegalEagle-Dataset/main/instruction_datasets/"
dataset = load_dataset("csv", data_files={"train": base_url + "dataset_train.csv",
                                           "validation": base_url + "dataset_validation.csv",
                                            "test": base_url + "dataset_test.csv"})

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/20.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.58M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [4]:
login("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")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
def create_model_and_tokenizer():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        use_safetensors=True,
        use_cache=False,
        quantization_config=bnb_config,
        trust_remote_code=True,
        device_map={"": 0},
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return model, tokenizer

In [6]:
base_model, tokenizer = create_model_and_tokenizer()
base_model.config.use_cahce = False

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [7]:
import gc
torch.cuda.empty_cache()
gc.collect()

109

In [8]:
lora_alpha = 64
lora_dropout = 0.05
lora_r = 16

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

In [9]:
output_dir = "LE_Project"

In [13]:
training_arguments = TrainingArguments(
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4,
    max_steps=1000,
    optim="paged_adamw_32bit",
    logging_steps=1,
    learning_rate=1e-4,
    fp16=True,
    max_grad_norm = 0.3,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=0.2,
    warmup_ratio=0.5,
    save_strategy="steps",
    group_by_length=True,
    output_dir=output_dir,
    save_safetensors=True,
    lr_scheduler_type="linear",
    seed=42,
)

In [10]:
import os
from transformers import TrainerCallback

In [11]:
class PeftSavingCallback(TrainerCallback):
    def on_save(self, args, state, control, **kwargs):
        checkpoint_path = os.path.join(args.output_dir, f"checkpoint-{state.global_step}")
        kwargs["model"].save_pretrained(checkpoint_path)

        if "LegalEagle.bin" in os.listdir(checkpoint_path):
            os.remove(os.path.join(checkpoint_path, "LegalEagle.bin"))
callbacks = [PeftSavingCallback()]

In [14]:
trainer = SFTTrainer(
    model=base_model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    peft_config=peft_config,
    dataset_text_field= "result",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
    callbacks=callbacks,
)



Map:   0%|          | 0/52409 [00:00<?, ? examples/s]

Map:   0%|          | 0/6551 [00:00<?, ? examples/s]

In [15]:
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
200,2.1247,1.543139
400,1.8759,1.478206


Step,Training Loss,Validation Loss
200,2.1247,1.543139
400,1.8759,1.478206
600,1.9082,1.455954
800,1.8661,1.443032
1000,1.8608,1.437475


TrainOutput(global_step=1000, training_loss=1.6160487890243531, metrics={'train_runtime': 9853.8706, 'train_samples_per_second': 1.624, 'train_steps_per_second': 0.101, 'total_flos': 1.1137047123915571e+17, 'train_loss': 1.6160487890243531, 'epoch': 0.31})

In [16]:
trainer.save_model()

In [17]:
trainer.model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (v_proj): Linea

In [None]:
# !git lfs install
# !git clone https://panwoo00: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@huggingface.co/panwoo00/LE_project


In [None]:
# from peft import PeftModel, PeftConfig, AutoPeftModelForCausalLM
# from transformers import AutoModelForCausalLM

# config = PeftConfig.from_pretrained("panwoo00/LE_project")
# model = AutoPeftModelForCausalLM.from_pretrained("panwoo00/LE_project")

In [18]:
from peft import AutoPeftModelForCausalLM

model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    low_cpu_mem_usage = True,
)

model  = model.merge_and_unload()
model.save_pretrained("merged_model", safe_serialization=True)
tokenizer.save_pretrained("merged_model")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

('merged_model/tokenizer_config.json',
 'merged_model/special_tokens_map.json',
 'merged_model/tokenizer.json')

In [19]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_

In [20]:
model.push_to_hub(
    "panwoo00/LE_project2",
    use_temp_dir=True,
    use_auth_token="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"
)



pytorch_model-00001-of-00003.bin:   0%|          | 0.00/9.88G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

pytorch_model-00003-of-00003.bin:   0%|          | 0.00/7.18G [00:00<?, ?B/s]

pytorch_model-00002-of-00003.bin:   0%|          | 0.00/9.89G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/panwoo00/LE_project2/commit/1adaaca847c221d2cc4093501286ea078b9da183', commit_message='Upload LlamaForCausalLM', commit_description='', oid='1adaaca847c221d2cc4093501286ea078b9da183', pr_url=None, pr_revision=None, pr_num=None)

In [21]:
from transformers import pipeline

In [22]:
test = dataset['test']['result']

In [23]:
test[0]

'### Instruction:\n    Use the Input below to create an instruction, which could have been used to generate the input using an LLM.\n\n    ### Input:\n    was acting to obtain a benefit on behalf of a charitable ... organization.” U.S.S.G. § 2B1.1 cmt. 8(B). As the district court saw it and as the government sees it, Webster deserves the enhancement. He pretended to “act[ ] on behalf of a charitable ... organization,” U.S.S.G. § 2Bl.l(b)(9)(A), when he solicited personal information from the victims on behalf of fake charities. As Webster sees it, the enhancement does not apply. In his view, the commentary limits the application of the charity enhancement, and he was not acting to obtain a benefit on behalf of a charitable organization (as the commentary seems to require). As a general matter, the text of a guideline trumps commentary about it. See Stinson v. United States, 508 U.S. 36, 38, 113 S.Ct. 1913, 123 L.Ed.2d 598 (1993) (<HOLDING>). But we need not resolve whether the\n\n    #

In [24]:
prompt = "was acting to obtain a benefit on behalf of a charitable ... organization.” U.S.S.G. § 2B1.1 cmt. 8(B). As the district court saw it and as the government sees it, Webster deserves the enhancement. He pretended to “act[ ] on behalf of a charitable ... organization,” U.S.S.G. § 2Bl.l(b)(9)(A), when he solicited personal information from the victims on behalf of fake charities. As Webster sees it, the enhancement does not apply. In his view, the commentary limits the application of the charity enhancement, and he was not acting to obtain a benefit on behalf of a charitable organization (as the commentary seems to require). As a general matter, the text of a guideline trumps commentary about it. See Stinson v. United States, 508 U.S. 36, 38, 113 S.Ct. 1913, 123 L.Ed.2d 598 (1993) (<HOLDING>). But we need not resolve whether the"

In [25]:
# before fine tuning
pipe = pipeline(task="text-generation",
                model=base_model,
                tokenizer=tokenizer,
)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] was acting to obtain a benefit on behalf of a charitable ... organization.” U.S.S.G. § 2B1.1 cmt. 8(B). As the district court saw it and as the government sees it, Webster deserves the enhancement. He pretended to “act[ ] on behalf of a charitable ... organization,” U.S.S.G. § 2Bl.l(b)(9)(A), when he solicited personal information from the victims on behalf of fake charities. As Webster sees it, the enhancement does not apply. In his view, the commentary limits the application of the charity enhancement, and he was not acting to obtain a benefit on behalf of a charitable organization (as the commentary seems to require). As a general matter, the text of a guideline trumps commentary about it. See Stinson v. United States, 508 U.S. 36, 38, 113 S.Ct. 1913, 123 L.Ed.2d 598 (1993) (<HOLDING>). But we need not resolve whether the [/INST]  I agree with the government that the district court erred in failing to apply the enhancement for Webster’s conduct. The enhancement is availabl

In [26]:
# after
pipe = pipeline(task="text-generation",
                model=model,
                tokenizer=tokenizer,
)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] was acting to obtain a benefit on behalf of a charitable ... organization.” U.S.S.G. § 2B1.1 cmt. 8(B). As the district court saw it and as the government sees it, Webster deserves the enhancement. He pretended to “act[ ] on behalf of a charitable ... organization,” U.S.S.G. § 2Bl.l(b)(9)(A), when he solicited personal information from the victims on behalf of fake charities. As Webster sees it, the enhancement does not apply. In his view, the commentary limits the application of the charity enhancement, and he was not acting to obtain a benefit on behalf of a charitable organization (as the commentary seems to require). As a general matter, the text of a guideline trumps commentary about it. See Stinson v. United States, 508 U.S. 36, 38, 113 S.Ct. 1913, 123 L.Ed.2d 598 (1993) (<HOLDING>). But we need not resolve whether the [/INST]  The defendant in this case, Webster, was convicted of aggravated identity theft and mail fraud. The government moved for an enhancement of Webst