In [1]:
!pip install transformers
!pip install accelerate
!pip install datasets
!pip install bitsandbytes
!pip install trl
!pip install py7zr
!pip install auto-gptq
!pip install optimum

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0
Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.8 

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Load Dataset

In [3]:
from datasets import load_dataset

data = load_dataset("samsum")
data

Downloading data:   0%|          | 0.00/6.06M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/347k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/335k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [4]:
data["train"]["dialogue"][0]

"Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)"

In [5]:
data["train"]["summary"][0]

'Amanda baked cookies and will bring Jerry some tomorrow.'

In [6]:
def generate_text(batch):
    return {"text": "###Human: Summarize this following dialogue: " + batch["dialogue"] + "\n###Assistant: " + batch["summary"]}

In [7]:
data = data.map(generate_text)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [8]:
data["train"]["text"][0]

"###Human: Summarize this following dialogue: Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)\n###Assistant: Amanda baked cookies and will bring Jerry some tomorrow."

In [34]:
model_ckpt_path = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"

### Tokenizer

In [35]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_ckpt_path)
tokenizer.pad_token = tokenizer.eos_token

### GPTQ Config

In [36]:
from transformers import GPTQConfig

quantization_config = GPTQConfig(bits=4,
                                 disable_exllama=True,
                                 tokenizer=tokenizer)

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


### Causal LM Model

In [37]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(model_ckpt_path,
                                             quantization_config=quantization_config,
                                             device_map="auto")

You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. use_exllama, exllama_config, use_cuda_fp16, max_input_length) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.


In [38]:
model.config.use_cache=False
model.config.pretraining_tp=1
model.gradient_checkpointing_enable()

### Peft Config

In [39]:
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)

In [40]:
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(r=16,
                         lora_alpha=16,
                         lora_dropout=0.05,
                         bias="none",
                         task_type="CAUSAL_LM",
                         target_modules=["q_proj", "v_proj"])

In [41]:
model = get_peft_model(model, peft_config)

model.print_trainable_parameters()

trainable params: 6,815,744 || all params: 269,225,984 || trainable%: 2.5316070532033046


### Training Arguments

In [42]:
from transformers import TrainingArguments

training_arguments = TrainingArguments(output_dir="mistral-finetuned-samsum",
                                       per_device_train_batch_size=8,
                                       per_device_eval_batch_size=8,
                                       gradient_accumulation_steps=1,
                                       optim="paged_adamw_32bit",
                                       learning_rate=2e-4,
                                       lr_scheduler_type="cosine",
                                       save_strategy="epoch",
                                       evaluation_strategy="steps",
                                       eval_steps=500,
                                       logging_steps=100,
                                       num_train_epochs=1,
                                       max_steps=250,
                                       fp16=True)

### Trainer

In [43]:
from trl import SFTTrainer

trainer = SFTTrainer(model=model,
                     train_dataset=data["train"],
                     eval_dataset=data["validation"],
                     peft_config=peft_config,
                     dataset_text_field="text",
                     args=training_arguments,
                     tokenizer=tokenizer,
                     packing=False,
                     max_seq_length=512)

Map:   0%|          | 0/818 [00:00<?, ? examples/s]



In [44]:
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


TrainOutput(global_step=250, training_loss=1.804910614013672, metrics={'train_runtime': 2608.4499, 'train_samples_per_second': 0.767, 'train_steps_per_second': 0.096, 'total_flos': 664670974574592.0, 'train_loss': 1.804910614013672, 'epoch': 0.14})

In [45]:
trainer.push_to_hub()

Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

events.out.tfevents.1704032455.b29e341fa2b2.235.1:   0%|          | 0.00/4.82k [00:00<?, ?B/s]

events.out.tfevents.1704032265.b29e341fa2b2.235.0:   0%|          | 0.00/4.82k [00:00<?, ?B/s]

events.out.tfevents.1704032664.b29e341fa2b2.235.2:   0%|          | 0.00/5.49k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

'https://huggingface.co/MohamedAhmedAE/mistral-finetuned-samsum/tree/main/'

### Inference

In [47]:
from time import perf_counter
import torch
from peft import AutoPeftModelForCausalLM
from transformers import GenerationConfig
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("MohamedAhmedAE/mistral-finetuned-samsum")
inputs = tokenizer("""
###Human: Summarize this following dialogue: Vasanth: I'm at the railway station in Chennai Karthik: No problems so far? Vasanth: no, everything's going smoothly Karthik: good. lets meet there soon!
###Assistant:
""", return_tensors="pt").to("cuda")

tokenizer_config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

In [52]:
model = AutoPeftModelForCausalLM.from_pretrained("MohamedAhmedAE/mistral-finetuned-samsum",
                                                 low_cpu_mem_usage=True,
                                                 return_dict=True,
                                                 torch_dtype=torch.float16,
                                                 device_map="cuda")

adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

In [54]:
generation_config = GenerationConfig(do_sample=True,
                                     top_k=1,
                                     temperature=0.1,
                                     max_new_tokens=25,
                                     pad_token_id=tokenizer.eos_token_id)

In [55]:
start_time = perf_counter()
outputs = model.generate(**inputs, generation_config=generation_config)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
print(f"Inference take : {perf_counter()-start_time}")


###Human: Summarize this following dialogue: Vasanth: I'm at the railway station in Chennai Karthik: No problems so far? Vasanth: no, everything's going smoothly Karthik: good. lets meet there soon!
###Assistant: 
###Summary: Vasanth is at the railway station in Chennai. Everything is going smoothly. Karthik will meet
Inference take : 5.886077391000072
