In [None]:
! pip install accelerate
! pip install transformers
! pip install einops
! pip install datasets
! pip install peft
! pip install bitsandbytes

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/265.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/265.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0
Collecting einops
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.7.0
Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login widget. Credentials are needed because this
# notebook later trains with push_to_hub=True and calls trainer.push_to_hub().
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import os

# PYTORCH_CUDA_ALLOC_CONF is a comma-separated list of `option:value` pairs.
# The previous value ("caching_allocator") is not in that format; `backend:native`
# is the documented way to select PyTorch's own caching allocator.
# This is set before torch is imported in this notebook, so it is in place
# before any CUDA allocation happens.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:native"

### Load Dataset

In [None]:
from datasets import load_dataset

# Load the "main" configuration of GSM8K and display the split summary.
dataset = load_dataset(path="gsm8k", name="main")
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

In [None]:
# Fetch row 0 once instead of materialising the full "question" and "answer"
# columns just to read their first element.
first_example = dataset["train"][0]
print("question -> ", first_example["question"])
print("answer -> ", first_example["answer"])

question ->  Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
answer ->  Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72


In [None]:
# Convert both splits to pandas DataFrames, then display the training frame.
train_df, test_df = (
    dataset[split_name].to_pandas() for split_name in ("train", "test")
)
train_df

Unnamed: 0,question,answer
0,Natalia sold clips to 48 of her friends in Apr...,Natalia sold 48/2 = <<48/2=24>>24 clips in May...
1,Weng earns $12 an hour for babysitting. Yester...,Weng earns 12/60 = $<<12/60=0.2>>0.2 per minut...
2,Betty is saving money for a new wallet which c...,"In the beginning, Betty has only 100 / 2 = $<<..."
3,"Julie is reading a 120-page book. Yesterday, s...",Maila read 12 x 2 = <<12*2=24>>24 pages today....
4,James writes a 3-page letter to 2 different fr...,He writes each friend 3*2=<<3*2=6>>6 pages a w...
...,...,...
7468,"Very early this morning, Elise left home in a ...","For the distance she traveled, Elise paid 23 -..."
7469,Josh is saving up for a box of cookies. To rai...,He makes $.5 profit on each bracelet because 1...
7470,Colin can skip at six times the speed that Bra...,Tony can skip at twice the speed that Bruce ca...
7471,"Janet, a third grade teacher, is picking up th...",Janet needs 35 lunches for the kids + 5 for th...


### Tokenizer

In [None]:
from transformers import AutoTokenizer

# Load the phi-2 tokenizer (the repository's custom code is trusted).
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/phi-2",
    trust_remote_code=True,
)
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def tokenize(sample):
    """Tokenize the ``"text"`` field of a sample (or a batch of samples).

    Sequences are truncated to 512 tokens but deliberately NOT padded here.
    When this runs inside a batched ``Dataset.map``, ``padding=True`` would pad
    every example to the longest one in the whole map batch, inflating sequence
    lengths (and GPU memory) during training. Padding is left to the data
    collator, which pads each training batch only to its own longest sequence.

    Args:
        sample: Mapping with a ``"text"`` entry holding a string or list of strings.

    Returns:
        The tokenizer output for ``sample["text"]`` (includes ``input_ids`` and
        ``attention_mask``).
    """
    return tokenizer(sample["text"], truncation=True, max_length=512)

In [None]:
from datasets import Dataset

def tokenize_dataset_df(tokenizer, data_df):
    """Build prompt texts from a question/answer DataFrame and tokenize them.

    Each row becomes ``"question: <question> answer: <answer>"``. The input
    DataFrame is left untouched: the ``"text"`` column is added to a copy, so
    re-running the cell (or inspecting ``data_df`` afterwards) is not affected.

    Args:
        tokenizer: Callable applied by ``Dataset.map`` in batched mode. Despite
            the name, this is a tokenization *function* taking a batch mapping
            with a ``"text"`` key.
        data_df: DataFrame with string columns ``"question"`` and ``"answer"``.

    Returns:
        A ``Dataset`` containing only the columns produced by ``tokenizer``;
        all original columns are removed.
    """
    # Vectorised string concatenation instead of a row-wise apply.
    prompts = "question: " + data_df["question"] + " answer: " + data_df["answer"]
    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    data = Dataset.from_pandas(data_df.assign(text=prompts))
    tokenized_data = data.map(
        tokenizer,
        batched=True,
        desc="Tokenizing data",
        remove_columns=data.column_names,
    )
    return tokenized_data

In [None]:
# Tokenize both splits with the same prompt template, then show the training set summary.
train_tokenized = tokenize_dataset_df(tokenizer=tokenize, data_df=train_df)
test_tokenized = tokenize_dataset_df(tokenizer=tokenize, data_df=test_df)
train_tokenized

Tokenizing data:   0%|          | 0/7473 [00:00<?, ? examples/s]

Tokenizing data:   0%|          | 0/1319 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 7473
})

### Data Collator

In [None]:
from transformers import DataCollatorForLanguageModeling

# Causal-LM collation (mlm=False): no masked-language-modeling objective.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

### Model Bits Config

In [None]:
import torch
from transformers import BitsAndBytesConfig

# 4-bit NF4 quantization with double quantization; compute runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

### Causal LM Model

In [None]:
from transformers import AutoModelForCausalLM

# Load phi-2 quantized per bnb_config, placing the whole model on device 0.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    quantization_config=bnb_config,
    device_map={"": 0},
    trust_remote_code=True,
)

model

### LoRA Config

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapters (rank 16) on the attention projections named "Wqkv" and "out_proj".
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["Wqkv", "out_proj"],
)

# Wrap the base model with the adapters and report how many parameters will train.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 7,864,320 || all params: 2,787,548,160 || trainable%: 0.2821231974697076


In [None]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PhiForCausalLM(
      (transformer): PhiModel(
        (embd): Embedding(
          (wte): Embedding(51200, 2560)
          (drop): Dropout(p=0.0, inplace=False)
        )
        (h): ModuleList(
          (0-31): 32 x ParallelBlock(
            (ln): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
            (resid_dropout): Dropout(p=0.1, inplace=False)
            (mixer): MHA(
              (rotary_emb): RotaryEmbedding()
              (Wqkv): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2560, out_features=7680, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=7680, b

### Trainer & Training Arguments

In [None]:
from transformers import TrainingArguments, Trainer

model_output_dir = "phi-2-finetuned-gsm8k"

training_arguments = TrainingArguments(
    output_dir=model_output_dir,
    # Micro-batch of 1 with 2 accumulation steps keeps the effective batch size
    # at 2 while roughly halving per-step activation memory. The recorded run of
    # this notebook ended in an OutOfMemoryError; this may not be sufficient on
    # every GPU, but it does not change the optimisation schedule.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=2,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    # A positive max_steps overrides num_train_epochs, so the epoch count that
    # used to be passed alongside it had no effect and has been dropped.
    max_steps=1000,
    push_to_hub=True,
)

In [None]:
# Assemble the Trainer from the PEFT-wrapped model, tokenized splits, and collator.
trainer = Trainer(
    model=model,
    args=training_arguments,
    data_collator=data_collator,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
)

### Start Training

In [None]:
# Run fine-tuning; the call's return value is displayed as the cell output.
trainer.train()

OutOfMemoryError: ignored

### Push to hub

In [None]:
# Push the trainer's results to the Hugging Face Hub (requires the login performed earlier).
trainer.push_to_hub()

### Saving

In [None]:
# from peft import PeftModel
# from transformers import AutoModelForCausalLM
# import torch

# model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", trust_remote_code=True, torch_dtype=torch.float32)
# peft_model = PeftModel.from_pretrained(model, "MohamedAhmedAE/phi-2-finetuned-gsm8k", from_transformers=True)
# model = peft_model.merge_and_unload()
# model

In [None]:
# model.push_to_hub("MohamedAhmedAE/phi-2-finetuned-gsm8k")

### Inference

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Base phi-2 tokenizer plus the fine-tuned weights from the Hub, loaded in float32.
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/phi-2",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "MohamedAhmedAE/phi-2-finetuned-gsm8k",
    torch_dtype=torch.float32,
    trust_remote_code=True,
)


In [None]:
# The prompt must match the training template ("question: <q> answer: <a>").
# The previous triple-quoted literal embedded a leading newline, mid-sentence
# line breaks with indentation, and trailing whitespace after "answer:" -- none
# of which appear in the fine-tuning texts. Adjacent string literals keep the
# source readable while producing a single clean line.
prompt = (
    "question: I have 2 apples. My friend gave me another two apples. "
    "I ate 1 apple. Totally how many I have now? answer:"
)
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    return_attention_mask=False)

In [None]:
# Generate with max_length=512, then decode and print the single returned sequence.
outputs = model.generate(**inputs, max_length=512)
text = tokenizer.decode(outputs[0])
print(text)