# Installing Dependencies

In [None]:
!pip install wandb

In [None]:
! pip install accelerate transformers einops datasets peft bitsandbytes

In [None]:
# Log in to the Hugging Face Hub through the interactive notebook widget.
# NOTE(review): presumably required by the push_to_hub calls later in this
# notebook — confirm the token has write access.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Importing Dependencies

In [None]:
import torch
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
import os

# Finetuning

In [None]:
# Load the tokenizer that ships with the microsoft/phi-1_5 checkpoint.
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token so batches can be padded.
# NOTE(review): with pad == eos, a collator that masks pad-token labels will
# also mask any real EOS token in the data — confirm this is intended.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Quantization settings: 4-bit NF4 weights with double quantization,
# float16 used as the compute dtype.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.float16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

# Load the quantized base model; the device map {"": 0} assigns the whole
# model to device index 0.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-1_5",
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map={"": 0},
)

In [None]:
# Display the module tree of the freshly loaded (quantized) base model.
model

In [None]:
# LoRA adapter settings for causal language modelling: rank 16, alpha 16,
# 5% dropout, no bias training, applied to the "Wqkv" and "out_proj" modules.
lora_options = dict(
    task_type="CAUSAL_LM",
    target_modules=["Wqkv", "out_proj"],
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)
config = LoraConfig(**lora_options)

# Wrap the base model with the LoRA adapters and report how many parameters
# will actually be trained.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 4,718,592 || all params: 1,422,989,312 || trainable%: 0.3315971497613047


In [None]:
# Print the module tree again, now that `model` is the PEFT-wrapped model
# returned by get_peft_model.
print(model)

In [None]:
def tokenize(sample):
    """Tokenize the ``"text"`` field of a (batched) dataset sample.

    Args:
        sample: Mapping with a ``"text"`` entry holding the string(s) to encode.

    Returns:
        The tokenizer output for ``sample["text"]``, truncated to at most
        512 tokens per sequence.

    Sequences are deliberately left unpadded here. Padding inside a batched
    ``Dataset.map`` would stretch every example to the longest text of its
    whole tokenization batch; the data collator used at training time pads
    each training batch to its own longest sequence instead.

    NOTE(review): no end-of-sequence token is appended to the text, so the
    training examples carry no explicit "stop" signal — confirm whether
    that is intended.
    """
    return tokenizer(sample["text"], truncation=True, max_length=512)

In [None]:
# Load the training split of the GSM8K "main" configuration.
data = load_dataset("gsm8k", "main", split="train")

# Build one training string per example in the form
# "question: <question> answer: <answer>". The concatenation is done on whole
# columns (vectorized) instead of a row-wise apply(axis=1).
data_df = data.to_pandas()
data_df["text"] = "question: " + data_df["question"] + " answer: " + data_df["answer"]
data = Dataset.from_pandas(data_df)

# Tokenize in batches and drop the original columns so that only the
# tokenizer outputs remain in the dataset.
tokenized_data = data.map(tokenize, batched=True, desc="Tokenizing data", remove_columns=data.column_names)
tokenized_data

Downloading builder script:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/242k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Tokenizing data:   0%|          | 0/7473 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 7473
})

In [None]:
import wandb

# Do not hardcode the API key in the notebook. Called without `key`,
# wandb.login() picks up credentials from the WANDB_API_KEY environment
# variable or previously stored credentials, and otherwise prompts for them.
wandb.login()
run = wandb.init(project='Fine tuning Phi', job_type="training", anonymous="allow")

In [None]:
# Training configuration; checkpoints are written to (and pushed to the Hub
# from) the "Microsoft_Phi_gsm8k" output directory.
training_arguments = TrainingArguments(
        "Microsoft_Phi_gsm8k",
        per_device_train_batch_size=4,
        gradient_accumulation_steps=1,
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        save_strategy="epoch",
        logging_steps=100,
        # The run length is set by max_steps alone. A positive max_steps
        # overrides num_train_epochs, so the previously passed
        # num_train_epochs=1 was ignored (the logged run stopped at
        # epoch 0.54) and has been removed to avoid the contradiction.
        max_steps=1000,
        report_to="wandb",
        push_to_hub=True
    )


In [None]:
# Collator for causal language modelling (mlm=False), built from the
# notebook's tokenizer.
causal_lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Fine-tune the LoRA-wrapped model on the tokenized GSM8K training data.
trainer = Trainer(
    args=training_arguments,
    model=model,
    data_collator=causal_lm_collator,
    train_dataset=tokenized_data,
)
trainer.train()

Step,Training Loss
100,0.9033
200,0.9089
300,0.8859
400,0.9318
500,0.9168
600,0.959
700,0.9565
800,0.9915
900,0.9856
1000,1.0077


TrainOutput(global_step=1000, training_loss=0.9447008438110351, metrics={'train_runtime': 362.0735, 'train_samples_per_second': 11.047, 'train_steps_per_second': 2.762, 'total_flos': 1.324548377174016e+16, 'train_loss': 0.9447008438110351, 'epoch': 0.54})

In [None]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Reload the base checkpoint in float32 (no quantization config is passed here).
base_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-1_5",
    torch_dtype=torch.float32,
    trust_remote_code=True,
)

# Attach the fine-tuned adapter stored in the Hub repository.
# NOTE(review): `from_transformers` does not look like a documented
# PeftModel.from_pretrained parameter — confirm it is needed.
peft_model = PeftModel.from_pretrained(
    base_model,
    "hamadandrabi/Microsoft_Phi_gsm8k",
    from_transformers=True,
)

# Merge the adapter into the base model and keep the result as `model`.
model = peft_model.merge_and_unload()
model

MixFormerSequentialForCausalLM(
  (layers): Sequential(
    (0): Embedding(
      (wte): Embedding(51200, 2048)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (1): ParallelBlock(
      (ln): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
      (resid_dropout): Dropout(p=0.0, inplace=False)
      (mixer): MHA(
        (rotary_emb): RotaryEmbedding()
        (Wqkv): Linear(in_features=2048, out_features=6144, bias=True)
        (out_proj): Linear(in_features=2048, out_features=2048, bias=True)
        (inner_attn): SelfAttention(
          (drop): Dropout(p=0.0, inplace=False)
        )
        (inner_cross_attn): CrossAttention(
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
      (mlp): MLP(
        (fc1): Linear(in_features=2048, out_features=8192, bias=True)
        (fc2): Linear(in_features=8192, out_features=2048, bias=True)
        (act): NewGELUActivation()
      )
    )
    (2): ParallelBlock(
      (ln): LayerNorm((2048,), eps=1e-05, elementwis

In [None]:
# Upload the merged model to the Hub.
# NOTE(review): this is the same repository id the adapter was loaded from in
# the merge cell, so the repo ends up holding both — confirm this is intended.
model.push_to_hub("hamadandrabi/Microsoft_Phi_gsm8k")

pytorch_model.bin:   0%|          | 0.00/5.67G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/hamadandrabi/Microsoft_Phi_gsm8k/commit/81a129f032ec62d6f2450c9d7b057b0674d4691b', commit_message='Upload MixFormerSequentialForCausalLM', commit_description='', oid='81a129f032ec62d6f2450c9d7b057b0674d4691b', pr_url=None, pr_revision=None, pr_num=None)

In [None]:

import re

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the merged fine-tuned model from the Hub, and the tokenizer from the
# original base checkpoint.
model = AutoModelForCausalLM.from_pretrained("hamadandrabi/Microsoft_Phi_gsm8k", trust_remote_code=True, torch_dtype=torch.float32)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)

# The prompt uses the same "question: ... answer: " layout as the training text.
prompt = '''question: I had $20. I paid $15 as rent. How much money do I have left? answer: '''
inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False)

outputs = model.generate(**inputs, max_length=512)
text = tokenizer.batch_decode(outputs)[0]

# The model keeps repeating the "#### <answer>" line until max_length is
# reached, so keep only the text up to the end of the first "####" line that
# follows the prompt. If no such line is generated, the text is left as is.
search_from = len(prompt) if text.startswith(prompt) else 0
final_answer = re.search(r"####[^\n]*", text[search_from:])
if final_answer:
    text = text[:search_from + final_answer.end()]
print(text)


question: I had $20. I paid $15 as rent. How much money do I have left? answer: 
I have $20 - $15 = $<<20-15=5>>5 left.
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 5
#### 