In [2]:
import torch
torch.cuda.is_available()

True

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2", 
    torch_dtype=torch.float16,
    device_map='auto',
)

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
print(model)

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiSdpaAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_layernorm): LayerNorm((256

In [5]:
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

In [6]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [7]:
from peft import LoraConfig, get_peft_model 

config = LoraConfig(
    r=4,
    lora_alpha=8,
    target_modules=["q_proj", "k_proj", "v_proj", "fc1", "fc2"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 5242880 || all params: 2784926720 || trainable%: 0.1882591725788749


## Loading Datasets

In [8]:
from datasets import load_dataset

qa_dataset = load_dataset("themanas021/MATH-Algebra", split="train")

Resolving data files:   0%|          | 0/100 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/100 [00:00<?, ?it/s]

In [9]:
qa_dataset

Dataset({
    features: ['problem', 'level', 'type', 'solution'],
    num_rows: 100
})

In [10]:
# Function to transform the row into desired format
def create_prompt(question, answer):
    # question = row['problem']
    # answer = row['solution']
    formatted_string = f"[INST] {question} [/INST] {answer} "
    return formatted_string

mapped_qa_dataset = qa_dataset.map(lambda samples: tokenizer(create_prompt(samples['problem'], samples['solution'])))

In [11]:
mapped_qa_dataset

Dataset({
    features: ['problem', 'level', 'type', 'solution', 'input_ids', 'attention_mask'],
    num_rows: 100
})

In [12]:
import pandas as pd
df = pd.DataFrame(mapped_qa_dataset)
df.head(2)

Unnamed: 0,problem,level,type,solution,input_ids,attention_mask
0,"Let \[f(x) = \left\{\n\begin{array}{cl} ax+3, ...",Level 5,Algebra,"For the piecewise function to be continuous, t...","[58, 38604, 60, 3914, 3467, 58, 69, 7, 87, 8, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,A rectangular band formation is a formation wi...,Level 5,Algebra,Let $x$ be the number of band members in each ...,"[58, 38604, 60, 317, 36954, 4097, 9978, 318, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


## Train Lora

In [13]:
mapped_qa_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])


In [15]:
# Assign a padding token if it is not set
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Verify tokenizer configuration
print(f"Padding token: {tokenizer.pad_token}, ID: {tokenizer.pad_token_id}")


Padding token: [PAD], ID: 50295


In [17]:
import transformers

trainer = transformers.Trainer(
    model=model, 
    train_dataset=mapped_qa_dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4, 
        gradient_accumulation_steps=4,
        warmup_steps=100,
        max_steps=100,
        learning_rate=1e-3, 
        fp16=True,
        logging_steps=1, 
        output_dir='outputs',
        num_train_epochs = 3.0
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Step,Training Loss
1,1.3341
2,1.4658
3,1.4029
4,1.2002
5,1.4883
6,1.2767
7,1.6628
8,1.2993
9,1.4946
10,1.4339


TrainOutput(global_step=100, training_loss=0.7716015933454037, metrics={'train_runtime': 569.9099, 'train_samples_per_second': 2.807, 'train_steps_per_second': 0.175, 'total_flos': 9069622813532160.0, 'train_loss': 0.7716015933454037, 'epoch': 14.32})

In [18]:
HUGGING_FACE_USER_NAME = "alam1n"

In [19]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
model_name = "phi-2-cse499a"

model.push_to_hub(f"{HUGGING_FACE_USER_NAME}/{model_name}", use_auth_token=True)



adapter_model.safetensors:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/alam1n/phi-2-cse499a/commit/5fa2dc5aaa9e76866df82a05a8b55feb8f573776', commit_message='Upload model', commit_description='', oid='5fa2dc5aaa9e76866df82a05a8b55feb8f573776', pr_url=None, repo_url=RepoUrl('https://huggingface.co/alam1n/phi-2-cse499a', endpoint='https://huggingface.co', repo_type='model', repo_id='alam1n/phi-2-cse499a'), pr_revision=None, pr_num=None)

In [21]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_model_id = f"{HUGGING_FACE_USER_NAME}/{model_name}"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=False, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
qa_model = PeftModel.from_pretrained(model, peft_model_id)

adapter_config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


adapter_model.safetensors:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [24]:
qa_model.config.pad_token_id = qa_model.config.eos_token_id  # Set pad_token_id to eos_token_id explicitly


In [27]:
from IPython.display import display, Markdown

def make_inference(question):
    batch = tokenizer(f"### Question\n{question}\n\n### ANSWER\n", return_tensors='pt').to("cuda")

    with torch.cuda.amp.autocast():
        output_tokens = qa_model.generate(**batch, max_new_tokens=200)

    display(Markdown((tokenizer.decode(output_tokens[0], skip_special_tokens=True))))


In [28]:
question = "answer the following question What 2+2?"

make_inference(question)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


### Question
answer the following question What 2+2?

### ANSWER
4. 

We can use the addition chain to add 2 and 2.  First, add 1 and 1 to get 2.  Then add 2 and 1 to get 3.  Finally, add 3 and 2 to get 5.  Adding a second 2 to each number gives us $5 + 4 = \boxed{9}$. 

We could also use the number line to add 2 and 2.  Starting at 2, we can add 2 to get 4 and then add 2 more to get 6.  The answer is $\boxed{6}$. 

We could also use mental math.  Adding 2 and 2 is the same as adding 1 and 1 and then adding 1 more, so the answer is $\boxed{3}$. 

We could also use the distributive property.  We can rewrite $2+2+2+2$ as $(2+2)(2+1)$.  Adding 2 and 2

In [30]:
question = "A rectangular band formation is a formation with $m$ band members in each of $r$ rows, where $m$ and $r$ are integers. A particular band has less than 100 band members. The director arranges them in a rectangular formation and finds that he has two members left over. If he increases the number of members in each row by 1 and reduces the number of rows by 2, there are exactly enough places in the new formation for each band member. What is the largest number of members the band could have?"
make_inference(question)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


### Question
A rectangular band formation is a formation with $m$ band members in each of $r$ rows, where $m$ and $r$ are integers. A particular band has less than 100 band members. The director arranges them in a rectangular formation and finds that he has two members left over. If he increases the number of members in each row by 1 and reduces the number of rows by 2, there are exactly enough places in the new formation for each band member. What is the largest number of members the band could have?

### ANSWER
Let $x$ be the number of members in each row for the original formation.  Then, we have two equations: $$rx+2=100, (r-2)(x+1)=100.$$  Simplifying the first equation gives $r=98/x$.  Substituting into the second equation gives $(r-2)(x+1)=98,$ so $x+r-2=14$.  We want $r$ and $x$ to be integers, so we try some values for $r$ and calculate $x$.  If $r=10$, then $x=12$, which works.  Checking back in the formation, we see that it satisfies the conditions.  Therefore, the largest number of members the band could have is $\boxed{12}$. 

Challenge: See if you can find a quick solution to this problem by simply thinking about the two formations. 

---

In [32]:
question = "If $x = 2$ and $y = 5$, then what is the value of $\frac{x^4+2y^2}{6}$ ?"
make_inference(question)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


### Question
If $x = 2$ and $y = 5$, then what is the value of $rac{x^4+2y^2}{6}$?

### ANSWER
We have  \begin{align*}
x^4+2y^2&=2^4+2(5^2)=16+2(25)=16+50=\boxed{66}
\end{align*}Therefore,  \begin{align*}
\frac{x^4+2y^2}{6}&=\frac{\boxed{66}}{6}=\boxed{11}.
\end{align*} 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
