### Notebook Preparation
Install packages, download dataset



In [None]:
# Workaround for when `!` shell commands don't work in Colab:
# make `locale` always report UTF-8 as the preferred encoding.
import locale


def getpreferredencoding(do_setlocale=True):
    """Return "UTF-8" unconditionally; ``do_setlocale`` is accepted but ignored."""
    return "UTF-8"


locale.getpreferredencoding = getpreferredencoding

In [None]:
# install requirements
# NOTE(review): every `!` line runs in its own subshell, so the virtualenv created
# here is never active for the `pip install` calls below — presumably they install
# into the notebook kernel's environment; confirm that is the intent.
!pip3 install virtualenv
!virtualenv trelisEnv
!python -m pip install --upgrade pip -q
# Pinned versions of the training stack.
!pip install transformers==4.38.1 -q -U
# Prints the installed transformers / platform versions for reference.
!transformers-cli env
!pip install bitsandbytes==0.42.0 -q -U
!pip install peft==0.8.2 -q -U
!pip install accelerate==0.27.2 -q -U
!pip install flash-attn==2.5.5 -q -U
!pip install datasets==2.17.1 -q -U
!pip install scipy==1.12.0 -q -U
!pip install trl==0.7.11 -q -U
!pip install hf_transfer==0.1.5 -q -U
!pip install huggingface_hub==0.20.3 -q -U
# NOTE(review): the installs below are unpinned (or come from git) and name packages
# that were pinned above, so they may replace those versions (e.g. `flash-attn --upgrade`,
# trl from GitHub) — confirm which versions the notebook is meant to run with.
!pip install -q diffusers transformers accelerate peft bitsandbytes
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install flash-attn --upgrade
!pip install "unsloth[cu121] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps packaging ninja einops flash-attn xformers git+https://github.com/huggingface/trl.git peft accelerate bitsandbytes -q

In [None]:
# NOTE(review): a `!` command runs in its own subshell, so this activation does not
# carry over to later cells — confirm whether the virtualenv is needed at all.
!source /content/trelisEnv/bin/activate

In [None]:
# Download the MathCoder repository; the JSONL files read below live under MathCoder/data/.
!git clone https://github.com/mathllm/MathCoder

#### Log in to Hugging Face

In [None]:
# Interactive Hugging Face login widget.
# NOTE(review): presumably required to download the gated Llama-2 weights loaded later — confirm.
from huggingface_hub import notebook_login
notebook_login()

#### Import libs

In [None]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig, TextStreamer, Trainer
from trl import SFTTrainer
from unsloth import FastLanguageModel
from peft import PeftModel
from datasets import Dataset, load_dataset
from dataclasses import dataclass, field

from peft import prepare_model_for_kbit_training
# `model` is not defined until the "Model init" cell, so calling the two lines below
# from this import cell raised a NameError on a fresh kernel and skipped every import
# that follows. Run them only after the model has been loaded:
#   model.gradient_checkpointing_enable()           # saves VRAM
#   model = prepare_model_for_kbit_training(model)  # only when using quantization
from peft import LoraConfig, get_peft_model

import gc  # import Python's garbage collection module
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
import json
from typing import Dict, Optional

### Prepare Datasets
Read the data from JSON Lines files (one JSON record per line).

The train and test splits are loaded from separate, pre-split files.

In [None]:
def transform_line(line):
    """Convert one JSONL record into a list of chat messages.

    Args:
        line: A JSON string with a "messages" list and, optionally, a
            "ground_truth" object holding "solution" and "answer".

    Returns:
        A list of ``{"role", "content"}`` dicts: one "user" message per text
        part of every user message (in order), followed by a single
        "assistant" message built from the ground truth when it is present.
    """
    record = json.loads(line)

    # Keep only the text parts of user turns; other roles and part types are dropped.
    chat = [
        {"role": "user", "content": part["content"]}
        for msg in record["messages"]
        if msg["role"] == "user"
        for part in msg["content"]
        if part["type"] == "text"
    ]

    if "ground_truth" in record:
        chat.append({
            "role": "assistant",
            "content": f"{record['ground_truth']['solution']} Answer: {record['ground_truth']['answer']}",
        })
    return chat

##### Train data

In [None]:
# Parse the training split: every line of the JSONL file becomes one chat
# dialogue (a list of role/content messages) via transform_line.
input_path = '/content/MathCoder/data/MATH_train_post.jsonl'   # Path to the input JSON file
with open(input_path, 'r', encoding='utf-8') as infile:
    dataset = [transform_line(raw_line) for raw_line in infile]

##### Test data

In [4]:
# Parse the test split the same way as the training split: one chat dialogue
# per line of the JSONL file.
input_path = '/content/MathCoder/data/MATH_test_post.jsonl'   # Path to the input JSON file
with open(input_path, 'r', encoding='utf-8') as infile:
    test_dataset = [transform_line(raw_line) for raw_line in infile]

##### Each dataset length

In [5]:
# Number of dialogues loaded for the train and test splits.
len(dataset), len(test_dataset)

(7500, 5000)

### Tune Model
Llama2

### Model init

In [5]:
# Directory passed as `cache_dir` when loading the model; the training log file is also written under it.
cache_dir='cache'
# Hugging Face Hub id of the chat model to fine-tune.
model_id = "meta-llama/Llama-2-7b-chat-hf"

In [24]:
# 4-bit NF4 quantization with double quantization; compute is done in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # config=config,
    quantization_config=bnb_config,
    # rope_scaling={"type": "linear", "factor": 2.0},
    device_map='auto',
    # trust_remote_code=False,
    torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2", # works with Llama models and reduces memory reqs
    cache_dir=cache_dir)

# Training preparation has to happen here, once `model` exists: running these two
# calls before the model is loaded raises a NameError.
model.gradient_checkpointing_enable()  # trades compute for VRAM
model = prepare_model_for_kbit_training(model)  # PEFT helper; only needed because the model is quantized

tokenizer = AutoTokenizer.from_pretrained(model_id,use_fast=True,trust_remote_code=True)



config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

### Tuning

#### Preparation

In [26]:
def print_trainable_parameters(model):
    """Print a summary of trainable vs. frozen parameter counts.

    Args:
        model: Any object whose ``named_parameters()`` yields ``(name, param)``
            pairs where ``param`` has ``numel()`` and ``requires_grad``.

    Prints the trainable, non-trainable and total element counts plus the
    trainable percentage. Raises ZeroDivisionError if the model has no parameters.
    """
    # Tally element counts keyed by whether the parameter requires gradients.
    tally = {True: 0, False: 0}
    for _, param in model.named_parameters():
        tally[bool(param.requires_grad)] += param.numel()

    trainable = tally[True]
    frozen = tally[False]
    total = trainable + frozen

    summary_lines = [
        "",
        "Summary:",
        f"  Trainable params: {trainable}",
        f"  Non-Trainable params: {frozen}",
        f"  All params: {total}",
        f"  Trainable%: {100 * trainable / total}",
    ]
    print("\n".join(summary_lines))

In [27]:
# LoRA adapter configuration: rank-4 adapters on the four attention projections,
# with no bias terms trained.
peft_config = LoraConfig( #matching the Llama recipe
    r=4,
    lora_alpha=32,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
# NOTE(review): this rebinds `model`, so re-running the cell wraps the already
# wrapped model again — restart from the model-loading cell instead.
model = get_peft_model(model, peft_config) #move to a peft model

In [28]:
# Report how many parameters require gradients after adding the LoRA adapters.
print_trainable_parameters(model)


Summary:
  Trainable params: 4194304
  Non-Trainable params: 3500412928
  All params: 3504607232
  Trainable%: 0.11967971650867153


In [31]:
# Toy conversation used to inspect the prompt format produced by the tokenizer's chat template.
messages=[
    { 'role': 'user', 'content': "write a quick sort algorithm in python."},
    { 'role': 'assistant', 'content': "here you are."},
    { 'role': 'user', 'content': "great."},
]

# tokenize=False returns the rendered prompt as a string rather than token ids.
inputs = tokenizer.apply_chat_template(messages, tokenize=False)
print(inputs)

<s>[INST] write a quick sort algorithm in python. [/INST] here you are. </s><s>[INST] great. [/INST]


In [32]:
# Check whether the tokenizer ships with a padding token.
print(tokenizer.pad_token)

None


In [33]:
# Choose a padding token: take the first candidate that already exists in the
# vocabulary, and fall back to the EOS token when none of them does.
vocab = tokenizer.get_vocab()
pad_candidates = (
    ('<pad>', '<pad> token is in the tokenizer. Using <pad> for pad'),
    ('<|pad|>', '<|pad|> token is in the tokenizer. Using <|pad|> for pad'),
    ('<unk>', '<unk> token is in the tokenizer. Using unk for pad'),
)

for candidate, notice in pad_candidates:
    if candidate in vocab:
        print(notice)
        tokenizer.pad_token = candidate
        break
else:
    # No dedicated candidate found in the vocabulary.
    print(f'Using EOS token, {tokenizer.eos_token}, for padding. WARNING, this may not be ideal for chat fine-tuning models.')
    tokenizer.pad_token = tokenizer.eos_token

<unk> token is in the tokenizer. Using unk for pad


In [35]:
# Keep the model's padding id in sync with the tokenizer, both on the model
# object and on its config.
model.pad_token_id = tokenizer.pad_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Check if they are equal (model attribute and config)
assert model.pad_token_id == tokenizer.pad_token_id, "The model's pad token ID does not match the tokenizer's pad token ID!"
assert model.config.pad_token_id == tokenizer.pad_token_id, "The model config's pad token ID does not match the tokenizer's pad token ID!"

# Print the pad token ids
print('Tokenizer pad token ID:', tokenizer.pad_token_id)
print('Model pad token ID:', model.pad_token_id)
print('Model config pad token ID:', model.config.pad_token_id)
# len(tokenizer) includes added tokens; tokenizer.vocab_size reports only the base
# vocabulary and would hide any token added for padding.
print('Number of tokens now in tokenizer:', len(tokenizer))

Tokenizer pad token ID: 0
Model pad token ID: 0
Model config pad token ID: 0
Number of tokens now in tokenizer: 32000


In [36]:
# Show the special tokens after the pad token has been assigned.
print("Special tokens map:", tokenizer.special_tokens_map)

Special tokens map: {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}


In [37]:
# Preview the first training dialogue rendered as plain text with the chat template.
tokenizer.apply_chat_template(dataset[0], tokenize=False, add_generation_prompt=True)

'<s>[INST] The points $P,$ $Q,$ and $R$ are represented by the complex numbers $z,$ $(1 + i) z,$ and $2 \\overline{z},$ respectively, where $|z| = 1.$  When $P,$ $Q$, and $R$ are not collinear, let $S$ be the fourth vertex of the parallelogram $PQSR.$  What is the maximum distance between $S$ and the origin of the complex plane? [/INST] Let $w$ be the complex number corresponding to the point $S.$  Since $PQSR$ is a parallelogram,\n\\[w = (1 + i) z + 2 \\overline{z} - z,\\]so $w = 2 \\overline{z} + iz.$  Then $\\overline{w} = 2z - i \\overline{z},$ so\n\\begin{align*}\n|w|^2 &= w \\overline{w} \\\\\n&= (2 \\overline{z} + iz)(2z - i \\overline{z}) \\\\\n&= 4 z \\overline{z} + 2iz^2 - 2i \\overline{z}^2 + z \\overline{z} \\\\\n&= 5|z|^2 + 2i (z^2 - \\overline{z}^2) \\\\\n&= 2i (z^2 - \\overline{z}^2) + 5.\n\\end{align*}Let $z = x + yi,$ where $x$ and $y$ are real numbers.  Since $|z| = 1,$ $x^2 + y^2 = 1.$  Also,\n\\begin{align*}\n2i (z^2 - \\overline{z}^2) &= 2i ((x + yi)^2 - (x - yi)^2

In [38]:
# Settings for building the SFT text datasets. `context_length` has to be defined
# before it is used below; relying on a later cell for it raises a NameError on a
# fresh kernel.
context_length = 2048  # also used as the trainer's max sequence length
n_samples = 100        # rows kept per split after shuffling
sample_seed = 42       # fixed seed so the same subset is drawn on every run


def _render_dialogues(dialogues):
    """Render each dialogue (list of role/content messages) to one prompt string.

    No generation prompt is requested: these are complete training examples that
    already end with the assistant turn.
    """
    return [
        tokenizer.apply_chat_template(dialogue, tokenize=False, add_generation_prompt=False)
        for dialogue in dialogues
    ]


def _filter_and_sample(texts, max_chars, n_rows, seed):
    """Return a shuffled DataFrame (single column 0) of at most ``n_rows`` texts.

    Texts are kept only when shorter than ``max_chars`` characters. Note this is a
    character count, not a token count, so it is only a coarse length filter.
    """
    frame = pd.DataFrame(list(texts))
    frame = frame[frame[0].str.len() < max_chars]
    return frame.sample(frac=1, random_state=seed).reset_index(drop=True)[:n_rows]


# Training split
dataset_llm = _render_dialogues(dataset)
df_llm = _filter_and_sample(dataset_llm, context_length, n_samples, sample_seed)
dataset_final = Dataset.from_pandas(df_llm)

# Test split
test_dataset_llm = _render_dialogues(test_dataset)
test_df_llm = _filter_and_sample(test_dataset_llm, context_length, n_samples, sample_seed)
test_dataset_final = Dataset.from_pandas(test_df_llm)

In [39]:
# Define a stream
def stream(user_prompt, model_type, tokenizer, checkpoint=''):
    """Generate a greedy completion for ``user_prompt`` and stream it to stdout.

    Args:
        user_prompt: The question text; it is stripped and sent as a single user turn.
        model_type: 'base' to use the notebook-level ``model`` with any LoRA adapters
            switched off, or 'fine-tuned' to load the adapter stored at ``checkpoint``
            on top of ``model``.
        tokenizer: Tokenizer used for the chat template, encoding and streaming.
        checkpoint: Path of the adapter checkpoint (only used for 'fine-tuned').

    Raises:
        ValueError: If ``model_type`` is neither 'base' nor 'fine-tuned'.
    """
    from contextlib import nullcontext

    if model_type == 'base':
        eval_model = model
        # `model` may already be LoRA-wrapped (and trained); disable the adapters so
        # that "base" really reports the behaviour of the underlying base weights.
        adapters_off = model.disable_adapter() if isinstance(model, PeftModel) else nullcontext()
    elif model_type == 'fine-tuned':
        # NOTE(review): `model` may itself already be a PeftModel here, so this loads
        # an adapter on top of an adapted model — confirm that is intended.
        eval_model = PeftModel.from_pretrained(model, checkpoint)  # Assuming PeftModel is the intended class
        eval_model = eval_model.to("cuda")
        print('PRETRAINED')
        for n, p in eval_model.named_parameters():
            if p.device.type == "cpu":
                print(f"{n} is on cpu!")
        adapters_off = nullcontext()
    else:
        # Raise instead of exit(): exit() tears down the notebook kernel session.
        raise ValueError('You must set the model_type to base or fine-tuned')

    eval_model.config.use_cache = True
    messages=[
        { 'role': 'user', 'content': f"{user_prompt.strip()}"},
    ]

    # Render the prompt as text first, then tokenize without adding special tokens again.
    inputs = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([inputs], return_tensors="pt", add_special_tokens=False).to("cuda")

    if "token_type_ids" in inputs:
        del inputs["token_type_ids"]

    streamer = TextStreamer(tokenizer)
    with adapters_off:
        _ = eval_model.generate(**inputs, streamer=streamer, max_new_tokens=400, do_sample=False, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)

    # Clear GPU cache and run garbage collection
    torch.cuda.empty_cache()  # Clear GPU cache
    gc.collect()  # Run garbage collection


def evaluation(model_type, tokenizer, checkpoint='', questions=None, answers=None):
    """Stream the model's answer to each question and print the reference answer.

    Args:
        model_type: 'base' or 'fine-tuned'; forwarded to ``stream``.
        tokenizer: Tokenizer forwarded to ``stream``.
        checkpoint: Adapter checkpoint path forwarded to ``stream``.
        questions: Optional list of question strings. When omitted (together with
            ``answers``) a built-in set of five MATH problems is used.
        answers: Optional list of reference answers, parallel to ``questions``.

    Raises:
        ValueError: If only one of ``questions`` / ``answers`` is given, or if
            their lengths differ.
    """
    if (questions is None) != (answers is None):
        raise ValueError("questions and answers must be provided together")

    if questions is None:
        # Built-in sample problems and their reference solutions.
        questions = ['Find the minimum value of $\\frac{9x^2\\sin^2 x + 4}{x\\sin x}$ for $0 < x < \\pi.$',
                    'All 50 states as well as the District of Columbia and Puerto Rico, have distinct two-letter postal abbreviations. If a two-letter sequence of letters (such as CO or EE) is chosen at random, what is the probability that it is a postal abbreviation for one of the 50 states, the District of Columbia, or Puerto Rico? Express your answer as a common fraction.',
                    'Let $a$ and $b$ be real numbers such that $a > b > 0.$  Determine the minimum value of\n\\[a + \\frac{1}{b(a - b)}.\\]',
                    'The quadratic $3x^2-24x+72$ can be written in the form $a(x+b)^2+c$, where $a$, $b$, and $c$ are constants. What is $a+b+c$?',
                    'In how many ways can 5 different keys be placed on a keychain?  (Two arrangements are considered the same if one can be rotated or flipped to be identical to the other.)'
                    ]

        answers = ['The given expression can be rewritten as \\[9x\\sin x + \\frac{4}{x \\sin x}\\]or $9y + \\frac{4}{y},$ where $y = x \\sin x.$ By the AM-GM inequality, we have \\[\\frac{9y + \\frac{4}{y}}{2} \\ge \\sqrt{9y \\cdot \\frac{4}{y}} = 6,\\]so $9y + \\frac{4}{y} \\ge \\boxed{12},$ which is the answer.\n\nTo see that the minimum is achievable, recall that equality in AM-GM holds when all the terms are equal. Therefore, we want $9y = \\tfrac{4}{y},$ or $y = x \\sin x = \\tfrac{2}{3}.$ Since $x \\sin x$ is a continuous function of $x,$ and $0 \\sin 0 = 0 < \\tfrac{2}{3}$ while $\\tfrac{\\pi}{2} \\sin \\tfrac{\\pi}{2} = \\tfrac{\\pi}{2} > \\tfrac{2}{3},$ the equation $x \\sin x = \\tfrac{2}{3}$ must have a solution in the given interval. Therefore, equality holds for some value of $x.$ Answer: 12',
                  'There are $26\\cdot 26$ possible two-letter sequences of letters, since we have 26 choices for the first and then 26 choices for the second.  But only 52 of these possibilities are valid, so our answer is $\\frac{52}{26\\cdot 26} =\\boxed{ \\frac{1}{13}}$. Answer: \\frac{1}{13}',
                  'We can write\n\\[a + \\frac{1}{b(a - b)} = (a - b) + b + \\frac{1}{b(a - b)}.\\]By AM-GM,\n\\[(a - b) + b + \\frac{1}{b(a - b)} \\ge 3 \\sqrt[3]{(a - b)b \\cdot \\frac{1}{b(a - b)}} = 3.\\]Equality occurs when $a = 2$ and $b = 1,$ so the minimum value is $\\boxed{3}.$ Answer: 3',
                  'We complete the square.\n\nFactoring $3$ out of the quadratic and linear terms gives $3x^2 - 24x = 3(x^2 - 8x)$.\n\nSince $(x-4)^2 = x^2 - 8x + 16$, we can write $$3(x-4)^2 = 3x^2 - 24x + 48.$$This quadratic agrees with the given $3x^2-24x+72$ in all but the constant term. We can write\n\n\\begin{align*}\n3x^2 - 24x + 72 &= (3x^2 - 24x + 48) + 24 \\\\\n&= 3(x-4)^2 + 24.\n\\end{align*}Therefore, $a=3$, $b=-4$, $c=24$, and $a+b+c = 3-4+24 = \\boxed{23}$. Answer: 23',
                  'There are $5!$ ways to place the keys on the keychain, but we must divide by 5 for rotational symmetry (5 rotations for each arrangement), and by 2 for reflectional symmetry (we can flip the keychain to get the same arrangement).  The answer is $\\dfrac{5!}{5 \\times 2} = \\boxed{12}$. Answer: 12'
                  ]

    # zip() would silently drop unmatched items, so reject mismatched lists up front.
    if len(questions) != len(answers):
        raise ValueError(
            f"questions and answers must have the same length (got {len(questions)} and {len(answers)})"
        )

    for question, answer in zip(questions, answers):
        stream(question, model_type, tokenizer, checkpoint)
        print("Correct Answer:", answer)
        print('\n\n')

In [40]:
# Draw 10 random test dialogues (with replacement) and split each into its
# question (first message) and reference answer (second message).
questions, answers = [], []

for i in range(10):
    rand_idx = np.random.randint(len(test_dataset))
    sampled_dialogue = test_dataset[rand_idx]
    questions.append(sampled_dialogue[0]['content'])
    answers.append(sampled_dialogue[1]['content'])

In [41]:
# Baseline: stream answers to the sample questions before any training has happened.
evaluation("base", tokenizer)



<s> [INST] Find the minimum value of $\frac{9x^2\sin^2 x + 4}{x\sin x}$ for $0 < x < \pi.$ [/INST]  To find the minimum value of the given function, we need to find the critical point(s) and determine the corresponding minimum value(s).
Unterscheidung between the function and its first and second derivatives at the critical point(s) will help us determine the minimum value.

First, let's find the derivative of the function:

$$\frac{d}{dx} \frac{9x^2\sin^2 x + 4}{x\sin x} = \frac{18x\sin x - 4\cos^2 x}{x\sin x}$$

Now, let's find the critical points by setting the derivative to $0$:

$$18x\sin x - 4\cos^2 x = 0$$

Solving for $x$, we get:

$$x = \frac{4\cos^2 \theta}{18}$$

where $\theta$ is a constant.

There are two critical points:

1. $x = 0$, which is a minimum point.
2. $x = \frac{4\cos^2 \theta}{18}$, which is a saddle point.

To determine the minimum value, we need to find the minimum value of the function at each critical point.

At the minimum point $x = 0$, we have:

$$\frac



[/INST]  To calculate the probability that a randomly chosen two-letter sequence is a postal abbreviation for one of the 50 states, the District of Columbia, or Puerto Rico, we need to count the number of possible abbreviations for each of these entities and divide by the total number of possible two-letter sequences.

There are 50 states in the United States, each with a unique two-letter postal abbreviation:

Alabama - AL
Alaska - AK
Arizona - AZ
Arkansas - AR
California - CA
Colorado - CO
Connecticut - CT
Delaware - DE
Florida - FL
Georgia - GA
Hawaii - HI
Idaho - ID
Illinois - IL
Indiana - IN
Iowa - IA
Kansas - KS
Kentucky - KY
Louisiana - LA
Maine - ME
Maryland - MD
Massachusetts - MA
Michigan - MI
Minnesota - MN
Mississippi - MS
Missouri - MO
Nebraska - NE
Nevada - NV
New Hampshire - NH
New Jersey - NJ
New Mexico - NM
New York - NY
North Carolina - NC
North Dakota - ND
Ohio - OH
Oklahoma - OK
Oregon - OR
Pennsylvania - PA
Rhode Island - RI
South Carolina - SC
South Dakota - SD
Te

#### Tuning

In [None]:
# Short model name: the part of model_id after the last "/"; used in the output path.
model_name = model_id.split("/")[-1]

# Training hyper-parameters.
epochs=1
context_length = 2048
grad_accum=1
batch_size=1
fine_tune_tag='math'
# Output directory for checkpoints; its name encodes the run settings above.
save_dir = f'./results/{model_name}_{epochs}_epochs_{context_length}_length-{fine_tune_tag}'
print(save_dir)

In [None]:
# Custom callback to log metrics
class LoggingCallback(transformers.TrainerCallback):
    """Append training/eval losses to a text file and dump trainable weights.

    On every log event the reported 'loss' and 'eval_loss' values are appended to
    ``log_file_path``. When the current global step is a multiple of
    ``args.save_steps`` the parameters that require gradients are also saved as
    ``trainable_params.bin`` inside the checkpoint directory.
    """

    def __init__(self, log_file_path):
        self.log_file_path = log_file_path
        self.save_dir = save_dir  # notebook-level output directory; not used by the methods below
        # Create the log directory up front so the first append cannot fail on a missing folder.
        log_dir = os.path.dirname(log_file_path)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
        """Write the logged losses and, at save steps, the trainable parameters."""
        logs = logs or {}  # `logs` defaults to None; treat that as "nothing to record"
        with open(self.log_file_path, 'a') as f:
            if 'loss' in logs:
                f.write(f"Step: {state.global_step}, Training Loss: {logs['loss']}\n")
            if 'eval_loss' in logs:
                f.write(f"Step: {state.global_step}, Eval Loss: {logs['eval_loss']}\n")
            f.flush()  # Force flush the buffered data to file

        # Without a model there is nothing to dump.
        if model is None:
            return

        # int() of a fractional save_steps is 0; skip the dump rather than divide by zero.
        save_every = int(args.save_steps)
        if save_every <= 0 or state.global_step % save_every != 0:
            return

        # Prefer the recorded best checkpoint; otherwise build the path for the current step.
        checkpoint_dir = state.best_model_checkpoint or os.path.join(
            args.output_dir, f"checkpoint-{state.global_step}"
        )

        # Ensure the checkpoint directory exists
        os.makedirs(checkpoint_dir, exist_ok=True)

        # Save trainable params in the checkpoint directory
        trainable_state = {n: p.data for n, p in model.named_parameters() if p.requires_grad}
        torch.save(trainable_state, os.path.join(checkpoint_dir, "trainable_params.bin"))

# Log file path
log_file_path = os.path.join(cache_dir, "training_logs.txt")

# Create an instance of the custom callback
logging_callback = LoggingCallback(log_file_path)

In [None]:
# Training hyper-parameters, built separately from the trainer wiring.
training_args = transformers.TrainingArguments(
    output_dir=save_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=grad_accum,
    # Optimisation
    optim="adamw_torch",  # alternative: "paged_adamw_8bit"
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    max_grad_norm=0.3,
    # Mixed precision is left at the default; options: fp16=True (non-Ampere GPUs), bf16=True (Ampere GPUs)
    # Evaluation, checkpointing and logging cadence
    evaluation_strategy="steps",
    do_eval=True,
    eval_steps=0.2,
    save_steps=50,
    logging_steps=1,
    log_level="debug",
    hub_private_repo=True,
)

# `model` is already a PEFT model, so no peft_config is passed here
# (pass peft_config=peft_config instead when handing over a plain model).
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset_final,
    eval_dataset=test_dataset_final,
    dataset_text_field="0",
    max_seq_length=context_length,
    args=training_args,
    callbacks=[logging_callback],
)

In [None]:
# Turn the generation cache off before training (the streaming helper switches it on for generation).
model.config.use_cache = False
trainer.train()

In [None]:
# Split the trainer's log history into training and evaluation series.
history = trainer.state.log_history
train_entries = [entry for entry in history if 'loss' in entry]
eval_entries = [entry for entry in history if 'eval_loss' in entry]

train_losses = [entry['loss'] for entry in train_entries]
train_steps = [entry['step'] for entry in train_entries]
eval_losses = [entry['eval_loss'] for entry in eval_entries]
eval_steps = [entry['step'] for entry in eval_entries]

# Plot both loss curves against the global step
plt.plot(train_steps, train_losses, label='Train Loss')
plt.plot(eval_steps, eval_losses, label='Eval Loss')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.legend()
plt.show()

#### Evaluation

In [None]:
# NOTE(review): by this point `model` is the LoRA-wrapped model that was just trained, so
# "base" output includes the adapters unless stream() disables them — confirm what is compared.
evaluation("base", tokenizer)

In [None]:
# Evaluate the adapter checkpoint written by this run. The path is derived from
# `save_dir` so it always matches the training configuration; the previous
# hard-coded path used the tag "mathproblems", which differs from fine_tune_tag.
# With 100 training rows, batch size 1 and save_steps=50, checkpoint-100 is
# presumably the final checkpoint — adjust the step if the settings change.
evaluation("fine-tuned", tokenizer, checkpoint=os.path.join(save_dir, 'checkpoint-100'))

In [None]:
# NOTE(review): this path ("/content/content/...", tag "mathproblems", checkpoint-400) does not
# match the save_dir built in the training configuration — presumably left over from another run; verify.
evaluation("fine-tuned", tokenizer, checkpoint='/content/content/results/Llama-2-7b-chat-hf_1_epochs_2048_length-mathproblems/checkpoint-400')

### Load Tuned Model

In [None]:
# NOTE(review): file.zip is not created anywhere in this notebook — presumably uploaded manually; confirm.
!unzip file.zip

In [None]:

# Settings passed to FastLanguageModel.from_pretrained in the next cell.
max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True

In [None]:
# Load a model and tokenizer through Unsloth.
# NOTE(review): this rebinds `model` and `tokenizer`, replacing the Llama-2 objects used above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "larryvrh/Yi-34B-200K-Llamafied", # Choose ANY! eg mistralai/Mistral-7B-Instruct-v0.2
    max_seq_length = max_seq_length,
    attn_implementation="flash_attention_2",
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [None]:
# Attach rank-16 LoRA adapters to the attention projections of the Unsloth-loaded model.
# NOTE(review): this rebinds `model`; re-running the cell would wrap it again.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_alpha = 32,
    lora_dropout = 0, # Currently only supports dropout = 0
    bias = "none",    # Currently only supports bias = "none"
    use_gradient_checkpointing = True,
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)