### Fine-tuning a 34B CodeLlama model with SWE-bench_Lite

In [28]:
%%capture
import torch
# major_version, minor_version = torch.cuda.get_device_capability()
# # pinning to bb81079ca1dba43fc2cdb79a81ce6edf23f87907 because master breaks
# !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git@bb81079ca1dba43fc2cdb79a81ce6edf23f87907"
# if major_version >= 8:
#     # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
#     !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
# else:
#     # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
#     !pip install --no-deps xformers trl peft accelerate bitsandbytes
# pass
# %%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers trl peft accelerate bitsandbytes

In [29]:
# Load the base model and its tokenizer through Unsloth.
# NOTE(review): the notebook title and the Hub repo name used when saving refer to
# CodeLlama-34b, but the active checkpoint below is llama-3-8b — confirm which is intended.
from unsloth import FastLanguageModel
import torch
max_seq_length = 16384 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    # model_name = "unsloth/codellama-34b-bnb-4bit", # "codellama/CodeLlama-34b-hf" for 16bit loading
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length, # the same value is passed again to the LoRA and trainer cells
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# Wrap the loaded model with LoRA adapters. The settings are collected in one
# mapping first so they can be inspected or tweaked before the call.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
lora_settings = dict(
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = lora_target_modules,
    lora_alpha = 16,
    lora_dropout = 0, # Currently only supports dropout = 0
    bias = "none",    # Currently only supports bias = "none"
    use_gradient_checkpointing = True,
    random_state = 3407,
    max_seq_length = max_seq_length,
)
model = FastLanguageModel.get_peft_model(model, **lora_settings)

<a name="Data"></a>
### Data Prep
We are now going to extend the original SWE-bench_Lite dataset.
Each example gains more context: the path and the contents of the file that its patch modifies.

In [None]:
%%capture
!pip install PyGithub python-dotenv
!pip install datasets,trl,transformers

In [None]:
import os
import re
from datasets import load_dataset
from dotenv import load_dotenv
from github import Github
from github import Auth
from google.colab import userdata
from trl import SFTTrainer
from transformers import TrainingArguments
# userdata.get('GITHUB_TOKEN')

# os.environ['GITHUB_TOKEN'] = userdata.get('GITHUB_TOKEN')
# os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

Quantitative finance evaluation benchmarks and datasets (temporary list):
1. Sentiment Analysis - FPB
2. Text Classification - FedNLP
3. Named Entity Recognition - FIN
4. Question Answering - FinQA-QA
5. Stock Movement Prediction - StockNet
6. Text Summarization - ECT-sum
7. ESG classification - ESG

TBC: advanced topics; more datasets


### 1. Sentiment Analysis




FPB: Financial PhraseBank (https://huggingface.co/datasets/takala/financial_phrasebank/blob/main/README.md).
We use the 75% annotator-agreement configuration (`sentences_75agree`).

In [None]:
# Clone the AI4Fin repository into the current working directory.
!git clone https://github.com/LOV8D/AI4Fin.git

In [20]:
# Financial PhraseBank, 75%-agreement configuration; the first 80% of the
# train split is kept for training.
# trust_remote_code=True is passed explicitly: the loader warns that it will be
# mandatory for this dataset from the next major release of `datasets`.
fpb_train = load_dataset(
    "takala/financial_phrasebank",
    'sentences_75agree',
    split='train[:80%]',
    trust_remote_code=True,
)
fpb_train

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Dataset({
    features: ['sentence', 'label'],
    num_rows: 2762
})

In [None]:
# The FPB split only has "sentence" and "label" columns, so the "text" column
# that SFTTrainer is told to read has to be built first.
fpb_label_feature = fpb_train.features["label"]
# Use the label feature's int -> name mapping when it has one; otherwise fall
# back to the plain string form of the label value.
fpb_label_to_str = getattr(fpb_label_feature, "int2str", str)

def format_fpb_example(example):
    """Render one FPB row as a 'Sentence / Sentiment' training string ending in EOS."""
    sentiment = fpb_label_to_str(example["label"])
    text = f"Sentence: {example['sentence']}\nSentiment: {sentiment}"
    return {"text": text + tokenizer.eos_token}

fpb_train_text = fpb_train.map(format_fpb_example)

# `is_bfloat16_supported` was never imported in this notebook; query torch
# directly, as the SWE-bench training cell does.
use_bf16 = torch.cuda.is_bf16_supported()

trainer = SFTTrainer(
    model = model,
    train_dataset = fpb_train_text,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    tokenizer = tokenizer,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        max_steps = 60,
        fp16 = not use_bf16,
        bf16 = use_bf16,
        logging_steps = 1,
        output_dir = "outputs",
        optim = "adamw_8bit",
        seed = 3407,
    ),
)
trainer.train()

In [None]:
# Load the SWE-bench_Lite dev split. The following cells read and extend
# `dataset`, so this load must actually run on a fresh kernel.
dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split = "dev")

In [None]:
# Report how many rows the split has, then list the field names of one row.
print(f'Number of examples in the training set: {len(dataset)}')
print('Keys of the dataset:')
sample_row = dataset[2]
for bullet in map('- {}'.format, sample_row.keys()):
    print(bullet)

Number of examples in the training set: 23
Keys of the dataset:
- repo
- instance_id
- base_commit
- patch
- test_patch
- problem_statement
- hints_text
- created_at
- version
- FAIL_TO_PASS
- PASS_TO_PASS
- environment_setup_commit


In [None]:
# Distinct repository names referenced by the dataset rows.
repo_names = {instance['repo'] for instance in dataset}

## Utility functions to handle GitHub repos

In [None]:
def gh_repo_cloner(repo_names):
    """Look up each named repository through the GitHub API.

    Despite the name, nothing is cloned to disk: each name is resolved with
    ``Github.get_repo`` and the returned repository object is kept.

    Args:
        repo_names: Iterable of repository names to pass to ``Github.get_repo``.

    Returns:
        A dict mapping each repository name to the object returned by
        ``get_repo``, or to ``None`` when that lookup failed. Returns ``None``
        (after printing an error) when the ``GITHUB_TOKEN`` environment
        variable is unset or empty.
    """
    access_token = os.environ.get('GITHUB_TOKEN')
    if not access_token:
        print("Error: GITHUB_TOKEN environment variable is not set.")
        return None

    g = Github(auth=Auth.Token(access_token))

    git_repos = {}
    for repo_name in repo_names:
        # A failed lookup raises instead of returning a falsy value, so it is
        # caught here; one bad name must not abort the remaining lookups.
        try:
            repo = g.get_repo(repo_name)
        except Exception as e:
            print(f'Failed to clone the repository {repo_name}: {e}')
            repo = None
        else:
            if not repo:
                print(f'Failed to clone the repository {repo_name}')
                repo = None
        git_repos[repo_name] = repo

    return git_repos

def get_filepath(patch):
    """Extract the path of the first file touched by a git patch.

    The path is taken from the first ``diff --git a/<path> b/<path>`` header
    found anywhere in the patch, so leading preamble lines and paths that
    contain spaces are handled. If no such header exists, the first line is
    searched for an ``a/...`` or ``b/...`` token followed by a space.

    Args:
        patch: Patch text, or ``None``.

    Returns:
        The file path without its ``a/`` / ``b/`` prefix, or ``None`` when the
        patch is empty or no path can be found.
    """
    if not patch:
        return None

    lines = patch.split('\n')

    header_pattern = re.compile(r'^diff --git a/(.+?) b/(.+)$')
    for line in lines:
        # Strip a trailing carriage return so CRLF patches match too.
        header = header_pattern.match(line.rstrip('\r'))
        if header:
            return header.group(1)

    # Fallback heuristic on the first line only.
    match = re.search(r'(a/.*?|b/.*)(?= )', lines[0])
    if match:
        # Both alternatives start with a two-character "a/" or "b/" prefix.
        return match.group(1)[2:]

    return None

def get_file_content(repo, base_commit, file_path):
    """Fetch the text of ``file_path`` as it exists at ``base_commit``.

    Args:
        repo: Repository object exposing ``get_commit`` and ``get_contents``
            (the values stored by ``gh_repo_cloner``).
        base_commit: Commit identifier passed to ``repo.get_commit``.
        file_path: Path of the file inside the repository.

    Returns:
        The decoded file content as a string, or ``None`` when the repository
        or path is missing or when any lookup fails (the error is printed).
    """
    if repo is None or not file_path:
        print(f"Error retrieving file content: missing repository or file path ({file_path!r})")
        return None

    try:
        # Resolving the commit is inside the try so that an unknown commit is
        # reported the same way as an unknown file.
        commit = repo.get_commit(base_commit)
        raw_content = repo.get_contents(file_path, ref=commit.commit.sha).decoded_content
        return raw_content.decode()
    except Exception as e:
        print(f"Error retrieving file content: {e}")
        return None

### Reference all github repos in the dataset

In [None]:
# Resolve every repository name collected from the dataset; the resulting
# mapping is read by add_file_content_to_dataset.
git_repos = gh_repo_cloner(repo_names)

### Extended Dataset

In [None]:
def add_file_content_to_dataset(example):
    """Compute the extra ``file_path`` and ``file_content`` columns for one example.

    The path comes from the example's patch; the content is fetched from the
    example's repository (looked up in the module-level ``git_repos`` mapping)
    at the example's base commit.
    """
    patched_path = get_filepath(example["patch"])
    repo_handle = git_repos[example["repo"]]
    content_at_base = get_file_content(repo_handle, example["base_commit"], patched_path)
    return {"file_path": patched_path, "file_content": content_at_base}

In [None]:
# Add the "file_path" and "file_content" columns to every example.
# Note: this rebinds `dataset` to the extended version.
dataset = dataset.map(add_file_content_to_dataset)

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

In [None]:
def filter_fn(example, max_chars=16000):
    """Keep an example only if its prompt ingredients are present and short enough.

    Args:
        example: Mapping with ``file_content``, ``file_path`` and
            ``problem_statement`` entries.
        max_chars: Exclusive upper bound on the combined character length of
            the three fields.

    Returns:
        ``True`` when all three fields are present and their combined length
        is below ``max_chars``. ``False`` otherwise, including when a field is
        ``None`` (for example because the file content could not be retrieved).
    """
    fields = (example["file_content"], example["file_path"], example["problem_statement"])
    if any(field is None for field in fields):
        return False
    return sum(len(field) for field in fields) < max_chars


In [None]:
# Drop the examples rejected by filter_fn (combined field length too large).
dataset = dataset.filter(filter_fn)

Filter:   0%|          | 0/23 [00:00<?, ? examples/s]

In [None]:
# Save the extended dataset to local disk, then copy it to Google Drive.
from google.colab import drive
import shutil

dataset_dir = "SWE-bench_Lite_Dev_Extended"
dataset.save_to_disk(dataset_dir) # save this

drive.mount('/content/drive')

# dirs_exist_ok=True lets this cell be re-run: without it copytree raises
# FileExistsError as soon as the destination folder already exists on Drive.
shutil.copytree(
    f'/content/{dataset_dir}',
    f'/content/drive/MyDrive/xcs224u/datasets/{dataset_dir}',
    dirs_exist_ok=True,
)

Saving the dataset (0/1 shards):   0%|          | 0/23 [00:00<?, ? examples/s]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'/content/drive/MyDrive/xcs224u/datasets/SWE-bench_Lite_Dev_Extended'

In [None]:
# Upload the extended dataset to the Hugging Face Hub under this repo id.
dataset.push_to_hub("ricardo-larosa/SWE-bench_Lite_Dev_Extended")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/ricardo-larosa/SWE-bench_Lite_Dev_Extended/commit/9ceac28778588c0ad0802eb1421f72722610e160', commit_message='Upload dataset', commit_description='', oid='9ceac28778588c0ad0802eb1421f72722610e160', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
#@title Alpaca prompting style
# Four slots, in order: issue text, file path, file content, target patch.
# Pass "" for the last slot at inference time so the model generates the patch.
alpaca_prompt = """Please generate a git patch that will solve the github issue described given the file_path and file_content:

issue:
{}

file_path: {}

file_content:
{}
git_patch:
{}"""

def formatting_prompts_func(example):
    """Build the training text for a batch of examples.

    Args:
        example: Batched mapping (column name -> list of values) with
            ``problem_statement``, ``file_path``, ``file_content`` and
            ``patch`` columns.

    Returns:
        A dict with a single ``text`` column: one fully formatted prompt per
        row, containing the issue, the file context and the reference patch,
        terminated by the tokenizer's EOS token.
    """
    issues = example["problem_statement"]
    filepaths = example["file_path"]
    filecontents = example["file_content"]
    patches = example["patch"]
    texts = []
    for issue, filename, filecontent, patch in zip(issues, filepaths, filecontents, patches):
        text = alpaca_prompt.format(issue, filename, filecontent, patch) + tokenizer.eos_token # Must add EOS_TOKEN
        texts.append(text)
    return { "text" : texts,}

dataset = dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [None]:
# Load the dev split of the "SWE-bench_Lite_bm25_13K" dataset.
# NOTE(review): this rebinds `dataset`, replacing the extended dataset built in
# the cells above — confirm that training on this dataset instead is intended.
dataset = load_dataset("princeton-nlp/SWE-bench_Lite_bm25_13K", split = "dev")

Downloading readme:   0%|          | 0.00/2.64k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/665k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.90M [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/23 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/300 [00:00<?, ? examples/s]

<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We run only 60 steps to speed things up; for a full run, set `num_train_epochs=1` and turn off the step limit by setting `max_steps=None`. We also support TRL's `DPOTrainer`!

In [None]:
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import TrainingArguments

# Completion-only collator keyed on the response template string.
response_template = "patch"
collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)

# Query bf16 support once and derive both precision flags from it.
bf16_available = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4,
    warmup_steps = 10,
    max_steps = 60,
    learning_rate = 2e-4,
    fp16 = not bf16_available,
    bf16 = bf16_available,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    data_collator = collator,
    max_seq_length = max_seq_length,
    args = training_args,
)

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

In [None]:
#@title Show current memory stats
def _bytes_to_gb(num_bytes):
    """Convert a byte count to GB (1024-based), rounded to 3 decimals."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)

gpu_stats = torch.cuda.get_device_properties(0)
# Baseline reserved memory before training; the final-stats cell subtracts it.
start_gpu_memory = _bytes_to_gb(torch.cuda.max_memory_reserved())
max_memory = _bytes_to_gb(gpu_stats.total_memory)
for line in (
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
):
    print(line)

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.564 GB.
38.945 GB of memory reserved.


In [None]:
# Run training; the returned object's `metrics` are read by the stats cell below.
trainer_stats = trainer.train()

In [None]:
#@title Show final memory and time stats
# Peak reserved memory in GB, and the share of it reserved since the baseline
# measured before training.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_seconds = trainer_stats.metrics['train_runtime']
report_lines = [
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]
for report_line in report_lines:
    print(report_line)

969.1464 seconds used for training.
16.15 minutes used for training.
Peak reserved memory = 32.34 GB.
Peak reserved memory for training = 14.299 GB.
Peak reserved memory % of max memory = 81.741 %.
Peak reserved memory for training % of max memory = 36.141 %.


<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

In [None]:
# Load the test split of SWE-bench_Lite for a qualitative inference check.
test_dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split = "test")

In [None]:
# Pick one test example to try the model on (row 11 is a hard-coded choice).
test_instance = test_dataset[11]

In [None]:
# Resolve only the repository that the selected test instance belongs to.
test_repo_names = {test_instance["repo"]}
test_git_repos = gh_repo_cloner(test_repo_names)
test_git_repo = test_git_repos[test_instance["repo"]]

In [None]:
# Issue text of the selected test instance; used as the model input below.
issue = test_instance["problem_statement"]

In [None]:
# Path of the file touched by the instance's reference patch (None if no path is found).
file_path = get_filepath(test_instance["patch"])

In [None]:
# Content of that file at the instance's base commit (None if retrieval fails).
file_content = get_file_content(test_git_repo, test_instance["base_commit"], file_path)

In [None]:
# Size of the retrieved file in characters.
print(len(file_content))

2201


In [None]:
# Show the retrieved file for manual inspection.
print(file_content)

In [None]:
# Generate a short continuation for the selected test issue.
# NOTE(review): only the raw issue text is tokenized here; the alpaca_prompt
# formatting (with file_path / file_content) is commented out — confirm whether
# inference is meant to use the same prompt layout as training.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    # alpaca_prompt.format(
    #     issue,
    #     file_path,
    #     file_content,
    #     "", # output - leaving this blank for generation
    # )
    issue
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

# Alternative kept for reference: prompt-formatted input with streamed output.
# FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# inputs = tokenizer(
# [
#     alpaca_prompt.format(
#         issue,
#         file_path,
#         file_content,
#         "", # output - leaving this blank for generation
#     )
# ], return_tensors = "pt").to("cuda")

# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer)
# _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 4096)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


["<s> Symbol instances have __dict__ since 1.7?\nIn version 1.6.2 Symbol instances had no `__dict__` attribute\r\n```python\r\n>>> sympy.Symbol('s').__dict__\r\n---------------------------------------------------------------------------\r\nAttributeError                            Traceback (most recent call last)\r\n<ipython-input-3-e2060d5eec73> in <module>\r\n----> 1 sympy.Symbol('s').__dict__\r\n\r\nAttributeError: 'Symbol' object has no attribute '__dict__'\r\n>>> sympy.Symbol('s').__slots__\r\n('name',)\r\n```\r\n\r\nThis changes in 1.7 where `sympy.Symbol('s').__dict__` now exists (and returns an empty dict)\r\nI may misinterpret this, but given the purpose of `__slots__`, I assume this is a bug, introduced because some parent class accidentally stopped defining `__slots__`.\n\n## Expected Behaviour\r\nSymbol instances should not have a `__dict__` attribute.\r\n\r\n## Observed Behaviour\r\nSymbol instances have a `__dict__` attribute.\r\n\r\n## Steps to Reproduce\r\n1. Create a 

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

To save to `GGUF` / `llama.cpp`, or for model merging, use `model.merge_and_unload` first, then save the model. Maxime Labonne's [llm-course](https://mlabonne.github.io/blog/posts/Quantize_Llama_2_models_using_ggml.html) has a nice tutorial on converting HF to GGUF! This [issue](https://github.com/ggerganov/llama.cpp/issues/3097) might be helpful for more info.

In [None]:
# model.save_pretrained("") # Local saving
# NOTE(review): the repo id says CodeLlama-34b, but the load cell's active
# checkpoint is llama-3-8b — confirm the name matches the model being pushed.
model.push_to_hub("ricardo-larosa/SWE_Lite_dev-CodeLlama-34b") # Online saving

adapter_model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Saved model to https://huggingface.co/ricardo-larosa/SWE_Lite_dev-CodeLlama-34b
