In [1]:
!which python

/Users/mesa/Coding Projects/LLM-Debug-Assistant/venv/bin/python


In [26]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

In [27]:
# Model from Hugging Face hub
base_model = "NousResearch/Llama-2-7b-chat-hf"

# New instruction dataset
stack_overflow_dataset = "Mxode/StackOverflow-QA-C-Language-40k"
# other datasets


# Fine-tuned model
new_model = "llama-2-7b-chat-stack-overflow"

In [28]:
dataset = load_dataset(stack_overflow_dataset, split="train")

Downloading readme:   0%|          | 0.00/893 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 29.9M/29.9M [00:02<00:00, 11.4MB/s]


Generating train split: 0 examples [00:00, ? examples/s]

In [29]:
# dataset currently has 'question' and 'answer' columns. We need to combine them into a single column named 'text'
dataset = dataset.map(
    lambda examples: {
        "text": "<s>[INST]" + examples["question"] + "[\INST] " + examples["answer"] + "</s>",
    }
)

Map:   0%|          | 0/40649 [00:00<?, ? examples/s]

In [30]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [31]:
#!pip install accelerate

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,
    #low_cpu_mem_usage=True,
    # quantization_config=quant_config,
    # device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [32]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [33]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [34]:
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

In [35]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_params,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)



Map:   0%|          | 0/40649 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [36]:
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)


('llama-2-7b-chat-stack-overflow/tokenizer_config.json',
 'llama-2-7b-chat-stack-overflow/special_tokens_map.json',
 'llama-2-7b-chat-stack-overflow/tokenizer.json')

Now lets compare asking 3 questions to our fine tuned model vs our base model

## fine tuned model

In [37]:
#logging.set_verbosity(logging.CRITICAL)

prompt = "How do I loop through an array in C?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] How do I loop through an array in C? [/INST]  In C, you can loop through an array using a `for` loop or a `while` loop. Here are the basic syntaxes for each:

For loop:
```
for (int i = 0; i < array_size; i++) {
    // loop body
}
```
In this syntax, `array_size` is the number of elements in the array, and `i` is the index of the current element.

While loop:
```
int i = 0;
while (i < array_size) {
    // loop body
    i++;
}
```
In this syntax, `i` is initialized to 0, and the loop continues until `i` is greater than or equal to `array_size`.

You can also use a `for_each` function in


In [38]:
prompt = "What does the 'sizeof' operator do in C?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] What does the 'sizeof' operator do in C? [/INST]  In C, the `sizeof` operator is used to determine the size of a variable or an object in memory.

The `sizeof` operator takes a single argument, which can be a variable, a type, or a constant expression. The operator returns the size of the object or variable in bytes.

Here are some examples of how to use the `sizeof` operator in C:

1. To get the size of a variable:
```
int x = 5;
printf("The size of x is %zu\n", sizeof(x)); // Output: The size of x is 4
```
In this example, `x` is an `int` variable with a size of 4 bytes.

2. To get the size of a type:
```
struct Person {
    int age;
    char name[


In [39]:
prompt = "I'm working on a C project in Visual Studio Code and I would like to use GNU formatting in the project. Is there any extension to achieve this or any way to go about it??"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] I'm working on a C project in Visual Studio Code and I would like to use GNU formatting in the project. Is there any extension to achieve this or any way to go about it?? [/INST]  Yes, there are several extensions available in Visual Studio Code that can help you format your code using GNU formatting rules. Here are a few options you can consider:

1. GNU Formatter Extension: This extension provides a simple way to format your code using GNU formatting rules. You can install it by searching for "GNU Formatter" in the Visual Studio Code Marketplace. Once installed, you can use the "GNU Formatter" command in the Command Palette (Ctrl + Shift + P on Windows or Command + Shift + P on Mac) to format your code.
2. Prettier: Prettier is a popular code formatter that supports a wide range of programming languages, including C. You can install


## base model

In [40]:

prompt = "How do I loop through an array in C?"
pipe = pipeline(task="text-generation", model=base_model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<s>[INST] How do I loop through an array in C? [/INST]  In C, you can loop through an array using a `for` loop or a `while` loop. Here are the basic syntaxes for each:

For loop:
```
for (int i = 0; i < array_size; i++) {
    // loop body
}
```
In this syntax, `array_size` is the number of elements in the array, and `i` is the index of the current element.

While loop:
```
int i = 0;
while (i < array_size) {
    // loop body
    i++;
}
```
In this syntax, `i` is initialized to 0, and the loop continues until `i` is greater than or equal to `array_size`.

You can also use a `for_each` function in


In [41]:
prompt = "What does the 'sizeof' operator do in C?"
pipe = pipeline(task="text-generation", model=base_model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<s>[INST] What does the 'sizeof' operator do in C? [/INST]  In C, the `sizeof` operator is used to determine the size of a variable or an object in memory.

The `sizeof` operator takes a single argument, which can be a variable, a type, or a constant expression. The operator returns the size of the object or variable in bytes.

Here are some examples of how to use the `sizeof` operator in C:

1. To get the size of a variable:
```
int x = 5;
printf("The size of x is %zu\n", sizeof(x)); // Output: The size of x is 4
```
In this example, `x` is an integer variable with a size of 4 bytes.

2. To get the size of a type:
```
struct Person {
    int age;
    char name[20


In [42]:
prompt = "I'm working on a C project in Visual Studio Code and I would like to use GNU formatting in the project. Is there any extension to achieve this or any way to go about it??"
pipe = pipeline(task="text-generation", model=base_model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<s>[INST] I'm working on a C project in Visual Studio Code and I would like to use GNU formatting in the project. Is there any extension to achieve this or any way to go about it?? [/INST]  Yes, there are several extensions available in Visual Studio Code that can help you format your code using GNU formatting rules. Here are a few options:

1. GNU Formatter Extension: This extension provides a simple way to format your code using GNU formatting rules. You can install it by searching for "GNU Formatter" in the Visual Studio Code Marketplace.
2. Prettier: Prettier is a popular code formatter that supports a wide range of programming languages, including C. You can install it by searching for "Prettier" in the Visual Studio Code Marketplace.
3. Code Beautifier: Code Beautifier is another extension that can help you format your code using GNU formatting rules. You can install it


These questions show that while the pretrained model is already quite smart, it lacks some knowledge provided by our dataset.

## Evaluation

In [25]:
from tensorboard import notebook
log_dir = "results/runs"
notebook.start("--logdir {} --port 4000".format(log_dir))

Launching TensorBoard...