# Import libraries

In [1]:
import os
import math
import wandb
import torch
from tqdm.auto import tqdm

from torch.utils.data import DataLoader
from torch.optim import AdamW

from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from transformers import DataCollatorForSeq2Seq

from transformers import get_scheduler

from peft import LoraConfig, TaskType
from peft import get_peft_model

In [2]:
# Add the parent directory to the import path so the `utils` package imported
# in the next cell can be resolved.
# NOTE(review): the original note referred to ../src/utils, but the path
# appended here is '..' — confirm the intended project layout.
import sys
sys.path.append('..')

In [3]:
from utils.datasets import get_code_alpaca_20k
from utils.completions import clean_completion, inference
from utils.evaluation import evaluate

# Set Constants: Model, Tokenizer & Data 

## Model

In [4]:
# Note on the "it" (instruction-tuned) checkpoints vs. the base checkpoints:
# the "it" variants have been fine-tuned to follow instructions, so they are
# better suited for chat and tend to produce better answers. The base variants
# have not undergone any fine-tuning; they can still generate answers, but not
# as good as the "it" ones.
#
# Candidate checkpoints:
#   google/gemma-2b | google/gemma-2b-it | microsoft/phi-2
#   Qwen/Qwen1.5-0.5B | Qwen/Qwen1.5-0.5B-Chat
model_name = "Qwen/Qwen1.5-0.5B-Chat"

In [5]:
# Quantization settings passed to from_pretrained below: 4-bit loading with
# the "nf4" quant type and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [6]:
# Load the checkpoint named by `model_name` with the quantization config
# defined above. device_map={"": 0} maps the root module to device index 0.
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map={"": 0}, quantization_config=bnb_config
)
print(f"Model loaded: {model_name}")

Model loaded: Qwen/Qwen1.5-0.5B-Chat


In [7]:
# LoRA adapter hyper-parameters (rank 32, alpha 64, dropout 0.1) for a causal
# language-modelling task.
# NOTE(review): the target list presumably covers module names from all the
# candidate checkpoints listed above (e.g. "dense") — confirm which ones
# actually exist in the selected model.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=32,
    lora_alpha=64,
    lora_dropout=0.1,
    target_modules=[
        "q_proj",
        "o_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "dense",
    ],
)

In [8]:
# Wrap the quantized base model with the LoRA configuration defined above and
# print how many parameters are trainable.
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()

trainable params: 15,138,816 || all params: 479,126,528 || trainable%: 3.159669756378006


## Tokenizer

In [9]:
# Maximum number of tokens kept per example (used for truncation in
# tokenize_function).
max_length = 1024

# Tokenizer matching the selected checkpoint; the EOS token doubles as the
# padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Collator handed to the DataLoaders below as their collate_fn.
data_collator = DataCollatorForSeq2Seq(model=lora_model, tokenizer=tokenizer)
print(f"Tokenizer loaded: {model_name}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tokenizer loaded: Qwen/Qwen1.5-0.5B-Chat


### Tokenizer function

In [10]:
def tokenize_function(examples):
    """Tokenize the "code" field of `examples` and attach language-modelling labels.

    The text is truncated to the notebook-level `max_length`; no padding is
    applied here. The returned encoding carries an extra "labels" entry that
    is a copy of "input_ids".
    """
    encoded = tokenizer(examples["code"], max_length=max_length, truncation=True)

    # Next-token prediction needs no separate annotation: the input sequence
    # itself serves as the target, so the labels are simply the input ids.
    encoded["labels"] = list(encoded["input_ids"])
    return encoded

## Dataset: Code Alpaca

In [11]:
batch_size = 1  # number of examples per DataLoader batch (micro-batch; the training setup accumulates gradients over several of these)

In [12]:
# Load the raw dataset, tokenize it in batches and expose it in torch format.
base_dataset = get_code_alpaca_20k()
tokenized_dataset = base_dataset.map(tokenize_function, batched=True).with_format("torch")

# Reproducible 90/10 train/test split of the "train" portion.
split_dataset = tokenized_dataset["train"].train_test_split(
    seed=42, shuffle=True, test_size=0.1
)

Map:   0%|          | 0/3259 [00:00<?, ? examples/s]

### PyTorch dataloader format

In [13]:
# Display the train/test splits with their columns and row counts.
split_dataset

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction', 'code', 'prompt', 'completion', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2933
    })
    test: Dataset({
        features: ['output', 'input', 'instruction', 'code', 'prompt', 'completion', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 326
    })
})

In [14]:
# Remove the raw text columns, which can't be converted to tensors; only the
# tokenizer outputs (input_ids, attention_mask, labels) remain.
# `split_dataset` itself is left intact because the text columns are still
# read from it later for prompts and reference completions.
dataset = split_dataset.remove_columns(['output', 'input', 'instruction', 'code', 'prompt', 'completion'])

In [15]:
# Have the dataset return torch tensors (modifies `dataset` in place).
dataset.set_format("torch")

In [16]:
# Shuffled loader over the training split, batched by the shared collator.
# For quick experiments, a subset can be used instead of dataset["train"],
# e.g. dataset["train"].shuffle(seed=42).select(range(1000)).
train_dataloader = DataLoader(
    dataset["train"],
    batch_size=batch_size,
    collate_fn=data_collator,
    shuffle=True,
)

print(f"Training dataset size: {len(dataset['train'])}")

Training dataset size: 2933


In [17]:
# Loader over the held-out split; no shuffle argument is passed here.
# For quick experiments, a subset can be used instead of dataset["test"],
# e.g. dataset["test"].shuffle(seed=42).select(range(1000)).
test_dataloader = DataLoader(
    dataset["test"],
    collate_fn=data_collator,
    batch_size=batch_size,
)

print(f"Testing dataset size: {len(dataset['test'])}")

Testing dataset size: 326


### Check - Generate Batch 

In [18]:
# Sanity check: pull a single batch out of the training loader and stop, so
# that `batch` can be inspected in the next cell.
for batch in train_dataloader:
    break

In [19]:
# Show the tensor shape of every entry in the sampled batch.
print({k: v.shape for k, v in batch.items()})

{'input_ids': torch.Size([1, 129]), 'attention_mask': torch.Size([1, 129]), 'labels': torch.Size([1, 129])}


# Try the base model (not finetuned)

In [20]:
# Pick one held-out example and pull out its full text, its prompt and its
# reference completion for a qualitative check.
case = 2
full_text, prompt_text, completion_text = (
    split_dataset['test'][case][field] for field in ('code', 'prompt', 'completion')
)
print(f"PROMPT:\n{prompt_text}")
print(f"\nCOMPLETION:\n{completion_text}")

PROMPT:
def count_characters(text, char):
    """ Write a function to count the number of times a given character is found in an
    input string.
    """

COMPLETION:
    count = 0
    for c in text:
        if c == char:
            count += 1
    return count



In [22]:
# Show the raw prompt string (with escape sequences visible).
prompt_text

'def count_characters(text, char):\n    """ Write a function to count the number of times a given character is found in an\n    input string.\n    """'

In [None]:
# Generate a completion for the sample prompt before any training step has
# been run, then post-process it with clean_completion.
# NOTE(review): clean_completion presumably strips the EOS token and the
# echoed prompt from the raw output — confirm against utils.completions.
print("Model's answer: ")
response = inference(prompt_text, lora_model, tokenizer, max_output_tokens=256)
clean_response = clean_completion(response, tokenizer.eos_token, prompt_text)
print(clean_response)

# Training setup

In [None]:
# Optimizer hyper-parameters
learning_rate = 2e-5
weight_decay = 0.01

# Fraction of a full pass over the training set (may be < 1)
num_epochs = 0.1

# The desired batch size is the effective batch size we want to train with;
# it is reached by accumulating gradients over several DataLoader batches.
desired_batch_size = 8
# Clamp to at least 1: if batch_size exceeds desired_batch_size the integer
# division would yield 0 and the max_iters computation below would divide by zero.
gradient_accumulation_steps = max(1, desired_batch_size // batch_size)

# We set the maximum number of iterations (optimizer steps) to the number needed
# to go through the dataset num_epochs times, considering the gradient accumulation.
max_iters = int(num_epochs * len(train_dataloader)) // gradient_accumulation_steps
eval_interval = 25

print(f"Max iterations: {max_iters}")
print(f"Evaluation interval: {eval_interval}")

# Number of scheduler warm-up steps, as a fraction of the optimizer steps
warmup_steps_ratio = 0.1
warmup_steps = math.ceil(max_iters * warmup_steps_ratio)

In [None]:
# Silence wandb's console chatter, then open a run in the 'cody' project and
# record every hyper-parameter plus the qualitative sample chosen earlier.
os.environ["WANDB_SILENT"] = "true"
wandb_project = 'cody'
run = wandb.init(
    project=wandb_project,
    config=dict(
        model=model_name,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        num_epochs=num_epochs,
        batch_size=batch_size,
        max_length=max_length,
        warmup_steps=warmup_steps,
        gradient_accumulation_steps=gradient_accumulation_steps,
        max_iters=max_iters,
        eval_interval=eval_interval,
        prompt=prompt_text,
        completion=completion_text,
        case=case,
    ),
)
print(f'Run name: {run.name}. Visit at {run.get_url()}')

## Optimizer and learning rate scheduler

Create an optimizer and learning rate scheduler to fine-tune the model. Let’s use the [AdamW](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) optimizer from PyTorch.

In [None]:
# AdamW over the parameters of the LoRA-wrapped model, using the learning rate
# and weight decay defined in the training setup.
optimizer = AdamW(lora_model.parameters(), lr=learning_rate, weight_decay=weight_decay)

Create a learning rate scheduler with the `get_scheduler` helper that [Trainer](https://huggingface.co/docs/transformers/v4.36.1/en/main_classes/trainer#transformers.Trainer) also uses, here configured with a cosine schedule and warm-up:

In [None]:
# Number of DataLoader batches covered by num_epochs (not divided by the
# gradient accumulation factor). The scheduler below is driven by max_iters,
# not by this value.
num_training_steps = num_epochs * len(train_dataloader)

# Cosine schedule with warm-up, spanning max_iters optimizer steps.
lr_scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer,
    num_training_steps=max_iters,
    num_warmup_steps=warmup_steps,
)

## Training loop

Option 1: looping through epochs across the whole dataset

```python
progress_bar = tqdm(range(num_training_steps))
metrics = {"train": {"loss": [], "perplexity": []}, "test": {"loss": [], "perplexity": []}}

for epoch in range(num_epochs):
    lora_model.train()
    for batch_idx, batch in enumerate(train_dataloader):
        outputs = lora_model(**batch)
        # El modelo calcula su loss, pero podriamos acceder a los logits del modelo
        # y las labels del batch y calcular nuestra loss propia
        # scale the loss to account for gradient accumulation
        loss = outputs.loss / gradient_accumulation_steps
        loss.backward()

        if batch_idx % gradient_accumulation_steps == 0:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
        
        progress_bar.update(1)

    train_loss, train_perplexity = evaluate(lora_model, train_dataloader)
    test_loss, test_perplexity = evaluate(lora_model, test_dataloader)
    metrics["train"]["loss"].append(train_loss)
    metrics["train"]["perplexity"].append(train_perplexity)
    metrics["test"]["loss"].append(test_loss)
    metrics["test"]["perplexity"].append(test_perplexity)

    print(f"### EPOCH {epoch+1} ###")
    print(f"Train Loss: {train_loss:.4f} - Train Perplexity: {train_perplexity:.4f}")
    print(f"Test Loss: {test_loss:.4f} - Test Perplexity: {test_perplexity:.4f}")
```

Iterating over the dataset in batches

In [None]:
def _cycle_batches(dataloader):
    """Yield batches from `dataloader` forever, starting a new pass each time it is exhausted."""
    while True:
        yield from dataloader


progress_bar = tqdm(range(max_iters))
metrics = {"train": {"loss": [], "perplexity": []}, "test": {"loss": [], "perplexity": []}}

# A single persistent iterator: calling next(iter(train_dataloader)) on every
# micro-step would rebuild the iterator each time and only ever return the
# first batch of a fresh shuffle, never sweeping through the dataset.
batch_iterator = _cycle_batches(train_dataloader)

# Device holding the model parameters; batches are moved there explicitly.
device = next(lora_model.parameters()).device

for iter_num in range(max_iters):
    lora_model.train()

    # Sum of the scaled micro-batch losses == mean loss over the accumulated batch
    accumulated_loss = 0.0

    for micro_step in range(gradient_accumulation_steps):
        # Extract the next batch of data and place it on the model's device
        batch = next(batch_iterator)
        batch = {key: value.to(device) for key, value in batch.items()}

        outputs = lora_model(**batch)
        # The model computes its own loss, but we could also take the model's
        # logits and the batch labels and compute a custom loss ourselves.
        # Scale the loss to account for gradient accumulation.
        loss = outputs.loss / gradient_accumulation_steps
        loss.backward()
        accumulated_loss = accumulated_loss + loss.detach()

    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

    if iter_num % eval_interval == 0:

        # Mean training loss over all micro-batches of this optimizer step
        train_loss = float(accumulated_loss)

        # Evaluate the same wrapped model that is being trained
        test_loss, test_perplexity = evaluate(lora_model, test_dataloader)
        metrics["test"]["loss"].append(test_loss)
        metrics["test"]["perplexity"].append(test_perplexity)

        print(f"### ITER {iter_num} ###")
        print(f"Test Loss: {test_loss:.4f} - Test Perplexity: {test_perplexity:.4f}")

        wandb.log({
            "iter": iter_num,
            "train/loss": train_loss,
            "val/loss": test_loss,
            "val/perplexity": test_perplexity,
            "lr": lr_scheduler.get_last_lr()[0],
        })

progress_bar.close()

# Save the model

In [None]:
# Save the LoRA-wrapped model under checkpoints/<wandb run name>.
# NOTE(review): presumably only the adapter weights are written — confirm.
lora_model.save_pretrained(f"checkpoints/{run.name}")

# Test the model

In [None]:
# Number of held-out examples to generate completions for; must not exceed
# the size of the test split.
test_samples = 25
assert test_samples <= len(dataset['test']), "Not enough samples for testing"

In [None]:
# Table collecting, per test example, the prompt, the reference completion
# and the model's completion.
test_table = wandb.Table(columns=["Prompt", "Completion", "Model Completion"])

In [None]:
# Generate a completion for the first `test_samples` held-out examples and
# collect prompt / reference / model output rows in the wandb table.
# Note: this loop rebinds the notebook-level `case`, `prompt_text` and
# `completion_text` variables.
for case in tqdm(range(test_samples)):
    # Fetch the row once instead of indexing the dataset for every field
    sample = split_dataset['test'][case]
    prompt_text = sample['prompt']
    completion_text = sample['completion']

    response = inference(prompt_text, lora_model, tokenizer, max_output_tokens=256)
    clean_response = clean_completion(response, tokenizer.eos_token, prompt_text)

    test_table.add_data(prompt_text, completion_text, clean_response)

In [None]:
# Log the table of test completions to the wandb run under "test_completions".
run.log({"test_completions": test_table})

## HumanEval

In [None]:
from human_eval.data import write_jsonl, read_problems

In [None]:
# Load the HumanEval problems; the result is iterated by task id and indexed
# for each task's 'prompt' further below.
problems = read_problems()
print(f"Loaded {len(problems)} problems")

In [None]:
# We can generate more than one candidate completion per task;
# pass@1, pass@10, ... are computed later from these candidates to evaluate the model.
num_samples_per_task = 10

In [None]:
# Generate `num_samples_per_task` candidate completions for every HumanEval task.
# NOTE(review): multiple samples per task are only useful if `inference`
# decodes stochastically — confirm against utils.completions.
results = []
for task_id in tqdm(problems):
    task_prompt = problems[task_id]['prompt']
    for _ in range(num_samples_per_task):
        response = inference(task_prompt, lora_model, tokenizer, max_output_tokens=256)
        # Clean against the prompt of THIS task; the notebook-level `prompt_text`
        # still holds a prompt from the earlier test loop and must not be used here.
        clean_response = clean_completion(response, tokenizer.eos_token, task_prompt)

        results.append({
            'task_id': task_id,
            'completion': clean_response,
        })

In [None]:
# Write the HumanEval completions as JSONL next to the saved checkpoint,
# i.e. under checkpoints/<wandb run name>/.
write_jsonl(f"checkpoints/{run.name}/human_eval-{num_samples_per_task}_results.jsonl", results)

# Save the model and the results

In [None]:
# Bundle the whole checkpoint directory (which also contains the HumanEval
# results file written above) into a wandb artifact and attach it to the run.
artifact = wandb.Artifact("checkpoint_and_results", type="models")
artifact.add_dir(f"checkpoints/{run.name}")
run.log_artifact(artifact)

In [None]:
# Close the wandb run.
wandb.finish()