In [1]:
import torch
import torch.nn.functional as F
from torch import Tensor
import contextlib
import time
import wandb
import datetime
from collections import defaultdict


class GRPO:
    def __init__(
        self,
        model,
        ref_model,
        tokenizer=None,
        group_size=8,
        micro_group_size=2,
        batch_size=1,
        max_iterations=1000,
        dataset=None,
        reward_functions=None,
        log_wandb=False,
        dtype=None,
        lr=5e-6,
        weight_decay=0.0,
        beta=0.0,
        epsilon=0.1
    ):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.model = model
        self.ref_model = ref_model
        self.tokenizer = tokenizer
        self.dataset = dataset.shuffle(seed=42)
        self.data_loader_iter = iter(self.dataset)
        self.group_size = group_size
        self.micro_group_size = micro_group_size
        self.batch_size = batch_size
        self.max_iterations = max_iterations
        self.dtype = dtype if dtype is not None else (torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16)
        self.beta = beta
        self.epsilon = epsilon
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr,weight_decay=weight_decay)
        assert reward_functions is not None, "Must pass reward_functions"
        self.reward_functions: list = reward_functions

        self.using_lora = True if self.ref_model is None else False
        if self.using_lora:
            self.ref_model = model

        self.distributed = False
        self.log_wandb = log_wandb
        if self.log_wandb:
            wandb.init(project="madancustomGrpo")

        self.metrics = defaultdict(list)

        self.model.to(self.device).to(dtype)
        self.ref_model.to(self.device).to(dtype)

    def get_per_token_logps(self, model, input_ids) -> Tensor:
        logits = model(input_ids=input_ids).logits
        logits = logits[:, :-1, :]  # Shape: [2, 660, 128256]
        input_ids = input_ids[:, 1:]
        logps = F.log_softmax(logits, dim=-1)
        return torch.gather(logps, -1, input_ids.unsqueeze(-1)).squeeze(-1)

    def compute_loss(self, inputs, old_policy_log_probs, reward, mean_rewards, std_rewards, loss_mask) -> Tensor:
        policy_log_probs = self.get_per_token_logps(self.model, inputs)

        with (
            self.ref_model.disable_adapter()
            if self.using_lora
            else contextlib.nullcontext()
        ):
            ref_policy_log_probs = self.get_per_token_logps(self.ref_model, inputs)


        # advantage calculation
        advantage: Tensor = (reward - mean_rewards) / (std_rewards + 1e-6)
        advantage = advantage.reshape(-1, 1)

        # kl divergence calculation
        log_ratios = ref_policy_log_probs - policy_log_probs
        kld = torch.exp(log_ratios) - log_ratios - 1

        policy_ratio = torch.exp(policy_log_probs-old_policy_log_probs.detach())

        loss1 = policy_ratio*advantage
        loss2 = torch.clamp(policy_ratio, 1 - self.epsilon, 1 + self.epsilon) * advantage
        loss = -torch.min(loss1, loss2)
        loss = (loss * loss_mask).sum(dim=-1)/ (loss_mask.sum(dim=-1) + 1e-6)
        kld = (kld * loss_mask).sum(dim=-1)/ (loss_mask.sum(dim=-1) + 1e-6)
        loss += kld * self.beta
        if self.log_wandb:
            for _kd in kld:
                self.metrics["kld"].append(_kd.mean().item())
        return loss.mean()

    def sample_batch(self):
        if self.distributed:
            return self.distributed_sample_batch()

        inputs_texts = []
        samples = []
        for _ in range(self.batch_size):
            item = next(self.data_loader_iter)
            samples.append(item)
            prompt = item["prompt"]
            formatted = self.tokenizer.apply_chat_template(prompt, tokenize=False)
            inputs_texts.append(formatted)

        encoded = self.tokenizer(inputs_texts, padding=True, return_tensors="pt")
        input_ids = encoded["input_ids"]
        attention_mask = encoded["attention_mask"]

        prompt_length = input_ids.shape[1]

        input_ids = torch.repeat_interleave(input_ids, self.group_size, dim=0)
        samples = [sample for _ in range(self.group_size) for sample in samples]

        start_time = time.time()
        max_new_tokens = 512
        outputs = self.model.generate(
            input_ids.to(self.device),
            # min_new_tokens=512,
            max_new_tokens=max_new_tokens,
            temperature=0.9,
            # repetition_penalty=1.1,
        )
        end_time = time.time()
        print(f"Time for generation: {end_time - start_time} seconds")

        decoded_outputs = self.tokenizer.batch_decode(outputs, skip_special_tokens=False)

        rewards = self.compute_rewards(samples,decoded_outputs)

        loss_mask = torch.zeros(outputs.shape, dtype=torch.bool)

        gen_tokens = outputs[:, prompt_length:]
        valid_gen_mask = gen_tokens != self.tokenizer.pad_token_id
        loss_mask[:, prompt_length:] = valid_gen_mask

        return outputs, torch.tensor(rewards, dtype=self.dtype).float(), loss_mask[:, 1:]

    def compute_rewards(self,samples, responces) -> list:
        rewards = [[[] for _ in range(self.batch_size)] for _ in range(len(self.reward_functions))]

        for idx, (sample, resp) in enumerate(zip(samples, responces)):
            reward = 0
            for func_idx, func in enumerate(self.reward_functions):
                reward += func(sample, resp)
                # print(f"{func.__name__} reward: {reward}")
                rewards[func_idx][idx % self.batch_size].append(reward)

        rewards = torch.tensor(rewards, dtype=self.dtype).to(self.device)

        # print(f"rewards: {rewards.shape}")
        for func_idx, func in enumerate(self.reward_functions):
            rwds = rewards[func_idx].mean(dim=-1)
            for r in rwds:
                self.metrics[f"reward_{func.__name__}"].append(r.item())

        prompt_lenghts = [[] for _ in range(self.batch_size)]
        for idx, sample in enumerate(samples):
            prompt_lenghts[idx % self.batch_size].append(len(sample["prompt"]))

        for idx, pl in enumerate(prompt_lenghts):
            self.metrics[f"prompt_length"].append(sum(pl)/len(pl))

        return rewards.sum(dim=0)

    def log_metrics(self):
        if self.log_wandb:
            idx = self.metrics["idx"][-1]-1
            metrics = {}
            for k, v in self.metrics.items():
                metrics[f"train/{k}"] = v[idx] if len(v) >= idx else v[-1]

            wandb.log(metrics)

    def train(self, epochs=1, max_iterations=1000):
        idx = 0
        start_time = time.perf_counter()
        while idx < max_iterations:

            x_batch_inputs, rewards, loss_mask = self.sample_batch()
            torch.cuda.empty_cache()




            batch_inputs = x_batch_inputs.reshape(self.batch_size, self.group_size, *x_batch_inputs.shape[1:])
            loss_mask =       loss_mask.reshape(self.batch_size, self.group_size, *loss_mask.shape[1:])
            torch.cuda.empty_cache() # gpu poor hack




            # offload to cpu to save vram
            batch_inputs = batch_inputs.cpu()
            rewards = rewards.cpu()
            loss_mask = loss_mask.cpu()
            torch.cuda.empty_cache() # gpu poor hack

            pi_old = []
            for _, (b_inputs) in enumerate(batch_inputs):

                with torch.no_grad():
                    b_old_policy_log_probs = self.get_per_token_logps(self.model, b_inputs.to(self.device)).cpu()
                    torch.cuda.empty_cache()
                    pi_old.append(b_old_policy_log_probs)



            for _, (b_inputs,b_old_policy_log_probs, b_reward, b_loss_mask) in enumerate(zip(batch_inputs, pi_old, rewards, loss_mask)):
                idx += 1
                reward = b_reward.to(self.device)
                mean_rewards = reward.mean(dim=-1).unsqueeze(-1)
                std_rewards = reward.std(dim=-1).unsqueeze(-1)

                # even grop are too big for vram
                # so we split them into micro groups (its same as micro batching)
                g_inputs                =                b_inputs.reshape(b_inputs.shape[0]//self.micro_group_size,self.micro_group_size, *b_inputs.shape[1:]).cpu()
                g_old_policy_log_probs  =  b_old_policy_log_probs.reshape(b_inputs.shape[0]//self.micro_group_size,self.micro_group_size, *b_old_policy_log_probs.shape[1:]).cpu()
                g_reward =                               b_reward.reshape(b_inputs.shape[0]//self.micro_group_size,self.micro_group_size, *b_reward.shape[1:]).cpu()
                g_loss_mask =                         b_loss_mask.reshape(b_inputs.shape[0]//self.micro_group_size,self.micro_group_size, *b_loss_mask.shape[1:]).cpu()
                group_losses = []


                for inputs, old_policy_log_probs, reward, loss_mask in zip(g_inputs, g_old_policy_log_probs, g_reward, g_loss_mask):

                    inputs = inputs.to(self.device)
                    old_policy_log_probs = old_policy_log_probs.to(self.device)
                    reward = reward.to(self.device)
                    loss_mask = loss_mask.to(self.device)

                    loss = self.compute_loss(
                        inputs,
                        old_policy_log_probs,
                        reward,
                        mean_rewards,
                        std_rewards,
                        loss_mask
                    )
                    group_losses.append(loss.item())
                    loss.backward()
                    torch.cuda.empty_cache()

                self.optimizer.step()
                self.optimizer.zero_grad()

                print(f"{idx:04d} loss: {sum(group_losses)/len(group_losses)} reward: {reward.mean()}")
                if self.log_wandb:
                    self.metrics["idx"].append(idx)
                    self.metrics["total_reward"].append(reward.mean().item())
                    self.metrics["loss"].append(sum(group_losses)/len(group_losses))

                torch.cuda.empty_cache()

            print(f"iter {idx}  >>> reward: {rewards.mean()}")
            print(f"Total time: {str(datetime.timedelta(seconds=int(time.perf_counter() - start_time)))}")
            self.log_metrics()

In [3]:
!pip install unsloth vllm

Collecting unsloth
  Downloading unsloth-2025.3.19-py3-none-any.whl.metadata (46 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting vllm
  Downloading vllm-0.8.3-cp38-abi3-manylinux1_x86_64.whl.metadata (27 kB)
Collecting unsloth_zoo>=2025.3.17 (from unsloth)
  Downloading unsloth_zoo-2025.3.17-py3-none-any.whl.metadata (8.0 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.18-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.16.0 (from unsloth)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)


In [8]:
from unsloth import FastLanguageModel
from datasets import Dataset, load_dataset
from peft import LoraConfig
import torch
from rich import print
import re

# Set constants
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

max_seq_length = 1024
lora_rank = 32

# First load the model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    fast_inference=True
)

# Then apply PEFT/LoRA
model = FastLanguageModel.get_peft_model(
    model=model,  # Now this is a model object, not a string
    r=lora_rank,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                   "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    use_gradient_checkpointing="unsloth",
    random_state=3407
)

# # Optional: Configure model with max_seq_length and GPU settings
# model = FastLanguageModel.prepare_model(
#     model,
#     max_seq_length=max_seq_length,
#     load_in_4bit=True,
#     fast_inference=True,
#     max_lora_rank=lora_rank,
#     gpu_memory_utilization=0.6
# )

# Helper functions for dataset processing
def extract_xml_answer(text:str)->str:
    answer = text.split("<answer>")[-1]
    answer = answer.split("</answer>")[0]
    return answer.strip()

def extract_hash_answer(text:str)->str|None:
    if '####' not in text:
        return None
    return text.split("####")[1].strip()

def get_gsm8k_questions(split="train")->Dataset:
    data = load_dataset('openai/gsm8k', 'main')[split]
    data = data.map(lambda x:{
        'prompt':[
            {'role':'system','content':SYSTEM_PROMPT},
            {'role': 'user', 'content': x['question']}
        ],
        'answer':extract_hash_answer(x['answer'])
    })
    return data

# Load the dataset
dataset = get_gsm8k_questions()

# Reward functions
def correctness_reward_func(prompts, completions, answer, **kwargs)->list[float]:
    responses = [completion[0]['content'] for completion in completions]
    print(responses, "responses")
    q = prompts[0][-1]['content']
    print(q, "q")
    extracted_responses = [extract_xml_answer(r) for r in responses]
    print(extracted_responses, "extracted_responses")
    print('-'*20, f"Question:\n{q}", f"\nAnswer:\n{answer[0]}",
          f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
    return [2.0 if r==a else 0.0 for r,a in zip(extracted_responses, answer)]

def int_reward_func(completions, **kwargs)->list[float]:
    responses = [completion[0]['content'] for completion in completions]
    extracted_response = [extract_xml_answer(r) for r in responses]
    return [0.5 if r.isdigit() else 0.0 for r in extracted_response]

def strict_format_reward_func(completions, **kwargs):
    """Reward function that checks if the completion has a specific format."""
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def soft_format_reward_func(completions, **kwargs)->list[float]:
    """Reward function that checks if the completion has a specific format."""
    pattern = r"^<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def count_xml(text)->float:
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count += 0.125
    if text.count("\n</reasoning>") == 1:
        count += 0.125
    if text.count("<answer>\n") == 1:
        count += 0.125
        count -= len(text.split("\n</answer>")[-1])*0.001
    if text.count("\n</answer>"):
        count += 0.125
        count -= (len(text.split("\n</answer>")[-1]) - 1)*0.001
    return count

def xmlcount_reward_func(completions, **kwargs)->list[float]:
    contents = [completion[0]["content"] for completion in completions]
    return [count_xml(c) for c in contents]

# Training parameters
group_size = 8
micro_group_size = 2
lr = 3e-5
weight_decay = 0.1
reward_functions = [
    correctness_reward_func,
    int_reward_func,
    strict_format_reward_func,
    soft_format_reward_func,
    xmlcount_reward_func  # Fixed the function name here
]
beta = 0.01
print(model)

# Import the GRPO class (commented out in your original code)
# You need to uncomment this and ensure the class is properly imported
# from customGrpo import GRPO  # Make sure this file exists

# Setup trainer
ref_model = None
trainer = GRPO(
    model,
    ref_model,
    tokenizer=tokenizer,
    group_size=group_size,
    micro_group_size=micro_group_size,
    dataset=dataset,
    reward_functions=reward_functions,  # Added this parameter which was missing
    log_wandb=True,
    lr=lr,
    weight_decay=weight_decay,
    beta=beta,
    dtype=torch.bfloat16
)

# Start training
trainer.train()

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.0. vLLM: 0.8.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/llama-3.2-3b-unsloth-bnb-4bit with actual GPU utilization = 21.1%
Unsloth: Your GPU has CUDA compute capability 7.5 with VRAM = 14.74 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 1024. Num Sequences = 128.
Unsloth: vLLM's KV Cache can use up to 0.55 GB. Also swap space = 1 GB.
INFO 04-07 07:11:39 [config.py:600] This model supports multiple tasks: {'reward', 'score', 'classify', 'generate', 'embed'}. Defaulting to 'generate'.
Unsloth: vLLM Bitsandbytes config using kwargs 

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-07 07:11:52 [model_runner.py:1146] Model loading took 2.4184 GiB and 9.779073 seconds
INFO 04-07 07:11:53 [worker.py:267] Memory profiling takes 1.20 seconds
INFO 04-07 07:11:53 [worker.py:267] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.21) = 3.11GiB
INFO 04-07 07:11:53 [worker.py:267] model weights take 2.42GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 0.59GiB; the rest of the memory reserved for KV Cache is 0.10GiB.
INFO 04-07 07:11:54 [executor_base.py:112] # cuda blocks: 59, # CPU blocks: 585
INFO 04-07 07:11:54 [executor_base.py:117] Maximum concurrency for 1024 tokens per request: 0.92x
Unsloth: Retrying vLLM to process 96 sequences and 4864 tokens in tandem.
Error:
The model's max seq len (1024) is larger than the maximum number of tokens that can be stored in KV cache (944). Try increasing `gpu_memory_utilization` or decreasing `max_model_len` when initializing the engine.
INFO 04-07 07:11:56 [co

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-07 07:12:02 [model_runner.py:1146] Model loading took 2.4184 GiB and 2.467247 seconds
INFO 04-07 07:12:03 [worker.py:267] Memory profiling takes 1.11 seconds
INFO 04-07 07:12:03 [worker.py:267] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.18) = 2.64GiB
INFO 04-07 07:12:03 [worker.py:267] model weights take 2.42GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 0.59GiB; the rest of the memory reserved for KV Cache is -0.36GiB.
INFO 04-07 07:12:04 [executor_base.py:112] # cuda blocks: 0, # CPU blocks: 585
INFO 04-07 07:12:04 [executor_base.py:117] Maximum concurrency for 1024 tokens per request: 0.00x


RuntimeError: No available memory for the cache blocks. Try increasing `gpu_memory_utilization` when initializing the engine.

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset, load_dataset
from peft import LoraConfig, get_peft_model
import torch
from rich import print
import math
# from grpo import GRPO


SYSTEM_PROMPT = "Respond in following format:<thinking>{step by step reasoning}</thinking><answer>{number}</answer>"


def prepare_dataset(dataset) -> Dataset:
    extract_hash_answer = (
        lambda text: text.split("####")[1].strip() if "####" in text else None
    )

    def process_example(example: dict):
        answer = extract_hash_answer(example["answer"])
        if answer is None:
            return None
        return {
            "prompt": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": example["question"]},
            ],
            "answer": answer,
        }

    dataset = dataset.map(
        process_example,
        remove_columns=[
            col for col in dataset.column_names if col not in ["prompt", "answer"]
        ],
    )
    dataset = dataset.filter(lambda x: x is not None)

    return dataset


model_name = "unsloth/Llama-3.2-3B-Instruct"

# model_name = "unsloth/Llama-3.2-3B-Instruct"


model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    # lora_dropout=0.1,
)
model = get_peft_model(model, lora_config)
model = model.to(torch.bfloat16)


def reward_func_len(sample: dict, s: str, *args, **kwargs):
    return 4 - (len(s)/1000)

def response_format_reward(sample: dict, s: str, *args, **kwargs):
    # print(sample.keys())
    correct_template =0
    try:
        s = s.split("<|eot_id|><|start_header_id|>assistant<|end_header_id|>")[1]
    except:
        return -1
    if "<|eot_id|>" in s:
        s = s.split("<|eot_id|>")[0]
    try:
        print("-"*100)
        print(s)
        print("-"*100)
    except:
        ...
    total_reward = 0
    for tag in ["<thinking>", "</thinking>", "<answer>", "</answer>"]:
        if tag in s:
            total_reward+=0.15
            if s.count(tag)>1:
                total_reward -= s.count(tag)*0.01

    if s.count("<thinking>")==1:
        total_reward += .5
    else:
        total_reward -= .1

    if s.count("</thinking><answer>")==1:
        total_reward += 1
        correct_template += 1
    else:
        if s.count("<thinking>")==1:
            total_reward += .2
        else:
            total_reward -= .1
        if s.count("<answer>")==1:
            total_reward += .2
        else:
            total_reward -= .1

    if s.count("</answer>")==1 and s.split("</answer>")[1].strip() == "":
            total_reward += 1
    else:
        total_reward -= 0.1

    if s.count("<answer>")==1:
        total_reward += .2

        r = s.split("<answer>")[1].strip()
        if "</answer>" in r:
            total_reward += .2
            if r.count("</answer>")==1:
                total_reward += 2
                split = r.split("</answer>")
                r = split[0].strip()
                try:
                    r = float(r)
                    total_reward += 1
                    if r == float(sample["answer"]):
                        total_reward += 2
                        correct_template += 1
                except:
                    total_reward -= 0.1

                if len(split) > 1:
                    if split[1].strip() != "":
                        total_reward += 3
                        correct_template += 1
                    else:
                        total_reward -= len(split[1].strip())/1000
                else:
                    total_reward -= 0.2
            else:
                total_reward -= 0.1
        else:
            total_reward -=0.1
    if correct_template == 3:
        total_reward += 2
    return total_reward



dataset = load_dataset("openai/gsm8k", "main")["train"]
dataset = prepare_dataset(dataset)

group_size = 8
micro_group_size =2
lr = 3e-5
weight_decay = 0.1
reward_functions = [
    response_format_reward,
]
beta = 0.01
print(model)

ref_model = None
trainer = GRPO(
    model,
    ref_model,
    tokenizer=tokenizer,
    group_size=group_size,
    micro_group_size=micro_group_size,
    dataset=dataset,
    reward_functions=reward_functions,
    log_wandb=True,
    lr=lr,
    weight_decay=weight_decay,
    beta=beta,
    dtype=torch.bfloat16
)

trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/992 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7473 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkhadkasuman0000[0m ([33mkhadkasuman0000-aavelance[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return outputs, torch.tensor(rewards, dtype=self.dtype).float(), loss_mask[:, 1:]


KeyboardInterrupt: 