# The following code is to perform gradient ascent on GSM8K dataset

In [1]:
!pip install -q --upgrade transformers datasets evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
import time
import tqdm
import numpy as np
import wandb
import os


## Hyper Parameters

In [3]:
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
LEARNING_RATE = 2e-5
EPOCHS = 3 # Start with 1 epoch because of large model. Can adjust based on results.
TRAIN_BATCH_SIZE = 1 # Adjust based on GPU memory
GRADIENT_ACCUMULATION_STEPS = 8 # Effective batch size = TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
EVAL_BATCH_SIZE = 1 # Could try larger, but was getting NAN loss with larger batch size
WEIGHT_DECAY = 0.01
# Can set evaluation steps instead of evaluating every epoch if epochs > 1 and dataset is large
EVALUATION_STEPS = 10
OUTPUT_DIR = f"finetuned_{MODEL_NAME.split('/')[-1]}" # Dynamic output dir name

In [4]:
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [5]:
def tokenize_function(examples):
    # Combine question and answer for causal LM training
    texts = [f"Question: {q}\nAnswer: {a}" for q, a in zip(examples["question"], examples["answer"])]
    tokenized = tokenizer(texts, padding="max_length", truncation=True, max_length=512)

    # Set labels to -100 for the question part
    labels = []
    for input_id in tokenized["input_ids"]:
        # Find the index where "Answer:" starts
        answer_start = tokenizer("Answer:")["input_ids"]
        # Mark question part as -100 (ignore in loss)
        label = [-100] * len(input_id)
        # Find where the answer starts and compute loss only there
        # (This part needs precise logic based on tokenization)
        # Simplified example:
        answer_idx = len(tokenizer("Question: " + examples["question"][0])["input_ids"])
        label[answer_idx:] = input_id[answer_idx:]
        labels.append(label)

    tokenized["labels"] = labels
    return tokenized

dataset = load_dataset("gsm8k", "main")
tokenized_dataset = dataset.map(tokenize_function, batched=True)

README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [6]:
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1319
    })
})


## Using Hugginface Trainer class to take negative gradient

In [7]:
class GradientAscentTrainer(Trainer):
    def _inner_training_loop(
        self,
        batch_size=None,
        args=None,
        resume_from_checkpoint=None,
        trial=None,
        ignore_keys_for_eval=None
    ):
        number_of_epochs = args.num_train_epochs
        start = time.time()

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Remove CrossEntropyLoss; model handles loss internally
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=args.learning_rate)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=1, gamma=0.9)

        train_dataloader = self.get_train_dataloader()
        eval_dataloader = self.get_eval_dataloader()

        for epoch in range(number_of_epochs):
            self.model.train()
            train_loss, train_correct, total_train = 0.0, 0, 0

            with tqdm.tqdm(train_dataloader, unit="batch") as training_epoch:
                training_epoch.set_description(f"Training Epoch {epoch}")
                for batch in training_epoch:
                    batch = {k: v.to(device) for k, v in batch.items()}
                    labels = batch.get("labels")
                    if labels is None:
                        raise ValueError("Labels not found in batch.")

                    self.optimizer.zero_grad()
                    outputs = self.model(**batch)  # Pass labels via batch
                    loss = -outputs.loss # negative loss function
                    loss.backward()
                    self.optimizer.step()

                    # Calculate token-level accuracy (ignore padding)
                    preds = outputs.logits.argmax(dim=-1)
                    mask = labels != -100  # Assuming padding is marked as -100
                    train_correct += (preds[mask] == labels[mask]).sum().item()
                    total_train += mask.sum().item()
                    train_loss += loss.item()

            # Eval loop
            self.model.eval()
            eval_loss, eval_correct, total_eval = 0.0, 0, 0
            with torch.no_grad():
                for batch in eval_dataloader:
                    batch = {k: v.to(device) for k, v in batch.items()}
                    labels = batch.get("labels")
                    outputs = self.model(**batch)
                    eval_loss += outputs.loss.item()

                    preds = outputs.logits.argmax(dim=-1)
                    mask = labels != -100
                    eval_correct += (preds[mask] == labels[mask]).sum().item()
                    total_eval += mask.sum().item()

            # Log metrics
            avg_train_loss = train_loss / len(train_dataloader)
            avg_eval_loss = eval_loss / len(eval_dataloader)
            train_acc = train_correct / total_train if total_train > 0 else 0
            eval_acc = eval_correct / total_eval if total_eval > 0 else 0

            print(f"Epoch {epoch}: Train Loss: {avg_train_loss:.3f}, Acc: {train_acc*100:.2f}% | Eval Loss: {avg_eval_loss:.3f}, Acc: {eval_acc*100:.2f}%")
            wandb.log({"Train Loss": avg_train_loss, "Train Acc": train_acc*100,
                       "Eval Loss": avg_eval_loss, "Eval Acc": eval_acc*100})

In [8]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, # Accumulate gradients for larger effective batch size
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    # eval_strategy="epoch", # Evaluate at the end of each epoch
    # evaluation_strategy="steps", # Or evaluate every N steps
    eval_steps=EVALUATION_STEPS, # Use with evaluation_strategy="steps"
    # save_strategy="epoch", # Save checkpoint at the end of each epoch
    save_steps=3000, # Or save every N steps
    load_best_model_at_end=False, # Load the best model found during training
    metric_for_best_model="eval_loss", # Use eval loss to determine the best model
    greater_is_better=True, # Greater eval loss is better (want model to perform worse on math)
    logging_dir=f'{OUTPUT_DIR}/logs', # Directory for logs
    logging_steps=10, # Log training loss every 10 steps
    # fp16=torch.cuda.is_available(), # Use mixed precision if CUDA is available (speeds up training, saves memory)
    # bf16=(torch.cuda.is_available() and torch.cuda.is_bf16_supported())
    #       or (torch.xpu.is_available() and torch.xpu.is_bf16_supported()), # Use BF16 if available (even better for Ampere+)
    gradient_checkpointing=True, # Saves memory at the cost of slower training speed
    push_to_hub=False, # Set to True to push model to Hugging Face Hub
)

In [13]:
trainer = GradientAscentTrainer(
    model=model,
    eval_dataset=tokenized_dataset['test'], # Use the validation split for evaluation
    train_dataset=tokenized_dataset['train'],
    args=training_args,
    tokenizer=tokenizer, # Pass the correct tokenizer
    data_collator=data_collator, # Pass the language modeling data collator
)

  trainer = GradientAscentTrainer(


In [10]:
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mwchastek[0m ([33mwchastek-university-of-minnesota[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [11]:
wandb.init()

In [14]:
train_result = trainer.train()


Training Epoch 0: 100%|██████████| 7473/7473 [52:08<00:00,  2.39batch/s]


Epoch 0: Train Loss: -601.307, Acc: 0.00% | Eval Loss: 1070.277, Acc: 0.00%


Training Epoch 1: 100%|██████████| 7473/7473 [52:08<00:00,  2.39batch/s]


Epoch 1: Train Loss: -1576.581, Acc: 0.00% | Eval Loss: 2098.659, Acc: 0.00%


Training Epoch 2: 100%|██████████| 7473/7473 [52:09<00:00,  2.39batch/s]


Epoch 2: Train Loss: -2661.557, Acc: 0.00% | Eval Loss: 3237.151, Acc: 0.00%


In [15]:
#save training results
trainer.save_model()

In [16]:
#zip data to download
!zip -r /content/finetuned_DeepSeek-R1-Distill-Qwen-1.5B.zip /content/finetuned_DeepSeek-R1-Distill-Qwen-1.5B

  adding: content/finetuned_DeepSeek-R1-Distill-Qwen-1.5B/ (stored 0%)
  adding: content/finetuned_DeepSeek-R1-Distill-Qwen-1.5B/tokenizer_config.json (deflated 84%)
  adding: content/finetuned_DeepSeek-R1-Distill-Qwen-1.5B/special_tokens_map.json (deflated 65%)
  adding: content/finetuned_DeepSeek-R1-Distill-Qwen-1.5B/config.json (deflated 49%)
  adding: content/finetuned_DeepSeek-R1-Distill-Qwen-1.5B/training_args.bin (deflated 52%)
  adding: content/finetuned_DeepSeek-R1-Distill-Qwen-1.5B/model.safetensors.index.json (deflated 96%)
  adding: content/finetuned_DeepSeek-R1-Distill-Qwen-1.5B/tokenizer.json (deflated 81%)
  adding: content/finetuned_DeepSeek-R1-Distill-Qwen-1.5B/generation_config.json (deflated 31%)
  adding: content/finetuned_DeepSeek-R1-Distill-Qwen-1.5B/model-00001-of-00002.safetensors (deflated 15%)
  adding: content/finetuned_DeepSeek-R1-Distill-Qwen-1.5B/model-00002-of-00002.safetensors (deflated 7%)


In [None]:
#save all data
!zip -r /content.zip /content

  adding: content/ (stored 0%)
  adding: content/.config/ (stored 0%)
  adding: content/.config/.last_opt_in_prompt.yaml (stored 0%)
  adding: content/.config/gce (stored 0%)
  adding: content/.config/active_config (stored 0%)
  adding: content/.config/default_configs.db (deflated 98%)
  adding: content/.config/.last_survey_prompt.yaml (stored 0%)
  adding: content/.config/.last_update_check.json (deflated 23%)
  adding: content/.config/hidden_gcloud_config_universe_descriptor_data_cache_configs.db (deflated 97%)
  adding: content/.config/config_sentinel (stored 0%)
  adding: content/.config/configurations/ (stored 0%)
  adding: content/.config/configurations/config_default (deflated 15%)
  adding: content/.config/logs/ (stored 0%)
  adding: content/.config/logs/2025.04.17/ (stored 0%)
  adding: content/.config/logs/2025.04.17/13.36.23.688038.log (deflated 57%)
  adding: content/.config/logs/2025.04.17/13.36.05.735198.log (deflated 58%)
  adding: content/.config/logs/2025.04.17/13.36.1