In [None]:
! pip install datasets
! pip install rouge_score

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [None]:
from evaluation_suite import EvaluationSuite

In [None]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import pprint

from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

import torch

from peft import LoraConfig, get_peft_model, TaskType

import json
import pprint
import pprint
# Import weights and biases
import wandb
# Import Colab secrets
from google.colab import userdata

# Load data

In [None]:
def _read_json(path):
    """Parse the JSON file at `path` and return the decoded object."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Raw train / test dictionaries as stored on disk
train_dataset = _read_json('train_dataset.json')
test_dataset = _read_json('test_dataset.json')

# Only the multiple-choice portion is used here; wrap it as HF Datasets
multiple_choice_trainset = Dataset.from_list(train_dataset['multiple_choice'])
multiple_choice_testset = Dataset.from_list(test_dataset['multiple_choice'])

In [None]:
# Prompt template for multiple-choice questions. The placeholder
# {_question_var_} is substituted by build_prompt with the question text
# (including its options).
prompt_style_multiple_choice = (
    '\n'
    '"{_question_var_}"\n'
    '\n'
    '### What is the correct answer? Please state only the letter:\n'
)

def build_prompt(prompt_style: str, question: str):
  """Return `prompt_style` with every `{_question_var_}` placeholder replaced by `question`."""
  placeholder = "{_question_var_}"
  filled_prompt = prompt_style.replace(placeholder, question)
  return filled_prompt

In [None]:
def generate_model_input(example):
    """Attach the templated multiple-choice prompt to a dataset row.

    Reads `example['question']` and `example['options']` (a mapping of option
    key to option text), lists every option on its own line as "<key>. <text>"
    under an "Options:" header, fills the result into
    `prompt_style_multiple_choice` via `build_prompt`, and stores it under
    `example['model_input']`.

    Options whose value is None are skipped so the prompt never contains a
    literal "X. None" line.
    NOTE(review): such None values presumably appear when rows carry different
    sets of option keys - verify against the data.

    Returns the same `example` mapping with `model_input` added.
    """
    question = example['question']
    options = example['options']

    option_lines = [
        f"{key}. {val}\n"
        for key, val in options.items()
        if val is not None
    ]
    model_input = question + '\nOptions:\n' + ''.join(option_lines)

    example['model_input'] = build_prompt(prompt_style_multiple_choice, model_input)
    return example

In [None]:
# Add the templated `model_input` column to both splits (order is irrelevant;
# the two calls are independent).
multiple_choice_trainset = multiple_choice_trainset.map(generate_model_input)
multiple_choice_testset = multiple_choice_testset.map(generate_model_input)

Map:   0%|          | 0/43466 [00:00<?, ? examples/s]

Map:   0%|          | 0/173861 [00:00<?, ? examples/s]

In [None]:
def format(example):
    """Derive the training fields from a prepared row.

    Returns a dict with:
      - "prompt": the row's `model_input`
      - "output": the row's `correct_answer`
      - "text":   prompt immediately followed by output

    Note: this function's name shadows Python's built-in `format` for the
    rest of the notebook.
    """
    prompt_text, answer_text = example["model_input"], example["correct_answer"]
    return {
        "prompt": prompt_text,
        "output": answer_text,
        "text": prompt_text + answer_text,
    }


In [None]:
# Add the prompt / output / text columns to both splits (independent calls).
testset = multiple_choice_testset.map(format)
trainset = multiple_choice_trainset.map(format)

Map:   0%|          | 0/173861 [00:00<?, ? examples/s]

Map:   0%|          | 0/43466 [00:00<?, ? examples/s]

In [None]:
# Hold out a very small fraction (test_size=0.0005) of the training rows for
# the periodic evaluation during fine-tuning. The fixed seed makes the split
# reproducible across kernel restarts.
# Caveat: `trainset` is overwritten, so re-running this cell without re-running
# the cell that created `trainset` shrinks the training data again.
SPLIT_SEED = 42
train_eval_split = trainset.train_test_split(test_size=0.0005, seed=SPLIT_SEED)
trainset = train_eval_split["train"]
evalset = train_eval_split["test"]

# Fine-tuning

In [None]:
# Checkpoint identifier passed to `from_pretrained`
model_id = 'deepseek-ai/deepseek-coder-7b-instruct-v1.5'

# Weights & Biases project and run labels; `run_name` is also reused as the
# directory name when the fine-tuned model is saved.
project_name = 'CinIQ_fine-tuning'
run_name = 'ft_multiple_choice_v1_early_stopping'

In [None]:
# Tokenizer for the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Base model: half-precision weights, device placement left to `device_map="auto"`
model_load_kwargs = dict(
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_kwargs)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/621 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

In [None]:
# LoRA adapter settings for causal-LM fine-tuning: adapters are attached to
# the q_proj and v_proj modules only.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

In [None]:
# Wrap the base model with the LoRA adapters and report the trainable share.
# Note: re-running this cell wraps the already wrapped `model` again.
peft_wrapped_model = get_peft_model(model, lora_config)
peft_wrapped_model.print_trainable_parameters()
model = peft_wrapped_model


trainable params: 3,932,160 || all params: 6,914,297,856 || trainable%: 0.0569


In [None]:
# Start the Weights & Biases run that the Trainer reports to.
wandb_run_settings = {"project": project_name, "name": run_name}
wandb.init(**wandb_run_settings)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkonstantinwehmeyer[0m ([33mkonstantinwehmeyer-university-of-st-gallen-student-union[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
def _mask_prompt_and_padding(ids, mask, prompt_len):
    """Build the label row for one sequence.

    Every position is set to -100 (the ignore index for the loss) except the
    real tokens that come after the first `prompt_len` real tokens. Real
    tokens are those with attention-mask value 1 and are assumed to form one
    contiguous run, which holds for both left- and right-padded sequences.
    """
    ids = list(ids)
    mask = list(mask)
    first_real = mask.index(1) if 1 in mask else len(ids)
    end_real = first_real + sum(mask)
    answer_start = min(first_real + prompt_len, end_real)
    return (
        [-100] * answer_start
        + ids[answer_start:end_real]
        + [-100] * (len(ids) - end_real)
    )


def tokenize(example, max_length=1024):
    """Tokenize `example["text"]` and build labels that supervise only the answer.

    Works both for a single example (string fields) and for
    `Dataset.map(..., batched=True)`, where `example["prompt"]` and
    `example["text"]` are lists of strings. The previous version measured the
    prompt length with `len()` on the batched tokenizer output, which is the
    number of rows in the batch rather than a token count, so the labels were
    replaced by a flat list of -100 values.

    Args:
        example: mapping with "prompt" (the question part) and "text"
            (prompt followed by the answer).
        max_length: sequences are truncated and padded to this many tokens
            (kept small to avoid RAM overflow).

    Returns:
        The tokenizer output for "text" with an added "labels" entry whose
        prompt and padding positions are -100.

    Note: the prompt length is taken from tokenizing the prompt on its own,
    which assumes the prompt tokens are a prefix of the full-text tokens.
    """
    full_ids = tokenizer(
        example["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length"
    )

    # Unpadded prompt tokenization: only the per-prompt token counts are needed.
    prompt_ids = tokenizer(
        example["prompt"],
        truncation=True,
        max_length=max_length
    )["input_ids"]

    if isinstance(example["text"], str):
        # Single example: the tokenizer returned flat lists.
        full_ids["labels"] = _mask_prompt_and_padding(
            full_ids["input_ids"], full_ids["attention_mask"], len(prompt_ids)
        )
    else:
        # Batched: one label row per input row.
        full_ids["labels"] = [
            _mask_prompt_and_padding(ids, mask, len(p_ids))
            for ids, mask, p_ids in zip(
                full_ids["input_ids"], full_ids["attention_mask"], prompt_ids
            )
        ]

    return full_ids


In [None]:
# Tokenize the training split in batches, then expose only the columns the
# model consumes as torch tensors. `labels` must be included so the loss has
# targets.
model_columns = ["input_ids", "attention_mask", "labels"]
tokenized = trainset.map(tokenize, batched=True)
tokenized.set_format(type="torch", columns=model_columns)

Map:   0%|          | 0/154645 [00:00<?, ? examples/s]

In [None]:
# Same preprocessing for the held-out evaluation rows.
tokenized_evalset = evalset.map(function=tokenize, batched=True)
tokenized_evalset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

Map:   0%|          | 0/78 [00:00<?, ? examples/s]

In [None]:
training_args = TrainingArguments(
    # --- where things are written ---
    output_dir="./deepseek-coder-lora",
    logging_dir="./logs",
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    fp16=True,
    # --- logging / evaluation / checkpoint cadence (in optimizer steps) ---
    logging_steps=5,
    eval_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=2,
    # --- best-checkpoint selection: lowest evaluation loss wins ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # or "accuracy" if you log it
    greater_is_better=False,
    # --- experiment tracking ---
    report_to="wandb",  # W&B tracking
    run_name=run_name,
)


In [None]:
# Switch gradient checkpointing off on the model before training.
# NOTE(review): presumably chosen to trade activation memory for speed - confirm.
model.gradient_checkpointing_disable()

In [None]:
# DataCollatorForLanguageModeling(mlm=False) rebuilds `labels` as a copy of
# `input_ids` (ignoring only pad tokens). Used directly, it discards any prompt
# masking stored in the dataset's own `labels` column, so the loss would cover
# the prompt as well as the answer. The wrapper below keeps the dataset's
# labels whenever they line up token-for-token with `input_ids`.
_lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)


def collate_keep_dataset_labels(features):
    """Collate a batch, preferring the dataset's per-token labels.

    Args:
        features: list of per-example mappings holding at least `input_ids`
            and `attention_mask`, optionally `labels`.

    Returns:
        The batch produced by the language-modeling collator. Its `labels`
        are replaced by the stacked dataset labels when every example carries
        labels of the same shape as its `input_ids`; otherwise the collator's
        own labels (copy of `input_ids`, padding ignored) are kept.
    """
    dataset_labels = [
        torch.as_tensor(feature["labels"])
        for feature in features
        if "labels" in feature
    ]
    # Collate without the labels column so the wrapped collator only ever
    # sees the fields it knows how to pad.
    batch = _lm_collator(
        [{key: value for key, value in feature.items() if key != "labels"}
         for feature in features]
    )
    input_ids = batch["input_ids"]
    labels_are_token_aligned = len(dataset_labels) == len(features) and all(
        row_labels.shape == row_ids.shape
        for row_labels, row_ids in zip(dataset_labels, input_ids)
    )
    if labels_are_token_aligned:
        batch["labels"] = torch.stack(dataset_labels)
    return batch


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    eval_dataset=tokenized_evalset,
    tokenizer=tokenizer,
    data_collator=collate_keep_dataset_labels
)

  trainer = Trainer(


In [None]:
# Ask PyTorch to release its cached, currently unused GPU memory before training starts.
torch.cuda.empty_cache()

In [None]:
# Run the fine-tuning loop configured by `training_args` (evaluation and
# checkpointing every 50 steps, metrics reported to W&B).
trainer.train()

Step,Training Loss,Validation Loss
50,1.3477,1.319907
100,1.3441,1.306544
150,1.3763,1.301262
200,1.4382,1.298016
250,1.4154,1.288813
300,1.3324,1.288799
350,1.3305,1.282267
400,1.3402,1.276784
450,1.3686,1.272275
500,1.3393,1.267896


Step,Training Loss,Validation Loss
50,1.3477,1.319907
100,1.3441,1.306544
150,1.3763,1.301262
200,1.4382,1.298016
250,1.4154,1.288813
300,1.3324,1.288799
350,1.3305,1.282267
400,1.3402,1.276784
450,1.3686,1.272275
500,1.3393,1.267896


In [None]:
# Persist model and tokenizer side by side in a directory named after the run.
save_dir = f"./{run_name}"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Load fine-tuned model

In [None]:
# Directory written by the save cell
model_path = f"./{run_name}"

tokenizer = AutoTokenizer.from_pretrained(model_path)

# Reload in half precision with automatic device placement
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Evaluate fine-tuned model

In [None]:
# Work on the raw list of test samples (plain dicts) from here on.
multiple_choice_testset = test_dataset['multiple_choice']

# Build the un-templated question text for every sample, using the same
# layout as the training prompts ("<question>\nOptions:\n<key>. <text>\n...")
# so the fine-tuned model is evaluated on the format it was trained on.
# `sample` is the dict stored in the list, so it is updated directly; the
# previous `list.index(sample)` lookup was quadratic and resolved duplicate
# samples to their first occurrence.
for sample in multiple_choice_testset:
  question = sample.get('question')
  options = sample.get('options')
  option_lines = [
      f"{key}. {val}\n"
      for key, val in options.items()
      if val is not None
  ]
  sample['model_input'] = question + '\nOptions:\n' + ''.join(option_lines)

In [None]:
# Prompt template for the evaluation section. It has to stay identical to the
# template used when the training prompts were built.
prompt_style_multiple_choice = (
    '\n'
    '"{_question_var_}"\n'
    '\n'
    '### What is the correct answer? Please state only the letter:\n'
)

In [None]:
# Smoke test: generate an answer for the first test sample only.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
first_prompt = build_prompt(
    prompt_style_multiple_choice,
    multiple_choice_testset[0]['model_input'],
)
response = pipe(first_prompt)

In [None]:
# Show the raw generation next to the expected letter for the first sample.
first_sample = multiple_choice_testset[0]
pprint.pp(response)
print(first_sample['correct_answer'])

In [None]:
predictions_multiple_choice = []

# Build the generation pipeline once; it does not change between samples, so
# re-creating it inside the loop only added overhead.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Evaluate the first 50 test samples (fewer if the test set is smaller, which
# matches the `[:50]` slice used for the ground truth).
for i in range(min(50, len(multiple_choice_testset))):
  prompt = build_prompt(prompt_style_multiple_choice, multiple_choice_testset[i]['model_input'])
  response = pipe(prompt)

  predictions_multiple_choice.append(response)

In [None]:
import re

# The generated text echoes the prompt; the predicted letter is whatever
# A-E character follows the instruction line. "na" marks outputs with no match.
answer_pattern = re.compile(r'Please state only the letter:\s*\n*([A-E])\.*')

predicted_answer_multiple_choice = []
for generation in predictions_multiple_choice:
  found = answer_pattern.search(generation[0].get('generated_text'))
  predicted_answer_multiple_choice.append(found.group(1) if found else "na")

ground_truth = [entry['correct_answer'] for entry in multiple_choice_testset[:50]]

In [None]:
# Expected letters for the same first-50 slice that was predicted
reference_samples = multiple_choice_testset[:50]
ground_truth = [sample_entry['correct_answer'] for sample_entry in reference_samples]

# Score predictions against the references with the project's evaluation suite
evaluation_suite = EvaluationSuite()
evaluation_suite.evaluate_MC(predicted_answer_multiple_choice, ground_truth)