[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gsarti/ik-nlp-tutorials/blob/main/notebooks/W6T_Advanced_Prompting_Generation.ipynb)

In [None]:
# Run in Notebook to install local packages
!pip install torch transformers bitsandbytes accelerate rank_bm25 outlines datasets

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting outlines
  Downloading outlines-0.2.3-py3-none-any.whl.metadata (18 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-man

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hugging Face Hub id of the instruction-tuned model used for every experiment.
checkpoint = "Qwen/Qwen2.5-1.5B-Instruct"

# 8-bit weight quantization via bitsandbytes, chosen to reduce VRAM usage.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# The tokenizer is loaded first, then the quantized model is placed on the GPU.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map="cuda",
    quantization_config=bnb_config,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [None]:
# Load in Sciq dataset
import pandas as pd

# Relative parquet paths of the three SciQ splits inside the Hub dataset repo.
splits = {
    "train": "data/train-00000-of-00001.parquet",
    "validation": "data/validation-00000-of-00001.parquet",
    "test": "data/test-00000-of-00001.parquet",
}

# Read the splits in train / validation / test order.
df_train, df_val, df_test = (
    pd.read_parquet("hf://datasets/allenai/sciq/" + splits[split_name])
    for split_name in ("train", "validation", "test")
)

# Evaluation pool: train + validation stacked with a fresh 0..n-1 index.
# df_test is loaded above but is not part of df_all.
df_all = pd.concat([df_train, df_val], ignore_index=True)


In [None]:
# Base experiments: every row of df_all is asked as a four-option
# multiple-choice question and the first character of the model's reply is
# stored in a column named after the experiment.
#   names ending in "_A"          -> the correct answer is listed as option A
#   all other names               -> the correct answer is listed as option D
#   baseline_answer*              -> system prompt only
#   question_context*             -> system prompt + this row's support text
#   previous_questions_context*   -> as above, plus the support text of the
#                                    two previously processed rows
experiments = ["baseline_answer", "question_context","previous_questions_context", "baseline_answer_A", "question_context_A","previous_questions_context_A"]
original_context = "You are a QA system that only answers with a singular letter as an answer"

# Seed the "previous rows" with the last two rows of df_all. This is the state
# the loop is in after any full pass over the data, so the results are the
# same as before, but the cell no longer depends on a context-free experiment
# running first to replace the initial None values.
# Requires df_all to have at least two rows.
prev_row = df_all.iloc[-1]
prev2_row = df_all.iloc[-2]

# NOTE(review): generate() below uses the checkpoint's default generation
# config. If that config enables sampling, answers differ between runs --
# consider do_sample=False or a fixed seed when reproducibility matters.
for experiment in experiments:
    correct_first = experiment.endswith("_A")
    answers = []

    for _, row in df_all.iterrows():
        if correct_first:
            to_ask = (
                row["question"] +
                " A " + row["correct_answer"] +
                ", B " + row["distractor2"] +
                ", C " + row["distractor1"] +
                ", D " + row["distractor3"]
            )
        else:
            to_ask = (
                row["question"] +
                " A " + row["distractor3"] +
                ", B " + row["distractor2"] +
                ", C " + row["distractor1"] +
                ", D " + row["correct_answer"]
            )

        # NOTE(review): the context pieces are concatenated without any
        # separator. This is kept byte-for-byte so results stay comparable
        # with the other experiment cells.
        if experiment.startswith("question_context"):
            context = original_context + row["support"]
        elif experiment.startswith("previous_questions_context"):
            context = (
                original_context +
                row["support"] +
                prev_row["support"] +
                prev2_row["support"]
            )
        else:
            context = original_context

        # Slide the two-row history forward for the next iteration.
        prev2_row = prev_row
        prev_row = row

        messages = [
            {"role": "system", "content": context},
            {"role": "user", "content": to_ask},
        ]

        # return_dict=True also yields the attention mask, which generate()
        # cannot infer for this tokenizer (pad token == eos token).
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=128)

        # Decode only the newly generated tokens, without special tokens, so
        # neither the prompt nor an end-of-turn marker can be mistaken for the
        # answer. An empty reply is stored as "" instead of raising.
        prompt_length = inputs["input_ids"].shape[-1]
        reply = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
        answers.append(reply[0] if reply else "")

    # One assignment per experiment; answers are in df_all row order.
    df_all[experiment] = answers








In [None]:
import pandas as pd
# Wrap df_all in a DataFrame named `df` and write it to CSV in the working
# directory (the row index is written as the first column, the pandas default).
df = pd.DataFrame(data=df_all)
df.to_csv(path_or_buf="qwen_med.csv")


                                            question        distractor3  \
0  What type of organism is commonly used in prep...            viruses   
1  What phenomenon makes global winds blow northe...    tropical effect   
2  Changes from a less-ordered state to a more-or...        endothermic   
3     What is the least dangerous radioactive decay?         zeta decay   
4  Kilauea in hawaii is the world’s most continuo...              magma   
5  When a meteoroid reaches earth, what is the re...              orbit   
6  What kind of a reaction occurs when a substanc...  nitrogen reaction   
7  Organisms categorized by what species descript...    species complex   
8                  Alpha emission is a type of what?              light   
9          What is the stored food in a seed called?             larval   

          distractor1         distractor2        correct_answer  \
0            protozoa         gymnosperms  mesophilic organisms   
1         muon effect  centrifugal effec

In [None]:
# EXTRA EXPERIMENTS AFTER EVALUATION!!!
#
# Each experiment asks every row of df_all as a four-option multiple-choice
# question and stores the first character of the model's reply in a column
# named after the experiment.
#   names ending in "_A" -> the correct answer is listed as option A
#   all other names      -> the correct answer is listed as option D
#   harder_context*      -> this row's support followed by the first 50
#                           support texts from harder_context.csv
#   position_middle      -> this row's support placed between two supports
#                           taken from the sliding window of earlier rows
#   position_end         -> this row's support placed after those two supports
experiments = ["harder_context", "harder_context_A", "position_middle", "position_end"]
original_context = "You are a QA system that only answers with a singular letter as an answer"
harder_context = pd.read_csv("harder_context.csv")[:50]

# Loop-invariant: the extra context is identical for every row, so build it once.
harder_support_text = "".join(str(support) for support in harder_context["support"].tolist())

# Sliding window (oldest first) of the 20 most recently processed support
# texts. It is seeded with the last 20 rows of df_all, which is exactly what
# the window holds after any full pass, so position_* results are unchanged
# while the window never contains None.
# Requires df_all to have at least two rows (indices 0 and 1 are read).
prev_rows = df_all["support"].tail(20).tolist()

# (experiment, row index) pairs for which the model produced an empty reply.
tf = []

# NOTE(review): generate() below uses the checkpoint's default generation
# config. If that config enables sampling, answers differ between runs --
# consider do_sample=False or a fixed seed when reproducibility matters.
for experiment in experiments:
    correct_first = experiment.endswith("_A")
    answers = []

    for idx, row in df_all.iterrows():
        if correct_first:
            to_ask = (
                row["question"] +
                " A " + row["correct_answer"] +
                ", B " + row["distractor2"] +
                ", C " + row["distractor1"] +
                ", D " + row["distractor3"]
            )
        else:
            to_ask = (
                row["question"] +
                " A " + row["distractor3"] +
                ", B " + row["distractor2"] +
                ", C " + row["distractor1"] +
                ", D " + row["correct_answer"]
            )

        # NOTE(review): the context pieces are concatenated without any
        # separator. This is kept byte-for-byte so results stay comparable
        # with the other experiment cells.
        if "harder_" in experiment:
            context = original_context + row["support"] + harder_support_text
        elif "_middle" in experiment:
            context = original_context + prev_rows[0] + row["support"] + prev_rows[1]
        elif "_end" in experiment:
            context = original_context + prev_rows[0] + prev_rows[1] + row["support"]
        else:
            # Fail loudly instead of silently reusing the previous row's context.
            raise ValueError(f"Unknown experiment name: {experiment!r}")

        # Slide the window forward: drop the oldest support, add the current one.
        prev_rows.pop(0)
        prev_rows.append(row["support"])

        messages = [
            {"role": "system", "content": context},
            {"role": "user", "content": to_ask},
        ]

        # return_dict=True also yields the attention mask, which generate()
        # cannot infer for this tokenizer (pad token == eos token).
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=128)

        # Decode only the newly generated tokens, without special tokens, so
        # neither the prompt nor an end-of-turn marker can be mistaken for the
        # answer.
        prompt_length = inputs["input_ids"].shape[-1]
        reply = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

        if reply:
            answers.append(reply[0])
        else:
            answers.append("")
            tf.append((experiment, idx))

    # One assignment per experiment; answers are in df_all row order.
    df_all[experiment] = answers


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


[1;30;43mStreaminguitvoer ingekort tot de laatste 5000 regels.[0m
position_end 7679
position_end 7680
position_end 7681
position_end 7682
position_end 7683
position_end 7684
position_end 7685
position_end 7686
position_end 7687
position_end 7688
position_end 7689
position_end 7690
position_end 7691
position_end 7692
position_end 7693
position_end 7694
position_end 7695
position_end 7696
position_end 7697
position_end 7698
position_end 7699
position_end 7700
position_end 7701
position_end 7702
position_end 7703
position_end 7704
position_end 7705
position_end 7706
position_end 7707
position_end 7708
position_end 7709
position_end 7710
position_end 7711
position_end 7712
position_end 7713
position_end 7714
position_end 7715
position_end 7716
position_end 7717
position_end 7718
position_end 7719
position_end 7720
position_end 7721
position_end 7722
position_end 7723
position_end 7724
position_end 7725
position_end 7726
position_end 7727
position_end 7728
position_end 7729
position_end 7

In [None]:

import pandas as pd
# Wrap df_all in a DataFrame named `df` and write it to CSV in the working
# directory (the row index is written as the first column, the pandas default).
df = pd.DataFrame(data=df_all)
df.to_csv(path_or_buf="extra_qwen_med_1.csv")


In [None]:
# EXTRA EXPERIMENTS AFTER EVALUATION!!!
#
# Each experiment asks every row of df_all as a four-option multiple-choice
# question and stores the first character of the model's reply in a column
# named after the experiment.
#   names ending in "_A" -> the correct answer is listed as option A
#   all other names      -> the correct answer is listed as option D
#   position_middle_A    -> this row's support placed between two supports
#                           taken from the sliding window of earlier rows
#   position_end_A       -> this row's support placed after those two supports
#   prev_20*             -> this row's support followed by all 20 supports in
#                           the sliding window
experiments = ["position_middle_A", "position_end_A", "prev_20", "prev_20_A"]
original_context = "You are a QA system that only answers with a singular letter as an answer"

# Sliding window (oldest first) of the 20 most recently processed support
# texts. It was previously initialised with 20 None values, which made the
# very first row of "position_middle_A" fail with a TypeError (str + None).
# Seeding it with the last 20 rows of df_all gives the same window the loop
# holds after any full pass over the data.
# Requires df_all to have at least 20 rows for the prev_20 experiments.
prev_rows = df_all["support"].tail(20).tolist()

# (experiment, row index) pairs for which the model produced an empty reply.
tf = []

# NOTE(review): generate() below uses the checkpoint's default generation
# config. If that config enables sampling, answers differ between runs --
# consider do_sample=False or a fixed seed when reproducibility matters.
for experiment in experiments:
    correct_first = experiment.endswith("_A")
    answers = []

    for idx, row in df_all.iterrows():
        if correct_first:
            to_ask = (
                row["question"] +
                " A " + row["correct_answer"] +
                ", B " + row["distractor2"] +
                ", C " + row["distractor1"] +
                ", D " + row["distractor3"]
            )
        else:
            to_ask = (
                row["question"] +
                " A " + row["distractor3"] +
                ", B " + row["distractor2"] +
                ", C " + row["distractor1"] +
                ", D " + row["correct_answer"]
            )

        # NOTE(review): the context pieces are concatenated without any
        # separator. This is kept byte-for-byte so results stay comparable
        # with the other experiment cells.
        if "_middle" in experiment:
            context = original_context + prev_rows[0] + row["support"] + prev_rows[1]
        elif "_end" in experiment:
            context = original_context + prev_rows[0] + prev_rows[1] + row["support"]
        elif experiment.startswith("prev_20"):
            # Current support first, then the whole window from oldest to newest.
            context = original_context + row["support"] + "".join(prev_rows)
        else:
            # Fail loudly instead of silently picking an unintended context.
            raise ValueError(f"Unknown experiment name: {experiment!r}")

        # Slide the window forward: drop the oldest support, add the current one.
        prev_rows.pop(0)
        prev_rows.append(row["support"])

        messages = [
            {"role": "system", "content": context},
            {"role": "user", "content": to_ask},
        ]

        # return_dict=True also yields the attention mask, which generate()
        # cannot infer for this tokenizer (pad token == eos token).
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=128)

        # Decode only the newly generated tokens, without special tokens, so
        # neither the prompt nor an end-of-turn marker can be mistaken for the
        # answer.
        prompt_length = inputs["input_ids"].shape[-1]
        reply = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

        if reply:
            answers.append(reply[0])
        else:
            answers.append("")
            tf.append((experiment, idx))

    # One assignment per experiment; answers are in df_all row order.
    df_all[experiment] = answers


In [None]:

import pandas as pd
# Wrap df_all in a DataFrame named `df` and write it to CSV in the working
# directory (the row index is written as the first column, the pandas default).
df = pd.DataFrame(data=df_all)
df.to_csv(path_or_buf="extra_qwen_med_2.csv")
