In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/boolq-dataset/dev.jsonl
/kaggle/input/boolq-dataset/test.jsonl
/kaggle/input/boolq-dataset/train.jsonl


In [2]:
from kaggle_secrets import UserSecretsClient
# Client used to read secrets attached to this notebook.
user_secrets = UserSecretsClient()
# Value of the secret named "HF_TOKEN"; it is later handed to ModelWrapper,
# which forwards it to the `from_pretrained` calls as the auth token.
secret_value_0 = user_secrets.get_secret("HF_TOKEN")


In [3]:
import os
import json
from torch.utils.data import Dataset

class BoolQDataset(Dataset):
    """In-memory view of one dataset split stored as a JSON Lines file.

    Every item is a dict with the keys ``'idx'``, ``'question'``,
    ``'passage'`` and ``'label'`` (``'label'`` is read from the record's
    ``'answer'`` field). Fields missing from a record are stored as ``None``.
    """

    def __init__(self, base_dir, split):
        """
        Args:
            base_dir (str): Path to the base folder containing dataset splits.
            split (str): Dataset split to use ('train', 'test', or 'dev').

        Raises:
            FileNotFoundError: If ``<base_dir>/<split>.jsonl`` does not exist.
        """
        self.data_path = os.path.join(base_dir, f"{split}.jsonl")
        if not os.path.exists(self.data_path):
            raise FileNotFoundError(f"Dataset split file not found: {self.data_path}")
        self.data = self._load_data()

    def _load_data(self):
        """Loads data from the JSONL file.

        Blank lines (e.g. a trailing empty line) are skipped instead of being
        passed to the JSON parser. ``'idx'`` is the position of the record in
        the loaded dataset, so ``dataset[i]['idx'] == i`` always holds.

        Returns:
            list[dict]: One dict per record, in file order.
        """
        data = []
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(self.data_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate empty lines rather than crash in json.loads
                sample = json.loads(line)
                data.append({
                    'idx': len(data),
                    'question': sample.get('question', None),
                    'passage': sample.get('passage', None),
                    'label': sample.get('answer', None),
                })
        return data

    def __len__(self):
        """Returns the number of loaded records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Returns one record, or a list of records for a slice.

        Args:
            idx (int | slice): Record position or slice of positions.

        Raises:
            IndexError: If an integer ``idx`` is negative or past the end.
                Negative indexing is deliberately not supported.
        """
        if isinstance(idx, slice):
            return [self._get_item(i) for i in range(*idx.indices(len(self)))]

        if isinstance(idx, int) and not 0 <= idx < len(self.data):
            raise IndexError(f"Index {idx} is out of range.")

        return self._get_item(idx)

    def _get_item(self, idx):
        """Returns the stored record dict at position ``idx`` (not a copy)."""
        return self.data[idx]

import random

class DatasetWrapper:
    """Builds the dataset selected by a tag and offers sampling helpers."""

    def __init__(self, dataset_tag, base_dir, split):
        """
        Args:
            dataset_tag (str): Which dataset to build ('boolq' or 'gsm8k').
            base_dir (str): Folder containing the split files.
            split (str): Name of the split to load.

        Raises:
            ValueError: If ``dataset_tag`` is not one of the supported tags.
        """
        if dataset_tag == "boolq":
            self.dataset = BoolQDataset(
                base_dir=base_dir,
                split=split
            )

        elif dataset_tag == "gsm8k":
            # NOTE(review): GSM8KDataset is not defined in the visible part of
            # this notebook; this branch raises NameError unless it is
            # defined elsewhere before use.
            self.dataset = GSM8KDataset(
                base_dir=base_dir,
                split=split
            )

        else:
            raise ValueError(f"Unsupported dataset_tag: {dataset_tag}")

    def __len__(self):
        """Returns the number of items in the wrapped dataset."""
        return len(self.dataset)

    def get_dataset(self):
        """Returns the wrapped dataset object."""
        return self.dataset

    def get_random_samples(self, num_samples, seed=None):
        """Get a list of random samples from the dataset.

        Args:
            num_samples (int): How many items to draw; capped at the dataset size.
            seed (int | None): When given, the draw is reproducible. A private
                generator is used so the global ``random`` state is left
                untouched; it yields the same samples as seeding the global
                generator with ``seed`` would.

        Returns:
            list: The sampled items, drawn without replacement.
        """
        rng = random if seed is None else random.Random(seed)

        num_samples = min(num_samples, len(self.dataset))
        indices = rng.sample(range(len(self.dataset)), num_samples)
        return [self.dataset[i] for i in indices]


In [7]:
import torch
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Short aliases mapped to the model identifiers that are passed to
# ModelWrapper (which forwards them to `from_pretrained`).
models = {
    "qwen2.5_1.5b": "Qwen/Qwen2.5-1.5B-Instruct",
    "llama3.2_3b": "meta-llama/Llama-3.2-3B-Instruct",
}


class ModelWrapper:
    """Loads a causal LM with its tokenizer and exposes a text-generation pipeline."""

    def __init__(self, model_name, secret_value_0=None, max_new_tokens=20, temperature=0.001):
        """
        Args:
            model_name (str): Model identifier passed to ``from_pretrained``.
            secret_value_0 (str | None): Access token forwarded to
                ``from_pretrained``; ``None`` sends no explicit token.
            max_new_tokens (int): Generation length limit passed to the pipeline.
            temperature (float): Sampling temperature passed to the pipeline.
                The very low default was presumably chosen to keep outputs
                close to deterministic.
        """
        # Hyperparams
        self.max_new_tokens = max_new_tokens
        self.temperature = temperature

        self.model_name = model_name
        # `token` replaces the deprecated `use_auth_token` keyword.
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, token=secret_value_0)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = AutoModelForCausalLM.from_pretrained(self.model_name, token=secret_value_0).to(device)

        self.pipe = pipeline(
            task="text-generation",
            device=device,
            model=self.model,
            # Was `self.tokenize`, an attribute that does not exist.
            tokenizer=self.tokenizer
        )

    def generate(self, messages):
        """
        Runs the text-generation pipeline on a chat-style message list.

        messages format:
        [
            {"role": "user", "content": "Who are you?"},
            ...
        ]

        Returns:
            The pipeline's raw output, unmodified.
        """
        output = self.pipe(
            messages,
            max_new_tokens=self.max_new_tokens,
            temperature=self.temperature
        )

        return output


In [None]:
# Base name of the results file; ".csv" is appended when the file is opened.
OUTPUT_FILENAME = "boolqllama"
# Load the BoolQ "train" split from the Kaggle input directory.
dataset = DatasetWrapper("boolq", "/kaggle/input/boolq-dataset", "train").get_dataset()
# Build the wrapper for the model registered as "llama3.2_3b", passing the HF token.
model = ModelWrapper(models["llama3.2_3b"],secret_value_0)



tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

In [None]:
import csv
from tqdm import tqdm

columns = ["idx", "output"]

# Run the model over every sample and store one (idx, output) row per sample.
# The context manager guarantees that buffered rows are flushed and the file
# is closed even if generation fails midway (the file was previously never closed).
with open(f"{OUTPUT_FILENAME}.csv", "w", newline='', encoding="utf-8") as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=",")
    csv_writer.writerow(columns)

    for data in tqdm(dataset[:]):

        question = data["question"]
        passage = data["passage"]

        prompt = (
            "You are given the following context:\n"
            f"\n{passage}\n"
            "Answer the given question as only 'true' or 'false':\n"
            f"{question}\n"
        )

        messages = [
            {"role": "user", "content": prompt}
        ]

        output = model.generate(messages)
        # The last message of the returned conversation holds the model's reply.
        extracted_output = output[0]["generated_text"][-1]["content"]

        csv_writer.writerow([data["idx"], extracted_output])
        # Echo every 100th answer as a lightweight progress/sanity check.
        if data["idx"]%100 == 0:
            print(extracted_output)

    