# Pre-tokenization & Processing

## Import

In [16]:
import os
import re
import functools
from colorama import Fore, Style
import textwrap
from jaxtyping import Float
import einops

import numpy as np
import pandas as pd

import torch
import transformer_lens
from sae_lens import SAE, PretokenizeRunner, PretokenizeRunnerConfig
from transformers import GPTNeoXForCausalLM, AutoTokenizer, AutoModelForCausalLM

from huggingface_hub import whoami, login, notebook_login, HfApi
from datasets import load_dataset, concatenate_datasets

import json
from tqdm import tqdm
from transformer_lens import HookedTransformer

import requests
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import io

from jaxtyping import Int
from torch import Tensor
from typing import List, Callable
from transformers import AutoTokenizer

In [27]:
from data_tools.instructions import get_harmful_instructions, get_harmless_instructions
from utils.templates import PYTHIA_TEMPLATE
from utils.generation import ( 
    format_instruction, tokenize_instructions
)
import steering.linear_probing as lp_steer
import refusal.linear_probing as lp_refuse
from evaluation.refusal import (
    get_refusal_scores, get_wildguard_refusal_score
)
from config import config

## Settings

In [17]:
# `!export ...` runs in a throwaway subshell, so the variable never reached this
# kernel. Set it on the kernel process itself instead.
# NOTE(review): Hugging Face libraries may read HF_HOME at import time; if so this
# cell has to run before the import cell to take effect — confirm.
os.environ["HF_HOME"] = "/share/tilman.kerl/huggingface"

In [28]:
# Authenticate this session with the Hugging Face Hub using the token from the project config.
login(config.credentials.hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [18]:
# Hub API client used for the folder uploads in this notebook.
# The token is read from the project config (the same source the login cell uses)
# instead of being hard-coded. The token that was previously committed here is
# exposed and should be revoked.
hf_api = HfApi(token=config.credentials.hf_token)

In [14]:
# Hugging Face model ids; going by the names, a base model and an instruction-tuned variant.
# NOTE(review): neither constant is referenced in the visible cells — confirm they are still needed.
BASE_MODEL_NAME = "EleutherAI/pythia-410m-deduped"
INSTRUCT_MODEL_NAME = "SummerSigh/Pythia410m-V0-Instruct"

### Helper Functions

In [23]:
def pretokenize_lmsys(model, repo_name, context_size=64, num_proc=4):
    """Pre-tokenize the processed LMSYS JSONL file and publish the result.

    The input is ``lmsys/lmsys-processed.jsonl``, read through the ``json``
    dataset loader. The tokenized dataset is saved locally under
    ``/share/tilman.kerl/`` and pushed to the Hub by the runner itself via
    ``hf_repo_id``.

    The runner's own push replaces a separate ``upload_folder`` of the
    ``save_path`` directory: a repo holding that raw on-disk folder fails to
    load with ``load_dataset`` elsewhere in this notebook
    (``DatasetGenerationCastError`` on ``state.json``).

    Args:
        model: Tokenizer name passed to ``PretokenizeRunnerConfig``.
        repo_name: Suffix appended to ``lmsys-chat-1m-english-tokenized-`` to
            form both the local folder name and the Hub repo name.
        context_size: Number of tokens per example (default 64).
        num_proc: Number of worker processes used by the runner (default 4).

    Returns:
        Whatever ``PretokenizeRunner(...).run()`` returns (the tokenized dataset).
    """
    # Build the full name once instead of overwriting the `repo_name` argument.
    full_name = f"lmsys-chat-1m-english-tokenized-{repo_name}"
    lmsys_cfg = PretokenizeRunnerConfig(
        tokenizer_name=model,
        dataset_path="json",
        data_dir="lmsys",
        data_files={"train": "lmsys-processed.jsonl"},
        shuffle=True,
        num_proc=num_proc,

        context_size=context_size,
        begin_batch_token="bos",
        begin_sequence_token=None,
        sequence_separator_token="eos",

        # keep a local copy of the tokenized dataset
        save_path=f"/share/tilman.kerl/{full_name}",
        # let the runner push the dataset to the Hub
        hf_repo_id=f"MisterXY89/{full_name}",
    )

    return PretokenizeRunner(lmsys_cfg).run()

SyntaxError: incomplete input (3600213305.py, line 1)

In [None]:
def pretokenize_pile(model, repo_name, context_size=64, num_proc=4):
    """Pre-tokenize the locally stored Pile CC dataset and publish the result.

    The tokenized dataset is saved locally under ``/share/tilman.kerl/`` and
    pushed to the Hub by the runner itself via ``hf_repo_id``.

    The runner's own push replaces a separate ``upload_folder`` of the
    ``save_path`` directory: a repo holding that raw on-disk folder fails to
    load with ``load_dataset`` elsewhere in this notebook
    (``DatasetGenerationCastError`` on ``state.json``).

    Args:
        model: Tokenizer name passed to ``PretokenizeRunnerConfig``.
        repo_name: Suffix appended to ``timaeus-pile-cc-tokenized-`` to form
            both the local folder name and the Hub repo name.
        context_size: Number of tokens per example (default 64).
        num_proc: Number of worker processes used by the runner (default 4).

    Returns:
        Whatever ``PretokenizeRunner(...).run()`` returns (the tokenized dataset).
    """
    # Build the full name once instead of overwriting the `repo_name` argument.
    full_name = f"timaeus-pile-cc-tokenized-{repo_name}"
    pile_cfg = PretokenizeRunnerConfig(
        tokenizer_name=model,
        dataset_path="/share/ilya.lasy/huggingface/datasets/timaeus___pile-pile-cc",
        shuffle=True,
        num_proc=num_proc,

        context_size=context_size,
        begin_batch_token="bos",
        begin_sequence_token=None,
        sequence_separator_token="eos",

        # keep a local copy of the tokenized dataset
        save_path=f"/share/tilman.kerl/{full_name}",
        # let the runner push the dataset to the Hub
        hf_repo_id=f"MisterXY89/{full_name}",
    )

    return PretokenizeRunner(pile_cfg).run()

## LM-SYS

### Processing

Loading & filtering

In [3]:
# Load the train split of the LMSYS-Chat-1M dataset.
lmsys_ds = load_dataset("lmsys/lmsys-chat-1m", split="train")

In [4]:
# Keep only rows whose "language" field equals "English"; rows lacking the field
# are dropped because .get() yields None for them.
lmsys_ds = lmsys_ds.filter(lambda x: x.get("language") == "English")

Filter:   0%|          | 0/1000000 [00:00<?, ? examples/s]

KeyboardInterrupt: 

The `PretokenizeRunner` used below reads a local JSONL file, so flatten each conversation into a single `text` field and save the filtered dataset to disk:

In [19]:
def flatten_conversation(ex):
    """Collapse a multi-turn conversation into one newline-separated string.

    Every turn in ``ex["conversation"]`` is rendered as ``"<ROLE>: <content>"``
    with the role upper-cased; a turn without a ``role`` key gets an empty
    role prefix.

    Returns:
        dict: ``{"text": ...}`` holding the rendered turns joined by newlines.
    """
    rendered_turns = [
        f"{message.get('role', '').upper()}: {message['content']}"
        for message in ex["conversation"]
    ]
    return {"text": "\n".join(rendered_turns)}

In [10]:
# Peek at the first record to check the conversation schema.
lmsys_ds[0]

{'conversation_id': '33f01939a744455c869cb234afca47f1',
 'model': 'wizardlm-13b',
 'conversation': [{'content': 'how can identity protection services help protect me against identity theft',
   'role': 'user'},
  {'content': "Identity protection services can help protect you against identity theft in several ways:\n\n1. Monitoring: Many identity protection services monitor your credit reports, public records, and other sources for signs of identity theft. If they detect any suspicious activity, they will alert you so you can take action.\n2. Credit freeze: Some identity protection services can help you freeze your credit, which makes it more difficult for thieves to open new accounts in your name.\n3. Identity theft insurance: Some identity protection services offer insurance that can help you recover financially if you become a victim of identity theft.\n4. Assistance: Many identity protection services offer assistance if you become a victim of identity theft. They can help you file a

In [12]:
# Replace every original column with the single "text" column built by flatten_conversation.
# Not re-runnable as-is: after this cell `lmsys_ds` no longer has a "conversation" column.
lmsys_ds = lmsys_ds.map(
    flatten_conversation,
    remove_columns=lmsys_ds.column_names,
)

Map:   0%|          | 0/777453 [00:00<?, ? examples/s]

In [16]:
# Dump the flattened conversations as JSON Lines: one {"text": ...} object per line.
os.makedirs("lmsys", exist_ok=True)
with open("lmsys/lmsys-processed.jsonl", "w", encoding="utf-8") as jsonl_out:
    jsonl_out.writelines(
        json.dumps({"text": record["text"]}, ensure_ascii=False) + "\n"
        for record in lmsys_ds
    )

### Pre-Tokenizing

In [None]:
# Tokenize the processed LMSYS data with the SmolLM2-135M tokenizer and upload it.
pretokenize_lmsys("HuggingFaceTB/SmolLM2-135M", "smollm-135M")
# Same run with the Pythia-410M tokenizer (currently disabled):
# pretokenize_lmsys("EleutherAI/pythia-410m-deduped", "pythia-410M")

In [21]:
# Inline version of the configuration that pretokenize_lmsys builds, fixed to the
# SmolLM2-135M tokenizer. Unlike the helper, it also sets hf_repo_id.
lmsys_cfg = PretokenizeRunnerConfig(
    tokenizer_name="HuggingFaceTB/SmolLM2-135M",
    dataset_path="json",
    data_dir="lmsys",
    data_files={"train": "lmsys-processed.jsonl"}, 
    shuffle=True,
    num_proc=4,  
    
    context_size=64,  # Adjust based on model requirements
    begin_batch_token="bos",
    begin_sequence_token=None,
    sequence_separator_token="eos",

    # Hub dataset repo the runner uploads to (active; remove to skip the upload)
    hf_repo_id="MisterXY89/lmsys-chat-1m-english-tokenized-smollm-135M",

    # Local directory the tokenized dataset is saved to (active; remove to skip saving)
    save_path="/share/tilman.kerl/lmsys-chat-1m-english-tokenized-smollm-135M"
)

In [22]:
# Run the pretokenization described by lmsys_cfg and keep the returned dataset.
lmsys_pretok = PretokenizeRunner(lmsys_cfg).run()

Generating train split: 0 examples [00:00, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

Map (num_proc=4):   0%|          | 0/777453 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/6661644 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/64 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/105 [00:00<?, ?ba/s]

In [24]:
# Upload the locally saved dataset folder to the Hub dataset repo.
# NOTE(review): lmsys_cfg already sets hf_repo_id to this same repo, so this may
# add the raw on-disk files next to what the runner pushed — confirm that this
# second upload is wanted.
hf_api.upload_folder(
    folder_path="/share/tilman.kerl/lmsys-chat-1m-english-tokenized-smollm-135M",
    repo_id="MisterXY89/lmsys-chat-1m-english-tokenized-smollm-135M",
    repo_type="dataset",
)

data-00000-of-00004.arrow:   0%|          | 0.00/433M [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

data-00001-of-00004.arrow:   0%|          | 0.00/433M [00:00<?, ?B/s]

data-00002-of-00004.arrow:   0%|          | 0.00/433M [00:00<?, ?B/s]

data-00003-of-00004.arrow:   0%|          | 0.00/433M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/MisterXY89/lmsys-chat-1m-english-tokenized-smollm-135M/commit/0dcb6de94fab45aa7720770fb9cd33ebaafb35ad', commit_message='Upload folder using huggingface_hub', commit_description='', oid='0dcb6de94fab45aa7720770fb9cd33ebaafb35ad', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/MisterXY89/lmsys-chat-1m-english-tokenized-smollm-135M', endpoint='https://huggingface.co', repo_type='dataset', repo_id='MisterXY89/lmsys-chat-1m-english-tokenized-smollm-135M'), pr_revision=None, pr_num=None)

## The Pile

### Processing

In [3]:
# Load the Pile CC data from a local dataset directory (all splits).
# NOTE(review): `ds` is reassigned to a different dataset in the MIX section.
ds = load_dataset("/share/ilya.lasy/huggingface/datasets/timaeus___pile-pile-cc")

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
# Show the available splits, features and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'meta'],
        num_rows: 100000
    })
})

### Pre-tokenization

In [22]:
# Tokenize the Pile CC data once per tokenizer (Pythia-410M and SmolLM2-135M).
pretok_pythia_pile = pretokenize_pile("EleutherAI/pythia-410m-deduped", "pythia-410M")
pretok_smollm_pile = pretokenize_pile("HuggingFaceTB/SmolLM2-135M", "smollm-135M")

## MIX 

In [37]:
# Hub repo ids of the two pre-tokenized datasets to mix: pre-training data
# (Pile CC) and instruction data (LMSYS).
# NOTE(review): these names differ in case/hyphenation from the ones the
# pretokenize_* helpers build (suffix "pythia-410M") — confirm they point at the
# intended repos.
pre_hf_path = "MisterXY89/timaeus-pile-cc-tokenized-pythia-410m"
ins_hf_path = "MisterXY89/lmsys-chat-1m-english-tokenized-pythia410m"

In [38]:
def create_mixed_dataset(pre_ds: str,
                         ins_ds: str,
                         ratio_pre: float,
                         ratio_ins: float = None,
                         split: str = "train",
                         seed: int = 42):
    """
    Load two pre-tokenized HuggingFace datasets and return a mixed Dataset with the specified ratio.

    The mixture is as large as the available data allows while keeping the
    requested proportions: the scarcer source (relative to its ratio) caps the
    total size. A ratio of 0 excludes that source entirely.

    Args:
        pre_ds (str): HF path to the pre-training dataset.
        ins_ds (str): HF path to the instruction dataset.
        ratio_pre (float): Proportion of pre-training data in the mix (e.g., 0.3 for 30%).
        ratio_ins (float, optional): Proportion of instruction data in the mix.
                                     If None, set to 1 - ratio_pre.
        split (str): Which dataset split to load from both sources (default: "train").
        seed (int): Random seed for shuffling.

    Returns:
        datasets.Dataset: A mixed and shuffled dataset containing both sources.

    Raises:
        ValueError: If either ratio is negative or both ratios are zero.
    """
    if ratio_ins is None:
        ratio_ins = 1.0 - ratio_pre

    # Reject ratios that cannot describe a mixture before doing any loading.
    if ratio_pre < 0 or ratio_ins < 0 or (ratio_pre + ratio_ins) <= 0:
        raise ValueError(
            f"Invalid mixing ratios: ratio_pre={ratio_pre}, ratio_ins={ratio_ins}"
        )

    # Load the same split from both sources. Without `split`, load_dataset returns
    # a DatasetDict, whose len() is the number of splits and which has no .select().
    ds_pre = load_dataset(pre_ds, split=split)
    ds_ins = load_dataset(ins_ds, split=split)
    ds_pre = ds_pre.shuffle(seed=seed)
    ds_ins = ds_ins.shuffle(seed=seed)

    # Largest total size each source can support at its ratio; a source with
    # ratio 0 imposes no limit (and would otherwise divide by zero).
    size_limits = []
    if ratio_pre > 0:
        size_limits.append(len(ds_pre) / ratio_pre)
    if ratio_ins > 0:
        size_limits.append(len(ds_ins) / ratio_ins)
    max_total = min(size_limits)

    # Determine sample sizes, clamped so float rounding can never exceed the data.
    size_pre = min(len(ds_pre), int(ratio_pre * max_total))
    size_ins = min(len(ds_ins), int(ratio_ins * max_total))

    # Select subsets
    mixed_pre = ds_pre.select(range(size_pre))
    mixed_ins = ds_ins.select(range(size_ins))

    # Concatenate and shuffle final mixture
    mixed = concatenate_datasets([mixed_pre, mixed_ins])
    return mixed.shuffle(seed=seed)


In [40]:
# mixed_ds = create_mixed_dataset(pre_hf_path, ins_hf_path, ratio_pre=0.3)

In [42]:
# Load the pre-tokenized Pile dataset on its own.
# NOTE(review): this fails with DatasetGenerationCastError — the json builder is
# reading state.json from the repo; presumably the repo holds raw save-to-disk
# files rather than a Hub-formatted dataset. Confirm and re-upload if so.
pre = load_dataset(pre_hf_path, split="train")

Generating train split: 0 examples [00:00, ? examples/s]

DatasetGenerationCastError: An error occurred while generating the dataset

All the data files must have the same columns, but at some point there are 7 new columns ({'_fingerprint', '_format_columns', '_split', '_format_type', '_data_files', '_format_kwargs', '_output_all_columns'}) and 13 missing columns ({'original_column_name', 'sae_lens_version', 'seed', 'begin_sequence_token', 'original_dataset', 'original_split', 'original_data_files', 'tokenizer_name', 'shuffled', 'sequence_separator_token', 'begin_batch_token', 'context_size', 'original_dataset_name'}).

This happened while the json dataset builder was generating data using

hf://datasets/MisterXY89/timaeus-pile-cc-tokenized-pythia-410m/state.json (at revision 7f69017e8668fc0c13dbf58c49e2e681ce0ef8cf)

Please either edit the data files to have matching columns, or separate them into different configurations (see docs at https://hf.co/docs/hub/datasets-manual-configuration#multiple-configurations)

In [5]:
# Load the pre-tokenized LMSYS dataset (same repo id as ins_hf_path) with all its splits.
ds = load_dataset("MisterXY89/lmsys-chat-1m-english-tokenized-pythia410m")

Downloading readme:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/417M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/417M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/417M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/417M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Token count of the first training example — presumably equal to the
# context_size used during pretokenization; verify.
len(ds["train"][0]["input_ids"])