This notebook includes one classifier and two generators. It’s just a test version, not the final solution. To run it, need a folder called *dataset_jsonl* containing *test.jsonl*, *valid.jsonl*, and *train.jsonl*.

In [1]:
!uv pip install -U keras keras-hub tensorflow


[2mUsing Python 3.12.12 environment at: /usr[0m
[2K[2mResolved [1m43 packages[0m [2min 538ms[0m[0m
[2mAudited [1m43 packages[0m [2min 1ms[0m[0m


In [2]:
import tensorflow as tf
tf.config.list_physical_devices()


  if not hasattr(np, "object"):


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

This notebook runs on Google Colab, so it need to mount Google Drive, ignore it if running it locally


In [3]:
from google.colab import drive
drive.mount("/content/drive")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


1model for classifier, 2 models for generator

In [4]:
import os, json, random
import numpy as np
import tensorflow as tf
import keras
import keras_hub
import json, os

In [5]:
dir_data = "/content/drive/MyDrive/dataset_jsonl"
train_js = os.path.join(dir_data, "train.jsonl")
valid_js = os.path.join(dir_data, "valid.jsonl")
test_js  = os.path.join(dir_data, "test.jsonl")

dir_out = "generator_3models"
os.makedirs(dir_out, exist_ok=True)

SEED = 1234
random.seed(SEED); np.random.seed(SEED); tf.random.set_seed(SEED)

max_len = 256
batch_size = 16
epochs_cls = 3
epochs_gen = 3
lr = 3e-4

d_model= 128
num_heads= 2
num_layers = 4
ff_dim = 512
dropout = 0.1

PAD, UNK, BOS, EOS = 0, 1, 2, 3
special = ["[PAD]", "[UNK]", "[BOS]", "[EOS]"]


In [6]:
def iter_jsonl(path):
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)


In [7]:
from tensorflow.keras.layers import TextVectorization

In [8]:
def custom_standardize(s):
    s = tf.strings.regex_replace(s, r"[ \t]", "")
    s = tf.strings.regex_replace(s, "　", "")
    return s

def make_text_dataset(jsonl_paths):
    def gen():
        for p in jsonl_paths:
            for x in iter_jsonl(p):
                yield (x["prompt"] + x["target"])
    return tf.data.Dataset.from_generator(
        gen, output_signature=tf.TensorSpec(shape=(), dtype=tf.string)
    )


In [9]:
tmp_vec = TextVectorization(
    standardize=custom_standardize,
    split="character",
    output_mode="int",
)

text_ds = make_text_dataset([train_js, valid_js])
tmp_vec.adapt(text_ds)

# the first 2 params of 'tmp_vec.get_vocabulary()' ：
# idx0: ""(padding), idx1: "[UNK]"(OOV)
base_chars = tmp_vec.get_vocabulary()[2:]

FINAL_VOCAB = ["[BOS]", "[EOS]"] + base_chars

vec = TextVectorization(
    standardize=custom_standardize,
    split="character",
    output_mode="int",
    vocabulary=FINAL_VOCAB,
    ragged=True
)

VOCAB = vec.get_vocabulary()
vocab_size = len(VOCAB)
token2id = {t:i for i,t in enumerate(VOCAB)}
PAD, UNK = 0, 1
BOS, EOS = 2, 3

with open(os.path.join(dir_out, "vocab.json"), "w", encoding="utf-8") as f:
    json.dump({"vocab": FINAL_VOCAB}, f, ensure_ascii=False)

print("vocab_size =", len(vec.get_vocabulary()))

vocab_size = 4987


In [10]:
def encode_text_tv(text: str) -> list[int]:
    ids = vec(tf.constant([text])).values.numpy().tolist()
    return ids

In [11]:
def decode_ids_tv(ids):
    out = []
    for i in ids:
        if i == PAD or i == BOS:
            continue
        if i == EOS:
            break
        out.append(VOCAB[i])
    return "".join(out)

In [12]:
def pad_to_max(ids, max_len):
    ids = ids[:max_len]
    ids = ids + [PAD] * (max_len - len(ids))
    return np.array(ids, dtype=np.int32)

In [13]:
def make_cls_example(prompt, label):
    prompt_ids = encode_text_tv(prompt)

    seq = [BOS] + prompt_ids + [EOS]
    seq = seq[:max_len]
    if len(seq) > 0:
        seq[-1] = EOS

    token_ids = pad_to_max(seq, max_len)
    prompt_last = min(len(prompt_ids), max_len - 1)

    return token_ids, np.int32(label), np.int32(prompt_last)

def make_cls_dataset(jsonl_path, training=True):
    def gen():
        for x in iter_jsonl(jsonl_path):
            label = int(x["type"])  # 0/1
            token_ids, y, prompt_last = make_cls_example(x["prompt"], label)
            yield token_ids, y, prompt_last

    sig = (
        tf.TensorSpec((max_len,), tf.int32),
        tf.TensorSpec((), tf.int32),
        tf.TensorSpec((), tf.int32),
    )
    ds = tf.data.Dataset.from_generator(gen, output_signature=sig)

    def pack(token_ids, y, prompt_last):
        return {"token_ids": token_ids, "prompt_last": prompt_last}, y

    ds = ds.map(pack, num_parallel_calls=tf.data.AUTOTUNE)
    if training:
        ds = ds.shuffle(2048)
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds

cls_train = make_cls_dataset(train_js, training=True)
cls_valid = make_cls_dataset(valid_js, training=False)


In [14]:
def make_lm_example(prompt, target):
    prompt_ids = encode_text_tv(prompt)
    target_ids = encode_text_tv(target)

    seq = [BOS] + prompt_ids + target_ids + [EOS]
    seq = seq[:max_len]
    if len(seq) > 0:
        seq[-1] = EOS

    token_ids = pad_to_max(seq, max_len)

    lm = seq[1:] + [PAD]
    lm = lm[:max_len]
    lm_labels = pad_to_max(lm, max_len)
    prompt_end = 1 + len(prompt_ids)
    w = [0.0] * (prompt_end - 1) + [1.0] * (len(seq) - (prompt_end - 1))
    lm_w = pad_to_max(w, max_len).astype("float32")

    return token_ids, lm_labels, lm_w


In [15]:
def make_gen_dataset(jsonl_path, target_type, training=True):
    def gen():
        for x in iter_jsonl(jsonl_path):
            if int(x["type"]) != int(target_type):
                continue
            token_ids, lm_labels, lm_w = make_lm_example(x["prompt"], x["target"])
            yield token_ids, lm_labels, lm_w

    sig = (
        tf.TensorSpec((max_len,), tf.int32),
        tf.TensorSpec((max_len,), tf.int32),
        tf.TensorSpec((max_len,), tf.float32),
    )
    ds = tf.data.Dataset.from_generator(gen, output_signature=sig)

    def pack(token_ids, lm_labels, lm_w):
        return token_ids, lm_labels, lm_w

    ds = ds.map(pack, num_parallel_calls=tf.data.AUTOTUNE)
    if training:
        ds = ds.shuffle(2048)
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds


In [16]:
gen0_train = make_gen_dataset(train_js, 0, True)
gen0_valid = make_gen_dataset(valid_js, 0, False)

gen1_train = make_gen_dataset(train_js, 1, True)
gen1_valid = make_gen_dataset(valid_js, 1, False)

In [17]:
def build_classifier(vocab_size):
    token_ids = keras.Input((max_len,), dtype="int32", name="token_ids")
    prompt_last = keras.Input((), dtype="int32", name="prompt_last")

    pad_mask = keras.ops.not_equal(token_ids, PAD)

    tok_emb = keras.layers.Embedding(vocab_size, d_model)
    pos_emb = keras.layers.Embedding(max_len, d_model)
    positions = keras.ops.arange(0, max_len)[None, :]
    x = tok_emb(token_ids) + pos_emb(positions)
    x = keras.layers.Dropout(dropout)(x)

    for _ in range(num_layers):
        x = keras_hub.layers.TransformerDecoder(
            intermediate_dim=ff_dim,
            num_heads=num_heads,
            dropout=dropout,
            normalize_first=True,
        )(x, decoder_padding_mask=pad_mask, use_causal_mask=False)

    h = keras.ops.take_along_axis(
        x,
        keras.ops.expand_dims(keras.ops.expand_dims(prompt_last, -1), -1),
        axis=1
    )
    h = keras.ops.squeeze(h, axis=1)
    logits = keras.layers.Dense(2)(h)

    return keras.Model({"token_ids": token_ids, "prompt_last": prompt_last}, logits)

clf = build_classifier(vocab_size)
clf.compile(
    optimizer=keras.optimizers.AdamW(lr),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")]
)
clf.summary()


In [18]:
ckpt_clf = os.path.join(dir_out, "classifier.keras")
cb = keras.callbacks.ModelCheckpoint(ckpt_clf, monitor="val_loss", save_best_only=True)
clf.fit(cls_train, validation_data=cls_valid, epochs=epochs_cls, callbacks=[cb])
print("saved:", ckpt_clf)


Epoch 1/3
    109/Unknown [1m54s[0m 152ms/step - acc: 0.8665 - loss: 0.5393

  if self._should_save_model(epoch, batch, logs, filepath):


[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 161ms/step - acc: 0.9529 - loss: 0.1833
Epoch 2/3
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 21ms/step - acc: 0.9989 - loss: 0.0040
Epoch 3/3
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 30ms/step - acc: 1.0000 - loss: 2.7060e-05
saved: generator_3models/classifier.keras


In [19]:
def build_generator(vocab_size):
    token_ids = keras.Input((max_len,), dtype="int32", name="token_ids")
    pad_mask = keras.ops.not_equal(token_ids, PAD)

    tok_emb = keras.layers.Embedding(vocab_size, d_model)
    pos_emb = keras.layers.Embedding(max_len, d_model)
    positions = keras.ops.arange(0, max_len)[None, :]
    x = tok_emb(token_ids) + pos_emb(positions)
    x = keras.layers.Dropout(dropout)(x)

    for _ in range(num_layers):
        x = keras_hub.layers.TransformerDecoder(
            intermediate_dim=ff_dim,
            num_heads=num_heads,
            dropout=dropout,
            normalize_first=True,
        )(x, decoder_padding_mask=pad_mask, use_causal_mask=True)

    lm_logits = keras.layers.Dense(vocab_size)(x)
    return keras.Model(token_ids, lm_logits)

def compile_generator(m):
    m.compile(
        optimizer=keras.optimizers.AdamW(lr),
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    )
    return m

gen0 = compile_generator(build_generator(vocab_size))
gen1 = compile_generator(build_generator(vocab_size))


In [20]:
ckpt_g0 = os.path.join(dir_out, "gen_type0.keras")
ckpt_g1 = os.path.join(dir_out, "gen_type1.keras")

cb0 = keras.callbacks.ModelCheckpoint(ckpt_g0, monitor="val_loss", save_best_only=True)
cb1 = keras.callbacks.ModelCheckpoint(ckpt_g1, monitor="val_loss", save_best_only=True)

gen0.fit(gen0_train, validation_data=gen0_valid, epochs=epochs_gen, callbacks=[cb0])
print("saved:", ckpt_g0)

gen1.fit(gen1_train, validation_data=gen1_valid, epochs=epochs_gen, callbacks=[cb1])
print("saved:", ckpt_g1)


Epoch 1/3
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 187ms/step - loss: 0.0000e+00
Epoch 2/3
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 30ms/step - loss: 0.0000e+00
Epoch 3/3
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 27ms/step - loss: 0.0000e+00
saved: generator_3models/gen_type0.keras
Epoch 1/3
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 2s/step - loss: 0.0000e+00
Epoch 2/3
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 67ms/step - loss: 0.0000e+00
Epoch 3/3
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 62ms/step - loss: 0.0000e+00
saved: generator_3models/gen_type1.keras


In [21]:
def classify_type(prompt_text):
    p = encode_text_tv(prompt_text)
    seq = [BOS] + p + [EOS]
    if len(seq) > max_len:
        seq = seq[:max_len]; seq[-1]=EOS
    prompt_last = min(len(p), max_len-1)
    x = {"token_ids": pad_to_max(seq, max_len)[None, :],
         "prompt_last": np.array([prompt_last], np.int32)}
    logits = clf.predict(x, verbose=0)[0]
    return int(np.argmax(logits))

In [26]:
def _softmax(x):
    x = x - np.max(x)
    p = np.exp(x)
    return p / np.sum(p)

def _sample_top_p(probs, top_p=0.9):
    idx = np.argsort(probs)[::-1]
    p_sorted = probs[idx]
    cumsum = np.cumsum(p_sorted)
    cut = np.searchsorted(cumsum, top_p) + 1
    keep = idx[:cut]
    p = probs[keep]
    p = p / p.sum()
    return int(np.random.choice(keep, p=p))

def generate_topp_no_nl(gen_model, prompt_text, max_new=200,
                        temperature=0.9, top_p=0.9,
                        min_new_tokens=40, ban_nl_steps=200):
    p = encode_text_tv(prompt_text)
    ids = [BOS] + p
    start_len = len(ids)

    if start_len >= max_len - 2:
        ids = ids[:max_len - 2]
        start_len = len(ids)

    nl_id = token2id.get("\n", None)
    for step in range(max_new):
        x = pad_to_max(ids, max_len)[None, :]
        logits = gen_model.predict(x, verbose=0)[0]
        pos = min(len(ids) - 1, max_len - 1)
        vec = logits[pos].astype(np.float64) / max(1e-6, temperature)

        if step < min_new_tokens:
            vec[EOS] = -1e30

        if nl_id is not None and step < ban_nl_steps:
            vec[nl_id] = -1e30

        probs = _softmax(vec)
        next_id = _sample_top_p(probs, top_p=top_p)

        ids.append(next_id)
        if next_id == EOS or len(ids) >= max_len:
            break

    return decode_ids_tv(ids[start_len:])


In [27]:
def pipeline(prompt_text, max_new=200):
    t = classify_type(prompt_text)
    gen_model = gen0 if t == 0 else gen1
    cont = generate_topp_no_nl(gen_model, prompt_text, max_new=max_new)
    return t, cont

In [28]:
prompt_text = "雄無所爭固可想像其勢髣髴其形若乃足縈虹蜺 目耀日月連軒沓拖揮霍翕忽噴氣則六合生雲灑 毛則千里飛雪邈彼北荒將窮南圖運逸翰以傍擊"

In [32]:
t, cont = pipeline(prompt_text, 120)
print("pred_type =", t)
print("repr(cont[:120]) =", repr(cont[:30]))
print(cont)

pred_type = 0
repr(cont[:120]) = '寇昵螂宓鮎費還墮棠孝吠萱翮賀畱嶸萬杏柟里買瀝媚羮艱禋楸衛賣篠'
寇昵螂宓鮎費還墮棠孝吠萱翮賀畱嶸萬杏柟里買瀝媚羮艱禋楸衛賣篠㝷娑酩遥瑩幄証蒙薖分勃坌冐嘆聒梯收畍諾諾電網迷榼懿柚纎篷渢蠖弱蹶瀧鉞淙猖襄鉶往矣頻陋躙瞭一徬愈非革憔悔效修礪狴笛沃倍矛窰䥴四急漾誌㶑燥望規漂㢲鏁竈腆諌學洩膚遯鉞涘條禦生永潨僮逝仍需
