In [2]:
import jsonlines
import numpy as np
import tensorflow as tf
import transformers     # only for the tokenizer
from tqdm import tqdm

In [3]:
# Checkpoint whose tokenizer is loaded; used by txt_to_tokens / tokens_to_txt.
TOKENIZER_CHECKPOINT = "kittn/mistral-7B-v0.1-hf"
tokenizer = transformers.LlamaTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

def txt_to_tokens(txt: str) -> list[int]:
    """Encode ``txt`` into a flat list of token ids using the module-level tokenizer.

    The tokenizer returns plain Python lists when ``return_tensors`` is not
    set, so the ids are read directly instead of building a TensorFlow tensor
    and converting it back to a list on every call.

    Args:
        txt: Text to tokenize.

    Returns:
        The token ids for ``txt`` (including whatever special tokens the
        tokenizer adds by default).
    """
    return tokenizer(txt).input_ids

def tokens_to_txt(tokens: list[int]) -> str:
    """Decode a list of token ids back into text with the module-level tokenizer."""
    decoded = tokenizer.decode(tokens)
    return decoded

In [4]:
# Maximum number of records to read from the training file.
n_to_load = 1000

# Each entry is a (prompt, text) pair taken from one JSONL record.
texts = []

with jsonlines.open("./data/train.jsonl") as reader:
    # Zipping against a bounded range stops after n_to_load records
    # (or earlier if the file is shorter).
    texts.extend(
        (record["prompt"], record["text"])
        for _, record in zip(range(n_to_load), reader)
    )



In [5]:
# Next-token training examples:
#   X[k] = prompt tokens followed by the first i tokens of the text
#   Y[k] = the i-th token of the text (the token to predict)
X = []
Y = []

for prompt, text in tqdm(texts):
    prompt_tokens = txt_to_tokens(prompt)
    text_tokens = txt_to_tokens(text)

    # Iterate over every position so the final token of the text is also used
    # as a target; stopping at len - 1 silently dropped it.
    # NOTE(review): if the tokenizer prepends a BOS token, that token is the
    # first target of every text -- confirm this is intended.
    for i in range(len(text_tokens)):
        X.append(prompt_tokens + text_tokens[:i])
        Y.append(text_tokens[i])

100%|██████████| 1000/1000 [00:22<00:00, 44.67it/s]


In [7]:
def get_model(
        n_input_size: int,
        n_layers: int,
        n_hidden_size: int,
        n_output_size: "int | None" = None,
        ) -> tf.keras.Model:
    """Build a plain feed-forward (Dense-only) softmax classifier.

    Args:
        n_input_size: Number of features in each input vector.
        n_layers: Number of hidden ``Dense`` layers (ReLU activation).
        n_hidden_size: Number of units in every hidden layer.
        n_output_size: Number of classes in the final softmax layer. When
            omitted it falls back to ``n_input_size``, which keeps the
            behaviour of calls that pass only three arguments. For next-token
            prediction the labels are token ids, so this should be at least
            the tokenizer's vocabulary size; otherwise the sparse
            cross-entropy loss receives out-of-range labels.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    if n_output_size is None:
        n_output_size = n_input_size

    model = tf.keras.Sequential()
    model.add(tf.keras.layers.InputLayer(input_shape=(n_input_size,)))

    for _ in range(n_layers):
        model.add(tf.keras.layers.Dense(n_hidden_size, activation="relu"))

    model.add(tf.keras.layers.Dense(n_output_size, activation="softmax"))

    return model


In [12]:
def train(model: tf.keras.Model, x_train: list[list[int]], y_train: list[int], n_epochs: int, batch_size: int, lr: float):
    """Compile ``model`` with Adam and fit it on next-token examples.

    Args:
        model: Keras model to train; its last input dimension defines the
            fixed sequence length the examples are padded/truncated to.
        x_train: Tokenized contexts, one list of token ids per example.
            The lists may have different lengths.
        y_train: The next token id for each context in ``x_train``.
        n_epochs: Number of passes over the training data.
        batch_size: Mini-batch size passed to ``fit``.
        lr: Adam learning rate.
    """
    # The contexts are ragged, but fit() needs a rectangular array. Left-pad
    # short contexts with 0 and keep only the most recent tokens of long ones
    # so every row has exactly the width the model's input layer expects.
    # NOTE(review): 0 is used as the padding id -- confirm it does not collide
    # with a meaningful token for this tokenizer.
    seq_len = model.input_shape[-1]
    features = tf.keras.preprocessing.sequence.pad_sequences(
        x_train, maxlen=seq_len, padding="pre", truncating="pre", value=0
    )
    labels = np.asarray(y_train, dtype="int32")

    optim = tf.keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=optim, loss="sparse_categorical_crossentropy")

    # Let Keras run the epoch loop; it reports per-epoch progress itself.
    model.fit(features, labels, batch_size=batch_size, epochs=n_epochs)
        

In [9]:
def predict(model: tf.keras.Model, txt: str, n: int) -> str:
    """Greedily generate ``n`` tokens that continue ``txt``.

    At every step the running context is padded/truncated to the model's
    input width, the model is queried once, and the highest-probability token
    id is appended to both the output and the context.

    Args:
        model: Trained Keras model that outputs one probability per token id.
        txt: Prompt text to continue.
        n: Number of tokens to generate.

    Returns:
        The decoded text of the ``n`` generated tokens (the prompt itself is
        not included).
    """
    seq_len = model.input_shape[-1]
    context = txt_to_tokens(txt)
    tokens_out = []
    for _ in range(n):
        # Batch of one: left-pad with 0 / keep the most recent tokens so the
        # row matches the model's fixed input width.
        batch = tf.keras.preprocessing.sequence.pad_sequences(
            [context], maxlen=seq_len, padding="pre", truncating="pre", value=0
        )
        probs = model.predict(batch, verbose=0)
        # The model returns a probability distribution, not a token; pick the
        # most likely id and feed it back in as part of the next context.
        next_token = int(probs[0].argmax())
        tokens_out.append(next_token)
        context.append(next_token)
    return tokens_to_txt(tokens_out)


In [10]:
# Hyperparameters for the model built with get_model() and for train().
n_input_size = 512       # width of the model's input layer
n_hidden_size = 2048     # units in each hidden Dense layer
n_layers = 24            # number of hidden Dense layers
lr = 6e-4                # learning rate handed to train()

In [11]:
# Reset Keras' global state before building a fresh model in this cell.
tf.keras.backend.clear_session()
model = get_model(n_input_size, n_layers, n_hidden_size)
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 2048)              1050624   
                                                                 
 dense_1 (Dense)             (None, 2048)              4196352   
                                                                 
 dense_2 (Dense)             (None, 2048)              4196352   
                                                                 
 dense_3 (Dense)             (None, 2048)              4196352   
                                                                 
 dense_4 (Dense)             (None, 2048)              4196352   
                                                                 
 dense_5 (Dense)             (None, 2048)              4196352   
                                                                 
 dense_6 (Dense)             (None, 2048)              4

In [13]:
# Fit the model on the (X, Y) next-token examples for a single epoch.
train(
    model,
    X,
    Y,
    n_epochs=1,
    batch_size=32,
    lr=lr,
)

  0%|          | 0/1 [00:00<?, ?it/s]