In [122]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import re
import time
import os
import fitz

In [138]:
from datasets import load_dataset

def load_nynorsk_sentences(limit=10000):
    dataset = load_dataset("NbAiLab/NCC", streaming=True)
    train_stream = dataset["train"]
    sentences = []
    for example in train_stream:
        if example.get("lang_fasttext") == "nn":
            text = example["text"]
            for s in re.split(r"[.!?]\\s+", text):
                if len(s.split()) > 3:
                    sentences.append(s.strip())
        if len(sentences) >= limit:
            break
    return sentences

pretrain_sentences = load_nynorsk_sentences()


def extract_text_from_pdf_folder(folder_path):
    all_text = ""
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            path = os.path.join(folder_path, filename)
            with fitz.open(path) as doc:
                for page in doc:
                    all_text += page.get_text()
    return all_text

fjord1_text = extract_text_from_pdf_folder("../../fjord1_pdfs")

# Basic preprocessing
fjord1_text = re.sub(r"[^\wåøæÅØÆ.,!?\s]", "", fjord1_text)
fjord1_sentences = [s.strip() for s in re.split(r'[.!?]\s+', fjord1_text) if len(s.split()) > 3]

# Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

all_sentences = pretrain_sentences + fjord1_sentences
tokenizer = Tokenizer(filters='', oov_token="<OOV>")
tokenizer.fit_on_texts(all_sentences)
sequences = tokenizer.texts_to_sequences(all_sentences)
vocab_size = len(tokenizer.word_index) + 1
print (f"Vocabulary size: {vocab_size}")

Vocabulary size: 399770


In [None]:

def prepare_sequences(sentences, tokenizer):
    sequences = tokenizer.texts_to_sequences(sentences)
    X, y = [], [] 
    for seq in sequences:
        if len(seq) > 50:
            seq = seq[:50]
            for i in range(1, len(seq)):
                X.append(seq[:i])
                y.append(seq[i])
        X = pad_sequences(X, padding="pre")
        y = np.array(y)
        return X, y, X.shape[1]

def sequence_generator(sentences, tokenizer, maxlen_limit=50):
    for sentence in sentences:
        seq = tokenizer.texts_to_sequences([sentence])[0]
        if len(seq) < 2:
            continue
        if len(seq) > maxlen_limit:
            seq = seq[:maxlen_limit]
        for i in range(1, len(seq)):
            x = seq[:i]
            y = seq[i]
            x = tf.keras.preprocessing.sequence.pad_sequences([x], maxlen=maxlen_limit, padding="pre")[0]
            yield x, y

maxlen = 50

pretrain_dataset = tf.data.Dataset.from_generator(
    lambda: sequence_generator(pretrain_sentences, tokenizer, maxlen),
    output_signature=(
        tf.TensorSpec(shape=(maxlen,), dtype=tf.int32),
        tf.TensorSpec(shape=(), dtype=tf.int32),
    )
).shuffle(2048).batch(64).prefetch(tf.data.AUTOTUNE)

fjord1_dataset = tf.data.Dataset.from_generator(
    lambda: sequence_generator(fjord1_sentences, tokenizer, maxlen),
    output_signature=(
        tf.TensorSpec(shape=(maxlen,), dtype=tf.int32),
        tf.TensorSpec(shape=(), dtype=tf.int32),
    )
).shuffle(2048).batch(64).prefetch(tf.data.AUTOTUNE)
X_pre, y_pre, maxlen = prepare_sequences(pretrain_sentences, tokenizer)
pretrain_dataset = tf.data.Dataset.from_tensor_slices((X_pre, y_pre)).shuffle(2048).batch(64).prefetch(tf.data.AUTOTUNE)

# Fine-tuning on Fjord1 documents
X_fjord1, y_fjord1, _ = prepare_sequences(fjord1_sentences, tokenizer)
fjord1_dataset = tf.data.Dataset.from_tensor_slices((X_fjord1, y_fjord1)).shuffle(2048).batch(64).prefetch(tf.data.AUTOTUNE)

In [141]:
class PositionalEmbedding(keras.layers.Layer):
    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = keras.layers.Embedding(
            input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = keras.layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

        self.not_equal = keras.layers.Lambda(lambda x: tf.math.not_equal(x, 0))

    def call(self, inputs):
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(self.positions)
        return embedded_tokens + embedded_positions

    def build(self, input_shape):
        length = input_shape[-1]
        self.positions = tf.range(start=0, limit=length, delta=1)

    def compute_mask(self, inputs, mask=None):
        return self.not_equal(inputs)

    def get_config(self):
        config = super(PositionalEmbedding, self).get_config()
        config.update({
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        })
        return config


class TransformerDecoder(keras.layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_1 = keras.layers.MultiHeadAttention(
          num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = keras.layers.MultiHeadAttention(
          num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential(
            [keras.layers.Dense(dense_dim, activation="relu"),
             keras.layers.Dense(embed_dim),]
        )
        self.layernorm_1 = keras.layers.LayerNormalization()
        self.layernorm_2 = keras.layers.LayerNormalization()
        self.layernorm_3 = keras.layers.LayerNormalization()
        self.supports_masking = True

    def get_config(self):
        config = super(TransformerDecoder, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1),
             tf.constant([1, 1], dtype=tf.int32)], axis=0)
        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(
                mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)
        else:
            padding_mask = mask
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask)
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)
        attention_output_2 = self.attention_2(
            query=attention_output_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        attention_output_2 = self.layernorm_2(
            attention_output_1 + attention_output_2)
        proj_output = self.dense_proj(attention_output_2)
        return self.layernorm_3(attention_output_2 + proj_output)

In [157]:
def create_transformer_model(vocab_size, maxlen, embed_dim=64, num_heads=2, ff_dim=128):
    inputs = layers.Input(shape=(maxlen,))
    x = PositionalEmbedding(maxlen, vocab_size, embed_dim)(inputs)
    x = TransformerDecoder(embed_dim, ff_dim, num_heads)(x, x)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(20, activation="relu")(x)
    x = layers.Dropout(0.1)(x)
    outputs = layers.Dense(vocab_size, activation="softmax", dtype="float32")(x)

    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer="rmsprop", loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=["accuracy"])
    return model

In [158]:
model = create_transformer_model(vocab_size, maxlen)
model.summary()

print("\n--- Pretraining on Open Domain Norwegian ---")
model.fit(pretrain_dataset, epochs=20)

print("\n--- Fine-tuning on Fjord1 Corpus ---")
model.fit(fjord1_dataset, epochs=50)



--- Pretraining on Open Domain Norwegian ---
Epoch 1/20


  output, from_logits = _get_logits(


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0000e+00 - loss: 12.8976
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 189ms/step - accuracy: 0.0204 - loss: 12.8816
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step - accuracy: 0.0204 - loss: 12.8245
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step - accuracy: 0.1429 - loss: 12.7210
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step - accuracy: 0.1020 - loss: 12.5778
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step - accuracy: 0.1633 - loss: 12.4522
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step - accuracy: 0.1224 - loss: 12.3264
Epoch 8/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step - accuracy: 0.0612 - loss: 12.1866
Epoch 9/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x46a960350>

In [163]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def generate_text(prompt, tokenizer, model, maxlen, num_tokens=10, top_k=10, temperature=1, eos_token='<eos>', seed=None):
    if seed is not None:
        np.random.seed(seed)

    tokens = tokenizer.texts_to_sequences([prompt])[0]
    eos_id = tokenizer.word_index.get(eos_token)

    for _ in range(num_tokens):
        padded = pad_sequences([tokens], maxlen=maxlen, padding='pre')
        logits = model.predict(padded, verbose=0)[0]

        # Apply temperature scaling
        scaled_logits = logits / temperature
        probs = np.exp(scaled_logits) / np.sum(np.exp(scaled_logits))

        # Top-k sampling
        top_k_indices = np.argsort(probs)[-top_k:]
        top_k_probs = probs[top_k_indices]
        top_k_probs /= np.sum(top_k_probs)
        next_token = np.random.choice(top_k_indices, p=top_k_probs)
        tokens.append(next_token)

        # Stop if EOS token is generated
        if eos_id is not None and next_token == eos_id:
            break

    return tokenizer.sequences_to_texts([tokens])[0]

# Example generation:
print(generate_text("Fjord1 har si kjerneverksemd innan", tokenizer, model, maxlen, seed=42))


fjord1 har si kjerneverksemd innan våre våre og fjord1 ruteoversikt bord bord 
samfunnsansvar
openheitslova om ruteoversikt fjord1


In [152]:
def top_10_predictions(prompt, tokenizer, model, maxlen):
    tokens = tokenizer.texts_to_sequences([prompt])[0]
    padded = pad_sequences([tokens], maxlen=maxlen, padding='pre')
    probs = model.predict(padded, verbose=0)[0]
    top_indices = np.argsort(probs)[-10:][::-1]
    top_words = [tokenizer.index_word[idx] for idx in top_indices]
    top_scores = [probs[idx] for idx in top_indices]
    return list(zip(top_words, top_scores))

print(top_10_predictions("fjord1 har si kjerneverksemd innan om", tokenizer, model, maxlen))

[('om', np.float32(0.07606093)), ('fjord1', np.float32(0.060858406)), ('bord', np.float32(0.04291809)), ('servering', np.float32(0.042879734)), ('trafikkmeldingar', np.float32(0.042244747)), ('kundesider', np.float32(0.04201583)), ('ruteoversikt', np.float32(0.04181237)), ('våre', np.float32(0.04124559)), ('søk', np.float32(0.021667471)), ('visjon,', np.float32(0.021334145))]
